"""
binpkg tar utilities
"""
import os
import stat
from functools import partial
from itertools import count
from snakeoil import compression
from snakeoil.compatibility import cmp, sorted_cmp
from snakeoil.data_source import invokable_data_source
from snakeoil.tar import tarfile
from . import contents
from .fs import fsDev, fsDir, fsFifo, fsFile, fsSymlink
_unique_inode = count(2**32).__next__
known_compressors = {
    "bz2": tarfile.TarFile.bz2open,
    "gz": tarfile.TarFile.gzopen,
    None: tarfile.TarFile.open,
}
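# A minimal sketch of using the table above directly; the archive path is a
# placeholder assumption, not a real file:
#
#     opener = known_compressors["gz"]      # -> tarfile.TarFile.gzopen
#     tar = opener("pkg.tar.gz", mode="r")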
def write_set(
    contents_set, filepath, compressor="bzip2", absolute_paths=False, parallelize=False
):
    """Write a contents set out as a compressed tarball at *filepath*."""
    if compressor == "bz2":
        compressor = "bzip2"
    tar_handle = None
    handle = compression.compress_handle(compressor, filepath, parallelize=parallelize)
    try:
        tar_handle = tarfile.TarFile(name=filepath, fileobj=handle, mode="w")
        # pass absolute_paths through; previously it was accepted but
        # silently ignored.
        add_contents_to_tarfile(contents_set, tar_handle, absolute_paths=absolute_paths)
    finally:
        if tar_handle is not None:
            tar_handle.close()
        handle.close()
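# A minimal usage sketch, assuming a contents set obtained elsewhere (e.g.
# via pkgcore.fs.livefs.iter_scan); the paths are placeholders:
#
#     from pkgcore.fs import livefs
#     cset = contents.contentsSet(livefs.iter_scan("/usr/share/doc/foo"))
#     write_set(cset, "/tmp/foo.tar.bz2", compressor="bzip2")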
def add_contents_to_tarfile(contents_set, tar_fd, absolute_paths=False):
    # first add directories, then everything else; this is purely an
    # ordering optimization- pkgcore prefers to see the dirs first.
    dirs = contents_set.dirs()
    dirs.sort()
    for x in dirs:
        tar_fd.addfile(fsobj_to_tarinfo(x, absolute_paths))
    del dirs
    inodes = {}
    for x in contents_set.iterdirs(invert=True):
        t = fsobj_to_tarinfo(x, absolute_paths)
        if t.isreg():
            key = (x.dev, x.inode)
            existing = inodes.get(key)
            data = None
            if existing is not None:
                if x._can_be_hardlinked(existing):
                    t.type = tarfile.LNKTYPE
                    t.linkname = "./%s" % existing.location.lstrip("/")
                    t.size = 0
                else:
                    # same (dev, inode) key but not hardlinkable; write it
                    # out as a regular file with its own payload.
                    data = x.data.bytes_fileobj()
            else:
                inodes[key] = x
                data = x.data.bytes_fileobj()
            tar_fd.addfile(t, fileobj=data)
        else:
            tar_fd.addfile(t)
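# A minimal sketch of driving this function directly with an uncompressed
# archive; the output path is a placeholder, and cset is assumed to be an
# existing contentsSet:
#
#     with tarfile.TarFile("/tmp/image.tar", mode="w") as tar_fd:
#         add_contents_to_tarfile(cset, tar_fd, absolute_paths=False)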
def archive_to_fsobj(src_tar):
    psep = os.path.sep
    dev = _unique_inode()
    # inode cache used for supporting hardlinks.
    # Since a tarfile specifies a hardlink target by path (rather than by
    # internally consistent inode numbers), we normalize each path into this
    # cache via abspath(os.path.join('/', key)) before lookup.
    inodes = {}
    for member in src_tar:
        d = {
            "uid": member.uid,
            "gid": member.gid,
            "mtime": member.mtime,
            "mode": member.mode,
        }
        location = os.path.abspath(os.path.join(psep, member.name.strip(psep)))
        if member.isdir():
            if member.name.strip(psep) == ".":
                continue
            yield fsDir(location, **d)
        elif member.isreg() or member.islnk():
            d["dev"] = dev
            if member.islnk():
                target = os.path.abspath(os.path.join(psep, member.linkname))
                inode = inodes.get(target)
                if inode is None:
                    raise AssertionError(
                        "Tarfile file %r is a hardlink to %r, but we can't "
                        "find the resolved hardlink target %r in the archive. "
                        "This means either a bug in pkgcore, or a malformed "
                        "tarball." % (member.name, member.linkname, target)
                    )
                d["inode"] = inode
            else:
                d["inode"] = inode = _unique_inode()
            # Add the new file to the inode cache even if we're currently
            # processing a hardlink; tar allows hardlink chains of
            # x -> y -> z, thus we have to ensure 'y' is in the cache
            # alongside its target 'z' to support the later lookup of 'x'.
            inodes[location] = inode
            d["data"] = invokable_data_source.wrap_function(
                partial(src_tar.extractfile, member.name),
                returns_text=False,
                returns_handle=True,
            )
            yield fsFile(location, **d)
        elif member.issym():
            # note that hardlinks (islnk) were already consumed by the
            # regular-file branch above.
            yield fsSymlink(location, member.linkname, **d)
        elif member.isfifo():
            yield fsFifo(location, **d)
        elif member.isdev():
            d["major"] = int(member.major)
            d["minor"] = int(member.minor)
            yield fsDev(location, **d)
        else:
            raise AssertionError(
                "unknown type %r, %r was encountered walking tar members"
                % (member, member.type)
            )
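# A minimal sketch of walking an uncompressed tarball with the generator
# above ("/tmp/image.tar" is a placeholder assumption):
#
#     with tarfile.TarFile("/tmp/image.tar", mode="r") as src:
#         for fs_obj in archive_to_fsobj(src):
#             print(fs_obj.location)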
def fsobj_to_tarinfo(fsobj, absolute_path=True):
    t = tarfile.TarInfo()
    if fsobj.is_reg:
        t.type = tarfile.REGTYPE
        t.size = fsobj.chksums["size"]
    elif fsobj.is_dir:
        t.type = tarfile.DIRTYPE
    elif fsobj.is_sym:
        t.type = tarfile.SYMTYPE
        t.linkname = fsobj.target
    elif fsobj.is_fifo:
        t.type = tarfile.FIFOTYPE
    elif fsobj.is_dev:
        if stat.S_ISCHR(fsobj.mode):
            t.type = tarfile.CHRTYPE
        else:
            t.type = tarfile.BLKTYPE
        t.devmajor = fsobj.major
        t.devminor = fsobj.minor
    t.name = fsobj.location
    if not absolute_path:
        t.name = "./%s" % (fsobj.location.lstrip("/"),)
    t.mode = fsobj.mode
    t.uid = fsobj.uid
    t.gid = fsobj.gid
    t.mtime = fsobj.mtime
    return t
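# A minimal sketch, assuming an fsSymlink built by hand with illustrative
# metadata; fsobj_to_tarinfo maps it onto a SYMTYPE TarInfo suitable for
# TarFile.addfile():
#
#     link = fsSymlink("/usr/bin/vi", "vim", mode=0o777, uid=0, gid=0, mtime=0)
#     info = fsobj_to_tarinfo(link, absolute_path=False)
#     assert info.name == "./usr/bin/vi" and info.linkname == "vim"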
def generate_contents(filepath, compressor="bz2", parallelize=True):
    """
    generate a contentset from a tarball

    :param filepath: string path to location on disk
    :param compressor: defaults to bz2; decompressor to use, see
        :obj:`known_compressors` for a list of valid compressors
    :param parallelize: whether to parallelize decompression when possible
    :return: an :obj:`contents.OrderedContentsSet` of the tarball's contents
    """
    if compressor == "bz2":
        compressor = "bzip2"
    handle = compression.decompress_handle(
        compressor, filepath, parallelize=parallelize
    )
    try:
        tar_handle = tarfile.TarFile(name=filepath, fileobj=handle, mode="r")
    except tarfile.ReadError as e:
        if not str(e).endswith("empty header"):
            raise
        # an empty archive yields an empty contents set.
        tar_handle = []
    return convert_archive(tar_handle)
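# A minimal usage sketch (the tarball path is a placeholder assumption):
#
#     cset = generate_contents("/tmp/foo.tar.bz2", compressor="bz2")
#     for entry in cset:
#         print(entry.location)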
def convert_archive(archive):
    # regarding the usage of del in this function... bear in mind these
    # sets could easily have 10k -> 100k entries in extreme cases; thus
    # the explicit del usage, to ensure we don't hold refs long term.
    raw = list(archive_to_fsobj(archive))
    # we use the data source as the unique key to get position.
    files_ordering = list(enumerate(x for x in raw if x.is_reg))
    files_ordering = {x.data: idx for idx, x in files_ordering}
    t = contents.contentsSet(raw, mutable=True)
    del raw, archive
    # first rewrite affected syms: repeatedly shift children of symlinks
    # onto the resolved target, restarting the scan after each rewrite;
    # the loop exits once a full pass finds nothing left to shift.
    raw_syms = t.links()
    syms = contents.contentsSet(raw_syms)
    while True:
        for x in sorted(syms):
            affected = syms.child_nodes(x.location)
            if not affected:
                continue
            syms.difference_update(affected)
            syms.update(affected.change_offset(x.location, x.resolved_target))
            del affected
            break
        else:
            break
    t.difference_update(raw_syms)
    t.update(syms)
    del raw_syms
    syms = sorted(syms, reverse=True)
    # ok, syms are correct. now we get the rest.
    # we shift the readds into a separate list so that we don't reinspect
    # them on later runs; this slightly reduces the working set.
    additions = []
    for x in syms:
        affected = t.child_nodes(x.location)
        if not affected:
            continue
        t.difference_update(affected)
        additions.extend(affected.change_offset(x.location, x.resolved_target))
    t.update(additions)
    t.add_missing_directories()

    # finally... an insane sort: directories first, regular files last (in
    # their original archive order), everything else in between.
    def sort_func(x, y):
        if x.is_dir:
            if not y.is_dir:
                return -1
            return cmp(x, y)
        elif y.is_dir:
            return +1
        elif x.is_reg:
            if y.is_reg:
                return cmp(files_ordering[x.data], files_ordering[y.data])
            return +1
        elif y.is_reg:
            return -1
        return cmp(x, y)

    return contents.OrderedContentsSet(sorted_cmp(t, sort_func), mutable=False)
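# An end-to-end round-trip sketch tying the module together, assuming the
# placeholder paths exist: pack a directory scan into a tarball, then
# rebuild a contents set from it.
#
#     from pkgcore.fs import livefs
#     cset = contents.contentsSet(livefs.iter_scan("/usr/share/doc/foo"))
#     write_set(cset, "/tmp/foo.tar.bz2")
#     rebuilt = generate_contents("/tmp/foo.tar.bz2")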