Source code for pkgcore.fs.tar

"""
binpkg tar utilities
"""

import os
import stat
from functools import partial
from itertools import count

from snakeoil import compression
from snakeoil.compatibility import cmp, sorted_cmp
from snakeoil.data_source import invokable_data_source
from snakeoil.tar import tarfile

from . import contents
from .fs import fsDev, fsDir, fsFifo, fsFile, fsSymlink

_unique_inode = count(2**32).__next__

known_compressors = {
    "bz2": tarfile.TarFile.bz2open,
    "gz": tarfile.TarFile.gzopen,
    None: tarfile.TarFile.open,
}


def write_set(
    contents_set, filepath, compressor="bzip2", absolute_paths=False, parallelize=False
):
    if compressor == "bz2":
        compressor = "bzip2"
    tar_handle = None
    handle = compression.compress_handle(compressor, filepath, parallelize=parallelize)
    try:
        tar_handle = tarfile.TarFile(name=filepath, fileobj=handle, mode="w")
        add_contents_to_tarfile(contents_set, tar_handle)
    finally:
        if tar_handle is not None:
            tar_handle.close()
        handle.close()
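

# Illustrative usage sketch (not part of the original module): driving
# write_set() above with a package's contents set. `pkg` and the destination
# path are hypothetical; the `contents` attribute is assumed to be a pkgcore
# contentsSet.
def _example_write_binpkg(pkg, dst="/tmp/example.tar.bz2"):
    """Write a package's contents out as a bzip2-compressed tarball."""
    write_set(pkg.contents, dst, compressor="bzip2", parallelize=False)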


def add_contents_to_tarfile(contents_set, tar_fd, absolute_paths=False):
    # first add directories, then everything else
    # this is just a pkgcore optimization, it prefers to see the dirs first.
    dirs = contents_set.dirs()
    dirs.sort()
    for x in dirs:
        tar_fd.addfile(fsobj_to_tarinfo(x, absolute_paths))
    del dirs
    inodes = {}
    for x in contents_set.iterdirs(invert=True):
        t = fsobj_to_tarinfo(x, absolute_paths)
        if t.isreg():
            key = (x.dev, x.inode)
            existing = inodes.get(key)
            data = None
            if existing is not None:
                if x._can_be_hardlinked(existing):
                    t.type = tarfile.LNKTYPE
                    t.linkname = "./%s" % existing.location.lstrip("/")
                    t.size = 0
            else:
                inodes[key] = x
                data = x.data.bytes_fileobj()
            tar_fd.addfile(t, fileobj=data)
            # tar_fd.addfile(t, fileobj=x.data.bytes_fileobj())
        else:
            tar_fd.addfile(t)
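

# Illustrative sketch (not part of the original module): when compression is
# handled elsewhere, add_contents_to_tarfile() above can be driven directly
# with an already opened, uncompressed TarFile. The destination path is
# hypothetical.
def _example_add_to_plain_tar(contents_set, dst="/tmp/example.tar"):
    """Dump a contents set into an uncompressed tar archive."""
    with tarfile.TarFile(name=dst, mode="w") as tar_fd:
        add_contents_to_tarfile(contents_set, tar_fd, absolute_paths=False)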


def archive_to_fsobj(src_tar):
    psep = os.path.sep
    dev = _unique_inode()
    # inode cache used for supporting hardlinks.
    # Since the tarfile specifies a hardlink target by path (rather than
    # internally consistent inode numbers), we have to normalize the path
    # lookup into this cache via abspath(os.path.join('/', key))...
    inodes = {}
    for member in src_tar:
        d = {
            "uid": member.uid,
            "gid": member.gid,
            "mtime": member.mtime,
            "mode": member.mode,
        }
        location = os.path.abspath(os.path.join(psep, member.name.strip(psep)))
        if member.isdir():
            if member.name.strip(psep) == ".":
                continue
            yield fsDir(location, **d)
        elif member.isreg() or member.islnk():
            d["dev"] = dev
            if member.islnk():
                target = os.path.abspath(os.path.join(psep, member.linkname))
                inode = inodes.get(target)
                if inode is None:
                    raise AssertionError(
                        "Tarfile file %r is a hardlink to %r, but we can't "
                        "find the resolved hardlink target %r in the archive. "
                        "This means either a bug in pkgcore, or a malformed "
                        "tarball." % (member.name, member.linkname, target)
                    )
                d["inode"] = inode
            else:
                d["inode"] = inode = _unique_inode()
            # Add the new file to the inode cache even if we're currently
            # processing a hardlink; tar allows for hardlink chains of
            # x -> y -> z, thus we have to ensure 'y' is in the cache alongside
            # its target z to support looking up 'x' later.
            inodes[location] = inode
            d["data"] = invokable_data_source.wrap_function(
                partial(src_tar.extractfile, member.name),
                returns_text=False,
                returns_handle=True,
            )
            yield fsFile(location, **d)
        elif member.issym() or member.islnk():
            yield fsSymlink(location, member.linkname, **d)
        elif member.isfifo():
            yield fsFifo(location, **d)
        elif member.isdev():
            d["major"] = int(member.major)
            d["minor"] = int(member.minor)
            yield fsDev(location, **d)
        else:
            raise AssertionError(
                "unknown type %r, %r was encountered walking tarmembers"
                % (member, member.type)
            )
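

# Illustrative sketch (not part of the original module): archive_to_fsobj() is
# a generator over tar members, and the inode cache above guarantees that
# hardlinked members come back as fsFile objects sharing a synthetic
# (dev, inode) pair. Only locations are collected here, so the lazily wrapped
# data sources are never read. The path is hypothetical.
def _example_list_regular_files(tarball_path="/tmp/example.tar"):
    """Return the locations of all regular files in an uncompressed tarball."""
    with tarfile.TarFile(name=tarball_path, mode="r") as src_tar:
        return [x.location for x in archive_to_fsobj(src_tar) if x.is_reg]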


def fsobj_to_tarinfo(fsobj, absolute_path=True):
    t = tarfile.TarInfo()
    if fsobj.is_reg:
        t.type = tarfile.REGTYPE
        t.size = fsobj.chksums["size"]
    elif fsobj.is_dir:
        t.type = tarfile.DIRTYPE
    elif fsobj.is_sym:
        t.type = tarfile.SYMTYPE
        t.linkname = fsobj.target
    elif fsobj.is_fifo:
        t.type = tarfile.FIFOTYPE
    elif fsobj.is_dev:
        if stat.S_ISCHR(fsobj.mode):
            t.type = tarfile.CHRTYPE
        else:
            t.type = tarfile.BLKTYPE
        t.devmajor = fsobj.major
        t.devminor = fsobj.minor
    t.name = fsobj.location
    if not absolute_path:
        t.name = "./%s" % (fsobj.location.lstrip("/"),)
    t.mode = fsobj.mode
    t.uid = fsobj.uid
    t.gid = fsobj.gid
    t.mtime = fsobj.mtime
    return t
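

# Illustrative sketch (not part of the original module): the absolute_path flag
# of fsobj_to_tarinfo() above switches between absolute member names and the
# "./path" form that add_contents_to_tarfile() emits.
def _example_tarinfo_names(fsobj):
    """Return the (absolute, relative) tar member names for one fs object."""
    return (
        fsobj_to_tarinfo(fsobj, absolute_path=True).name,
        fsobj_to_tarinfo(fsobj, absolute_path=False).name,
    )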


def generate_contents(filepath, compressor="bz2", parallelize=True):
    """
    generate a contentset from a tarball

    :param filepath: string path to location on disk
    :param compressor: defaults to bz2; decompressor to use, see
        :obj:`known_compressors` for list of valid compressors
    """
    if compressor == "bz2":
        compressor = "bzip2"
    tar_handle = None
    handle = compression.decompress_handle(
        compressor, filepath, parallelize=parallelize
    )
    try:
        tar_handle = tarfile.TarFile(name=filepath, fileobj=handle, mode="r")
    except tarfile.ReadError as e:
        if not str(e).endswith("empty header"):
            raise
        tar_handle = []
    return convert_archive(tar_handle)
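

# Illustrative usage sketch (not part of the original module): a typical round
# trip decompresses an existing binpkg with generate_contents() and inspects
# the resulting contents set. The path is hypothetical.
def _example_inspect_binpkg(path="/var/cache/binpkgs/example-1.0.tar.bz2"):
    """Return the sorted file locations recorded in a bzip2 binpkg."""
    cset = generate_contents(path, compressor="bz2")
    return sorted(x.location for x in cset if x.is_reg)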


def convert_archive(archive):
    # regarding the usage of del in this function... bear in mind these sets
    # could easily have 10k -> 100k entries in extreme cases; thus the del
    # usage, explicitly trying to ensure we don't keep refs long term.

    # this one is a bit fun.
    raw = list(archive_to_fsobj(archive))
    # we use the data source as the unique key to get position.
    files_ordering = list(enumerate(x for x in raw if x.is_reg))
    files_ordering = {x.data: idx for idx, x in files_ordering}
    t = contents.contentsSet(raw, mutable=True)
    del raw, archive

    # first rewrite affected syms.
    raw_syms = t.links()
    syms = contents.contentsSet(raw_syms)
    while True:
        for x in sorted(syms):
            affected = syms.child_nodes(x.location)
            if not affected:
                continue
            syms.difference_update(affected)
            syms.update(affected.change_offset(x.location, x.resolved_target))
            del affected
            break
        else:
            break

    t.difference_update(raw_syms)
    t.update(syms)
    del raw_syms
    syms = sorted(syms, reverse=True)

    # ok, syms are correct. now we get the rest.
    # we shift the readds into a separate list so that we don't reinspect
    # them on later runs; this slightly reduces the working set.
    additions = []
    for x in syms:
        affected = t.child_nodes(x.location)
        if not affected:
            continue
        t.difference_update(affected)
        additions.extend(affected.change_offset(x.location, x.resolved_target))

    t.update(additions)
    t.add_missing_directories()

    # finally... an insane sort.
    def sort_func(x, y):
        if x.is_dir:
            if not y.is_dir:
                return -1
            return cmp(x, y)
        elif y.is_dir:
            return +1
        elif x.is_reg:
            if y.is_reg:
                return cmp(files_ordering[x.data], files_ordering[y.data])
            return +1
        elif y.is_reg:
            return -1
        return cmp(x, y)

    return contents.OrderedContentsSet(sorted_cmp(t, sort_func), mutable=False)
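

# Illustrative check as code (not part of the original module): sort_func above
# is built so that directories sort before everything else and regular files
# keep the relative order they had in the archive (via files_ordering). A quick
# sanity check of the "directories first" half of that contract:
def _example_check_dirs_first(archive):
    """Assert that a converted archive lists all directories up front."""
    entries = list(convert_archive(archive))
    n_dirs = sum(1 for x in entries if x.is_dir)
    assert all(x.is_dir for x in entries[:n_dirs])
    return entries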