Source code for pkgcheck.checks.metadata_xml

import os
import re
from difflib import SequenceMatcher
from itertools import chain

from lxml import etree
from pkgcore import const as pkgcore_const
from pkgcore.ebuild.atom import MalformedAtom, atom
from pkgcore.restrictions.packages import Conditional
from pkgcore.fetch import fetchable
from snakeoil.osutils import pjoin
from snakeoil.sequences import iflatten_instance
from snakeoil.strings import pluralism

from .. import results, sources
from . import Check


class _MissingXml(results.Error):
    """Required XML file is missing."""

    def __init__(self, filename, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename

    @property
    def desc(self):
        return f"{self._attr} is missing {self.filename}"


class _BadlyFormedXml(results.Error):
    """XML isn't well formed."""

    def __init__(self, filename, error, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.error = error

    @property
    def desc(self):
        return f"{self._attr} {self.filename} is not well formed xml: {self.error}"


class _InvalidXml(results.Error):
    """XML fails XML Schema validation."""

    def __init__(self, filename, message, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.message = message

    @property
    def desc(self):
        return f"{self._attr} {self.filename} violates metadata.xsd:\n{self.message}"


class _MetadataXmlInvalidPkgRef(results.Error):
    """metadata.xml <pkg/> references unknown/invalid package."""

    def __init__(self, filename, pkgtext, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.pkgtext = pkgtext

    @property
    def desc(self):
        return (
            f"{self._attr} {self.filename} <pkg/> "
            f"references unknown/invalid package: {self.pkgtext!r}"
        )


class _MetadataXmlInvalidCatRef(results.Error):
    """metadata.xml <cat/> references unknown/invalid category."""

    def __init__(self, filename, cattext, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename
        self.cattext = cattext

    @property
    def desc(self):
        return (
            f"{self._attr} {self.filename} <cat/> references "
            f"unknown/invalid category: {self.cattext!r}"
        )


[docs] class MaintainerNeeded(results.PackageResult, results.Warning): """Package with missing or invalid maintainer-needed comment in metadata.xml.""" def __init__(self, filename, needed, **kwargs): super().__init__(**kwargs) self.filename = filename self.needed = needed @property def desc(self): if not self.needed: return f"{self.filename}: missing maintainer-needed comment" return f"{self.filename}: invalid maintainer-needed comment"
[docs] class MaintainerWithoutProxy(results.PackageResult, results.Warning): """Package has a proxied maintainer without a proxy. All package maintainers have non-@gentoo.org e-mail addresses. Most likely, this means that the package is maintained by a proxied maintainer but there is no explicit proxy (developer or project) listed. This means no Gentoo developer will be CC-ed on bug reports, and most likely no developer oversees the proxied maintainer's activity. """ def __init__(self, filename, maintainers, **kwargs): super().__init__(**kwargs) self.filename = filename self.maintainers = tuple(maintainers) @property def desc(self): s = pluralism(self.maintainers) maintainers = ", ".join(self.maintainers) return f"{self.filename}: proxied maintainer{s} missing proxy dev/project: {maintainers}"
[docs] class ProxyWithoutProxied(results.PackageResult, results.Warning): """Package lists a proxy with no proxied maintainers. The package explicitly lists a proxy with no proxied maintainers. Most likely, this means that the proxied maintainer has been removed but the proxy was accidentally left. """ def __init__(self, filename, **kwargs): super().__init__(**kwargs) self.filename = filename @property def desc(self): return f"{self.filename}: proxy with no proxied maintainer"
[docs] class NonexistentProjectMaintainer(results.PackageResult, results.Warning): """Package specifying nonexistent project as a maintainer.""" def __init__(self, filename, emails, **kwargs): super().__init__(**kwargs) self.filename = filename self.emails = tuple(emails) @property def desc(self): s = pluralism(self.emails) emails = ", ".join(self.emails) return f"{self.filename}: nonexistent project maintainer{s}: {emails}"
[docs] class WrongMaintainerType(results.PackageResult, results.Warning): """A person-type maintainer matches an existing project.""" def __init__(self, filename, emails, **kwargs): super().__init__(**kwargs) self.filename = filename self.emails = tuple(emails) @property def desc(self): s = pluralism(self.emails) emails = ", ".join(self.emails) return f'{self.filename}: project maintainer{s} with type="person": {emails}'
[docs] class PkgMissingMetadataXml(_MissingXml, results.PackageResult): """Package is missing metadata.xml."""
[docs] class CatMissingMetadataXml(_MissingXml, results.CategoryResult): """Category is missing metadata.xml."""
[docs] class PkgInvalidXml(_InvalidXml, results.PackageResult): """Invalid package metadata.xml."""
[docs] class CatInvalidXml(_InvalidXml, results.CategoryResult): """Invalid category metadata.xml."""
[docs] class PkgBadlyFormedXml(_BadlyFormedXml, results.PackageResult): """Badly formed package metadata.xml."""
[docs] class CatBadlyFormedXml(_BadlyFormedXml, results.CategoryResult): """Badly formed category metadata.xml."""
[docs] class PkgMetadataXmlInvalidPkgRef(_MetadataXmlInvalidPkgRef, results.PackageResult): """Invalid package reference in package metadata.xml."""
[docs] class CatMetadataXmlInvalidPkgRef(_MetadataXmlInvalidPkgRef, results.CategoryResult): """Invalid package reference in category metadata.xml."""
[docs] class PkgMetadataXmlInvalidCatRef(_MetadataXmlInvalidCatRef, results.PackageResult): """Invalid category reference in package metadata.xml."""
[docs] class CatMetadataXmlInvalidCatRef(_MetadataXmlInvalidCatRef, results.CategoryResult): """Invalid category reference in category metadata.xml."""
class _MetadataXmlIndentation(results.BaseLinesResult, results.Style): """Inconsistent indentation in metadata.xml file. Either all tabs or all spaces should be used, not a mixture of both. """ def __init__(self, filename, **kwargs): super().__init__(**kwargs) self.filename = filename @property def desc(self): return f"{self.filename}: metadata.xml has inconsistent indentation {self.lines_str}"
[docs] class CatMetadataXmlIndentation(_MetadataXmlIndentation, results.CategoryResult): """Inconsistent indentation in category metadata.xml file. Either all tabs or all spaces should be used, not a mixture of both. """
[docs] class PkgMetadataXmlIndentation(_MetadataXmlIndentation, results.PackageResult): """Inconsistent indentation in package metadata.xml file. Either all tabs or all spaces should be used, not a mixture of both. """
class _MetadataXmlEmptyElement(results.Style): """Empty element in metadata.xml file.""" def __init__(self, filename, element, line, **kwargs): super().__init__(**kwargs) self.filename = filename self.element = element self.line = line @property def desc(self): return f"{self.filename}: empty element {self.element!r} on line {self.line}"
[docs] class CatMetadataXmlEmptyElement(_MetadataXmlEmptyElement, results.CategoryResult): """Empty element in category metadata.xml file."""
[docs] class PkgMetadataXmlEmptyElement(_MetadataXmlEmptyElement, results.PackageResult): """Empty element in package metadata.xml file."""
[docs] class RedundantLongDescription(results.PackageResult, results.Style): """Package's longdescription element in metadata.xml and DESCRIPTION are interchangeable. The longdescription element is for providing extended information that doesn't fit in DESCRIPTION. """ def __init__(self, msg, **kwargs): super().__init__(**kwargs) self.msg = msg @property def desc(self): return self.msg
[docs] class InvalidRemoteID(results.PackageResult, results.Warning): """Package's remote-id value incorrect for the specified type.""" def __init__(self, id_type, id_value, expected, **kwargs): super().__init__(**kwargs) self.id_type = id_type self.id_value = id_value self.expected = expected @property def desc(self): return ( f"remote-id value {self.id_value!r} invalid for " f"type={self.id_type!r}, expected: {self.expected!r}" )
[docs] class InvalidMetadataRestrict(results.PackageResult, results.Error): """Invalid package restrictions used in metadata.xml.""" def __init__(self, restrict: str, msg: str, **kwargs): super().__init__(**kwargs) self.restrict = restrict self.msg = msg @property def desc(self): return f"metadata.xml: invalid package restrictions {self.restrict!r}: {self.msg}"
class _XmlBaseCheck(Check): """Base class for metadata.xml scans.""" schema = None misformed_error = None invalid_error = None missing_error = None def __init__(self, *args): super().__init__(*args) self.repo_base = self.options.target_repo.location self.pkgref_cache = {} # content validation checks to run after parsing XML doc self._checks = tuple(getattr(self, x) for x in dir(self) if x.startswith("_check_")) # Prefer xsd file from the target repository or its masters, falling # back to the file installed with pkgcore. for repo in reversed(self.options.target_repo.trees): metadata_xsd = pjoin(repo.location, "metadata", "xml-schema", "metadata.xsd") if os.path.isfile(metadata_xsd): try: self.schema = etree.XMLSchema(etree.parse(metadata_xsd)) break except etree.XMLSchemaParseError: # ignore invalid xsd files pass else: metadata_xsd = pjoin(pkgcore_const.DATA_PATH, "xml-schema", "metadata.xsd") self.schema = etree.XMLSchema(etree.parse(metadata_xsd)) def _check_doc(self, pkg, loc, doc): """Perform additional document structure checks.""" # Find all root descendant elements that are empty except # 'stabilize-allarches' which is allowed to be empty and 'flag' which # is caught by MissingLocalUseDesc. for el in doc.getroot().iterdescendants(): if ( not el.getchildren() and (el.text is None or not el.text.strip()) and el.tag not in ("flag", "stabilize-allarches") ): yield self.empty_element(os.path.basename(loc), el.tag, el.sourceline, pkg=pkg) for el in doc.findall(".//cat"): c = el.text.strip() if c not in self.options.search_repo.categories: yield self.catref_error(os.path.basename(loc), c, pkg=pkg) for el in doc.findall(".//pkg"): p = el.text.strip() if p not in self.pkgref_cache: try: a = atom(p) found = self.options.search_repo.has_match(a) except MalformedAtom: found = False self.pkgref_cache[p] = found if not self.pkgref_cache[p]: yield self.pkgref_error(os.path.basename(loc), p, pkg=pkg) def _check_whitespace(self, pkg, loc, doc): """Check for indentation consistency.""" orig_indent = None indents = set() with open(loc) as f: for lineno, line in enumerate(f, 1): for i in line[: -len(line.lstrip())]: if i != orig_indent: if orig_indent is None: orig_indent = i else: indents.add(lineno) if indents: yield self.indent_error(os.path.basename(loc), lines=map(str, sorted(indents)), pkg=pkg) @staticmethod def _format_lxml_errors(error_log): for x in error_log: yield f"line {x.line}, col {x.column}: ({x.type_name}) {x.message}" def _parse_xml(self, pkg, loc): try: doc = etree.parse(loc) except (IOError, OSError): # it's only an error when missing in the main gentoo repo if self.options.gentoo_repo: yield self.missing_error(os.path.basename(loc), pkg=pkg) return except etree.XMLSyntaxError as e: yield self.misformed_error(os.path.basename(loc), str(e), pkg=pkg) return # note: while doc is available, do not pass it here as it may # trigger undefined behavior due to incorrect structure if self.schema is not None and not self.schema.validate(doc): message = "\n".join(self._format_lxml_errors(self.schema.error_log)) yield self.invalid_error(os.path.basename(loc), message, pkg=pkg) return # run all post parsing/validation checks for check in self._checks: yield from check(pkg, loc, doc) def feed(self, pkgset): pkg = pkgset[0] loc = self._get_xml_location(pkg) yield from self._parse_xml(pkg, loc)
[docs] class PackageMetadataXmlCheck(_XmlBaseCheck): """Package level metadata.xml scans.""" _source = sources.PackageRepoSource misformed_error = PkgBadlyFormedXml invalid_error = PkgInvalidXml missing_error = PkgMissingMetadataXml catref_error = PkgMetadataXmlInvalidCatRef pkgref_error = PkgMetadataXmlInvalidPkgRef indent_error = PkgMetadataXmlIndentation empty_element = PkgMetadataXmlEmptyElement known_results = frozenset( [ PkgBadlyFormedXml, PkgInvalidXml, PkgMissingMetadataXml, PkgMetadataXmlInvalidPkgRef, PkgMetadataXmlInvalidCatRef, PkgMetadataXmlIndentation, PkgMetadataXmlEmptyElement, MaintainerNeeded, MaintainerWithoutProxy, ProxyWithoutProxied, RedundantLongDescription, NonexistentProjectMaintainer, WrongMaintainerType, InvalidRemoteID, InvalidMetadataRestrict, ] ) _one_component_validator_re = re.compile(r"^[^/]+$") _two_components_validator_re = re.compile(r"^[^/]+/[^/]+$") _gitlab_validator_re = re.compile(r"^([^/]+/)*[^/]+/[^/]+$") remote_id_validators = { # {name}-style remotes "cpan": (_one_component_validator_re, "{project}"), "cpan-module": (_one_component_validator_re, "{module}"), "cran": (_one_component_validator_re, "{project}"), "ctan": (_one_component_validator_re, "{project}"), "google-code": (_one_component_validator_re, "{project}"), "osdn": (_one_component_validator_re, "{project}"), "pear": (_one_component_validator_re, "{project}"), "pecl": (_one_component_validator_re, "{project}"), "pypi": (_one_component_validator_re, "{project}"), "rubygems": (_one_component_validator_re, "{project}"), "sourceforge": (_one_component_validator_re, "{project}"), # {name} with a special check for lp: prefix "launchpad": (re.compile(r"^(?!lp:)[^/]+$"), "{project}"), # {owner}/{name}-style remotes "bitbucket": (_two_components_validator_re, "{username}/{project}"), "codeberg": (_two_components_validator_re, "{username}/{project}"), "github": (_two_components_validator_re, "{username}/{project}"), # gitlab (2+ components) "gitlab": (_gitlab_validator_re, "{username}/[{group}/...]{repo}"), "heptapod": (_gitlab_validator_re, "{username}/[{group}/...]{repo}"), # cpe "cpe": (re.compile(r"^cpe:/[aho]:[^:]+:[^:]+$"), "cpe:/[aho]:{vendor}:{product}"), # 1+ component + no ".git" suffix "gentoo": (re.compile(r"^([^/]+/)*[^/]+(?<!\.git)$"), "[{group}/...]{repo}"), # a positive decimal number "vim": (re.compile(r"^[1-9]\d*$"), "{script_id}"), } @staticmethod def _maintainer_proxied_key(m): if m.proxied is not None: return m.proxied if m.email == "proxy-maint@gentoo.org": return "proxy" if m.email.endswith("@gentoo.org"): return "no" return "yes" def _check_maintainers(self, pkg, loc, doc): """Validate maintainers in package metadata for the gentoo repo.""" if self.options.gentoo_repo: maintainer_needed = any( c.text.strip() == "maintainer-needed" for c in doc.xpath("//comment()") ) if pkg.maintainers: # check for invalid maintainer-needed comment if maintainer_needed: yield MaintainerNeeded(os.path.basename(loc), maintainer_needed, pkg=pkg) # determine proxy maintainer status proxied, devs, proxies = [], [], [] proxy_map = {"yes": proxied, "no": devs, "proxy": proxies} for m in pkg.maintainers: proxy_map[self._maintainer_proxied_key(m)].append(m) # check proxy maintainers if not devs and not proxies: maintainers = sorted(map(str, pkg.maintainers)) yield MaintainerWithoutProxy(os.path.basename(loc), maintainers, pkg=pkg) elif not proxied and proxies: yield ProxyWithoutProxied(os.path.basename(loc), pkg=pkg) elif not maintainer_needed: # check for missing maintainer-needed comment yield MaintainerNeeded(os.path.basename(loc), maintainer_needed, pkg=pkg) # check maintainer validity if projects := set(pkg.repo.projects_xml.projects): nonexistent = [] wrong_maintainers = [] for m in pkg.maintainers: if m.maint_type == "project" and m.email not in projects: nonexistent.append(m.email) elif m.maint_type == "person" and m.email in projects: wrong_maintainers.append(m.email) if nonexistent: yield NonexistentProjectMaintainer( os.path.basename(loc), sorted(nonexistent), pkg=pkg ) if wrong_maintainers: yield WrongMaintainerType( os.path.basename(loc), sorted(wrong_maintainers), pkg=pkg ) def _check_longdescription(self, pkg, loc, doc): if pkg.longdescription is not None: match_ratio = SequenceMatcher(None, pkg.description, pkg.longdescription).ratio() if match_ratio > 0.75: msg = "metadata.xml longdescription closely matches DESCRIPTION" yield RedundantLongDescription(msg, pkg=pkg) elif len(pkg.longdescription) < 80: msg = "metadata.xml longdescription is too short" yield RedundantLongDescription(msg, pkg=pkg) def _check_restricts(self, pkg, loc, doc): restricts = ( c.get("restrict") for path in ("maintainer", "use/flag") for c in doc.xpath(f"/pkgmetadata/{path}[string(@restrict)]") ) for restrict_str in restricts: try: restrict = atom(restrict_str, eapi="0") if restrict.key != pkg.key: yield InvalidMetadataRestrict( restrict_str, "references another package", pkg=pkg ) if restrict.use: yield InvalidMetadataRestrict( restrict_str, "USE-conditionals are prohibited", pkg=pkg ) except MalformedAtom as exc: yield InvalidMetadataRestrict(restrict_str, exc, pkg=pkg) def _check_remote_id(self, pkg, loc, doc): for u in pkg.upstreams: # empty values are already reported as PkgMetadataXmlEmptyElement if not u.name: continue try: validator, expected = self.remote_id_validators[u.type] except KeyError: # pragma: no cover continue if not validator.match(u.name): yield InvalidRemoteID(u.type, u.name, expected, pkg=pkg) def _get_xml_location(self, pkg): """Return the metadata.xml location for a given package.""" return pjoin(os.path.dirname(pkg.ebuild.path), "metadata.xml")
[docs] class CategoryMetadataXmlCheck(_XmlBaseCheck): """Category level metadata.xml scans.""" _source = (sources.CategoryRepoSource, (), (("source", sources.RawRepoSource),)) misformed_error = CatBadlyFormedXml invalid_error = CatInvalidXml missing_error = CatMissingMetadataXml catref_error = CatMetadataXmlInvalidCatRef pkgref_error = CatMetadataXmlInvalidPkgRef indent_error = CatMetadataXmlIndentation empty_element = CatMetadataXmlEmptyElement known_results = frozenset( [ CatBadlyFormedXml, CatInvalidXml, CatMissingMetadataXml, CatMetadataXmlInvalidPkgRef, CatMetadataXmlInvalidCatRef, CatMetadataXmlIndentation, CatMetadataXmlEmptyElement, ] ) def _get_xml_location(self, pkg): """Return the metadata.xml location for a given package's category.""" return pjoin(self.repo_base, pkg.category, "metadata.xml")
[docs] class MissingRemoteId(results.PackageResult, results.Info): """Missing remote-id which was inferred from ebuilds. Based on URIs found in SRC_URI and HOMEPAGE, a remote-id can be suggested. If a remote-id of same type is already defined in ``metadata.xml``, the suggestion won't be reported. It ignores URIs ending with ``.diff`` or ``.patch``, as they might point to a fork or developer's space. It also ignores URIs that are conditional on USE flags. """ def __init__(self, remote_type: str, value: str, uri: str, **kwarg): super().__init__(**kwarg) self.remote_type = remote_type self.value = value self.uri = uri @property def desc(self): return ( f'missing <remote-id type="{self.remote_type}">' f"{self.value}</remote-id> (inferred from URI {self.uri!r})" )
[docs] class MissingRemoteIdCheck(Check): """Detect missing remote-ids based on SRC_URI and HOMEPAGE.""" _source = sources.PackageRepoSource known_results = frozenset([MissingRemoteId]) # Exclude api groups and raw project names to conform with https://docs.gitlab.com/ee/user/reserved_names.html # with the URI's which are most likely to end up in SRC_URI _gitlab_match = r"(?P<value>((?!api/)\w[^/]*/)+(?!raw/)\w[^/]*)" remotes_map = ( ("bitbucket", r"https://bitbucket.org/(?P<value>[^/]+/[^/]+)"), ("codeberg", r"https://codeberg.org/(?P<value>[^/]+/[^/]+)"), ("freedesktop-gitlab", rf"https://gitlab.freedesktop.org/{_gitlab_match}"), ("github", r"https://github.com/(?P<value>[^/]+/[^/]+)"), ("gitlab", rf"https://gitlab.com/{_gitlab_match}"), ("gnome-gitlab", rf"https://gitlab.gnome.org/{_gitlab_match}"), ("heptapod", rf"https://foss.heptapod.net/{_gitlab_match}"), ("kde-invent", rf"https://invent.kde.org/{_gitlab_match}"), ("launchpad", r"https://launchpad.net/(?P<value>[^/]+)"), ("pypi", r"https://pypi.org/project/(?P<value>[^/]+)"), ("pypi", r"https://files.pythonhosted.org/packages/source/\S/(?P<value>[^/]+)"), ("savannah", r"https://savannah.gnu.org/projects/(?P<value>[^/]+)"), ("savannah-nongnu", r"https://savannah.nongnu.org/projects/(?P<value>[^/]+)"), ("sourceforge", r"https://downloads.sourceforge.(net|io)/(?:project/)?(?P<value>[^/]+)"), ("sourceforge", r"https://sourceforge.(net|io)/projects/(?P<value>[^/]+)"), ("sourceforge", r"https://(?P<value>[^/]+).sourceforge.(net|io)/"), ("sourcehut", r"https://sr.ht/(?P<value>[^/]+/[^/]+)"), ) def __init__(self, options, **kwargs): super().__init__(options, **kwargs) self.remotes_map = tuple( (remote_type, re.compile(regex)) for remote_type, regex in self.remotes_map )
[docs] def feed(self, pkgset): remotes = {u.type: (None, None) for u in pkgset[0].upstreams} for pkg in sorted(pkgset, reverse=True): fetchables = iflatten_instance( pkg.generate_fetchables( allow_missing_checksums=True, ignore_unknown_mirrors=True, skip_default_mirrors=True, ), (fetchable, Conditional), ) all_urls = set( chain.from_iterable(f.uri for f in fetchables if isinstance(f, fetchable)) ) urls = {url for url in all_urls if not url.endswith((".patch", ".diff"))} urls = sorted(urls.union(pkg.homepage), key=len) for remote_type, regex in self.remotes_map: if remote_type in remotes: continue for url in urls: if mo := regex.match(url): remotes[remote_type] = (mo.group("value"), url) break for remote_type, (value, url) in remotes.items(): if value is not None: yield MissingRemoteId(remote_type, value, url, pkg=pkgset[0])