2800b49
import argparse
2800b49
import csv
2800b49
import fnmatch
2800b49
import os
2800b49
import warnings
2800b49
2800b49
from collections import defaultdict
2800b49
from pathlib import PosixPath, PurePosixPath
2800b49
2800b49
2800b49
class BuildrootPath(PurePosixPath):
    """
    A pure POSIX path that is interpreted inside a buildroot.

    An absolute BuildrootPath is "relative" to the buildroot,
    e.g. /usr/lib stands for %{buildroot}/usr/lib.
    The object itself does not remember which buildroot it belongs to;
    the root is always supplied by the caller.
    """

    @staticmethod
    def from_real(realpath, *, root):
        """
        Convert a real on-disk path located under root into a BuildrootPath.

        realpath has to lie inside root, relative_to() raises ValueError otherwise.

        For example::

            >>> BuildrootPath.from_real(PosixPath('/tmp/buildroot/foo'), root=PosixPath('/tmp/buildroot'))
            BuildrootPath('/foo')
        """
        inside_root = realpath.relative_to(root)
        return BuildrootPath("/") / inside_root

    def to_real(self, root):
        """
        Convert this path into a real path located under the given root.

        The result has the type of root (a PosixPath when root is one).

        For example::

            >>> BuildrootPath('/foo').to_real(PosixPath('/tmp/buildroot'))
            PosixPath('/tmp/buildroot/foo')
        """
        # strip the leading slash, so the join does not discard the root
        without_anchor = self.relative_to("/")
        return root / without_anchor

    def normpath(self):
        """
        Collapse the /../ (and similar) parts of the path purely lexically.

        PurePaths offer no .resolve() and the .resolve() of concrete Paths
        touches real files. This is a replacement that never looks at the disk;
        it assumes there are no symbolic links on the way.

        Example:

            >>> BuildrootPath('/usr/lib/python/../pypy').normpath()
            BuildrootPath('/usr/lib/pypy')
        """
        collapsed = os.path.normpath(self)
        return type(self)(collapsed)
2800b49
2800b49
2800b49
def locate_record(root, sitedirs):
    """
    Locate the one and only RECORD file of the installed distribution.

    root is the real path of the buildroot, sitedirs are BuildrootPaths.
    Only *.dist-info/RECORD files directly inside the sitedirs are considered.

    Returns a PosixPath of the RECORD file.

    Raises FileNotFoundError when there is no RECORD
    and FileExistsError when there is more than one.
    """
    found = [
        record
        for sitedir in sitedirs
        for record in sitedir.to_real(root).glob("*.dist-info/RECORD")
    ]

    sitedirs_text = ", ".join(map(str, sitedirs))
    if not found:
        raise FileNotFoundError(f"There is no *.dist-info/RECORD in {sitedirs_text}")
    if len(found) > 1:
        raise FileExistsError(f"Multiple *.dist-info directories in {sitedirs_text}")

    (only_record,) = found
    return only_record
2800b49
2800b49
2800b49
def read_record(record_path):
    """
    Lazily read a RECORD file, yielding one parsed CSV row at a time.

    https://www.python.org/dev/peps/pep-0376/#record

    Each row is a list of str-path, hash, size -- the last two may be empty.
    Only the paths matter to the rest of this script.
    The file stays open until the generator is exhausted or closed.

    Example:

        >>> g = read_record(PosixPath('./test_RECORD'))
        >>> next(g)
        ['../../../bin/__pycache__/tldr.cpython-....pyc', '', '']
        >>> next(g)
        ['../../../bin/tldr', 'sha256=...', '12766']
        >>> next(g)
        ['../../../bin/tldr.py', 'sha256=...', '12766']
    """
    # newline="" lets the csv module deal with the line endings itself
    with open(record_path, newline="", encoding="utf-8") as record_file:
        rows = csv.reader(
            record_file, delimiter=",", quotechar='"', lineterminator=os.linesep
        )
        for row in rows:
            yield row
2800b49
2800b49
2800b49
def parse_record(record_path, record_content):
    """
    Turn RECORD rows into normalized BuildrootPaths, lazily.

    params:
    record_path: RECORD BuildrootPath
    record_content: iterable of RECORD triplets
                    first item is a str-path relative to the directory the dist-info directory is in
                    (the standard allows absolute paths as well, but pip does not produce them)

    Returns a generator of BuildrootPaths.

    Examples:

        >>> next(parse_record(BuildrootPath('/usr/lib/python3.7/site-packages/requests-2.22.0.dist-info/RECORD'),
        ...                   [('requests/sessions.py', 'sha256=xxx', '666'), ...]))
        BuildrootPath('/usr/lib/python3.7/site-packages/requests/sessions.py')

        >>> next(parse_record(BuildrootPath('/usr/lib/python3.7/site-packages/tldr-0.5.dist-info/RECORD'),
        ...                   [('../../../bin/tldr', 'sha256=yyy', '777'), ...]))
        BuildrootPath('/usr/bin/tldr')
    """
    # two levels up: RECORD -> dist-info directory -> site directory
    site_root = record_path.parent.parent
    # joining with an absolute entry discards site_root,
    # normpath() then gets rid of any .. parts
    return (site_root.joinpath(entry[0]).normpath() for entry in record_content)
2800b49
2800b49
2800b49
def pycached(script, python_version):
    """
    Return a two item list: the script path and a glob for its bytecode files.
    Mirrors what the %pycached macro does.

    script is a BuildrootPath of a .py file,
    python_version is a string such as '3.8'; only major and minor are used.
    The bytecode glob is expressed as a BuildrootPath as well.

    Examples:

        >>> pycached(BuildrootPath('/whatever/bar.py'), '3.8')
        [BuildrootPath('/whatever/bar.py'), BuildrootPath('/whatever/__pycache__/bar.cpython-38{,.opt-?}.pyc')]

        >>> pycached(BuildrootPath('/opt/python3.10/foo.py'), '3.10')
        [BuildrootPath('/opt/python3.10/foo.py'), BuildrootPath('/opt/python3.10/__pycache__/foo.cpython-310{,.opt-?}.pyc')]
    """
    assert script.suffix == ".py"
    # '3.10' -> ['3', '10'] -> '310'
    major_minor = python_version.split(".")[:2]
    tag = "".join(major_minor)
    bytecode_glob = script.parent.joinpath(
        "__pycache__", f"{script.stem}.cpython-{tag}{{,.opt-?}}.pyc"
    )
    return [script, bytecode_glob]
2800b49
2800b49
2800b49
def add_file_to_module(paths, module_name, module_type, *files):
    """
    Helper procedure, adds given files to the module_name of a given module_type.

    paths is the classification dict; paths["modules"][module_name] has to be
    a list (possibly empty) of {"type": ..., "files": [...]} dicts.

    When an entry of module_type already exists, the files are appended to it,
    unless the first of them is already present (all or nothing).
    Otherwise a new entry is created.
    Calling this with no files never raises; it changes nothing if the entry exists.

    Modifies paths in place, returns None.
    """
    entries = paths["modules"][module_name]
    for entry in entries:
        if entry["type"] != module_type:
            continue
        # the first file stands for the whole group, see pycached();
        # the guard avoids an IndexError when no files were given
        if files and files[0] not in entry["files"]:
            entry["files"].extend(files)
        return

    entries.append({"type": module_type, "files": list(files)})
2800b49
2800b49
2800b49
def classify_paths(
    record_path, parsed_record_content, sitedirs, bindir, python_version
):
    """
    For each BuildrootPath in parsed_record_content classify it to a dict structure
    that allows to filter the files for the %files section easier.

    params:
    record_path: RECORD BuildrootPath, its parent is the dist-info directory
    parsed_record_content: iterable of BuildrootPaths, e.g. from parse_record()
    sitedirs: BuildrootPaths of the site directories
    bindir: BuildrootPath of the directory with executables
    python_version: version string used for the bytecode globs, e.g. '3.8'

    For the dict structure, look at the beginning of this function's code.

    Each "module" is a dict with "type" ("package", "script", "extension") and "files".

    Everything located anywhere under the dist-info directory is metadata;
    directories nested in dist-info are recorded as metadata "dirs".
    Bytecode (.pyc) is skipped. Files that cannot be classified produce
    a warning and end up in "other".
    """
    distinfo = record_path.parent
    paths = {
        "metadata": {
            "files": [],  # regular %file entries with dist-info content
            "dirs": [distinfo],  # %dir %file entries with dist-info directory
            "docs": [],  # to be used once there is upstream way to recognize READMEs
            "licenses": [],  # to be used once there is upstream way to recognize LICENSEs
        },
        "modules": defaultdict(list),  # each importable module (directory, .py, .so)
        "executables": {"files": []},  # regular %file entries in %{_bindir}
        "other": {"files": []},  # regular %file entries we could not parse :(
    }

    # In RECORDs generated by pip, there are no directories, only files.
    # The example RECORD from PEP 376 does not contain directories either.
    # Hence, we'll only assume files, but TODO get it officially documented.
    for path in parsed_record_content:
        if path.suffix == ".pyc":
            # we handle bytecode separately
            continue

        if distinfo in path.parents:
            # TODO is this a license/documentation?
            paths["metadata"]["files"].append(path)
            # A file deeper in dist-info (e.g. licenses/LICENSE) must not be
            # mistaken for a package; record the directories between
            # dist-info and the file as %dir instead.
            depth = path.parents.index(distinfo)
            # path.parents cannot be sliced directly before Python 3.10
            for nested_dir in list(path.parents)[:depth]:
                if nested_dir not in paths["metadata"]["dirs"]:
                    paths["metadata"]["dirs"].append(nested_dir)
            continue

        if path.parent == bindir:
            paths["executables"]["files"].append(path)
            continue

        for sitedir in sitedirs:
            if sitedir not in path.parents:
                continue

            if path.parent != sitedir:
                # this file is inside a dir, we classify that dir
                # (the parent right below sitedir is the top-level directory)
                index = path.parents.index(sitedir)
                module_dir = path.parents[index - 1]
                add_file_to_module(paths, module_dir.name, "package", module_dir)
            elif path.suffix == ".so":
                # extension modules can have 2 suffixes
                name = BuildrootPath(path.stem).stem
                add_file_to_module(paths, name, "extension", path)
            elif path.suffix == ".py":
                name = path.stem
                add_file_to_module(
                    paths, name, "script", *pycached(path, python_version)
                )
            else:
                # TODO classify .pth files
                warnings.warn(f"Unrecognized file: {path}")
                paths["other"]["files"].append(path)
            break
        else:
            # not in dist-info, bindir or any of the sitedirs
            warnings.warn(f"Unrecognized file: {path}")
            paths["other"]["files"].append(path)

    return paths
2800b49
2800b49
2800b49
def generate_file_list(paths_dict, module_globs, include_executables=False):
    """
    This function takes the classified paths_dict and turns it into lines
    for the %files section. Returns sorted list with text lines, no Path objects.

    params:
    paths_dict: the dict structure produced by classify_paths()
    module_globs: any iterable of module name globs (set, list, generator, ...)
    include_executables: whether to include the files from "executables"

    Only includes files from modules that match module_globs, metadata and
    optional executables. Matching is case-sensitive (fnmatch.fnmatchcase).

    It asserts that all globs match at least one module, raises ValueError otherwise.
    Multiple globs matching identical module(s) are OK.
    """
    # a set is needed for the difference below; this also consumes generators once
    wanted_globs = set(module_globs)

    files = set()

    if include_executables:
        files.update(f"{p}" for p in paths_dict["executables"]["files"])

    metadata = paths_dict["metadata"]
    files.update(f"{p}" for p in metadata["files"])
    for macro in "dir", "doc", "license":
        files.update(f"%{macro} {p}" for p in metadata[f"{macro}s"])

    matched_globs = set()
    for name, module_entries in paths_dict["modules"].items():
        hits = {glob for glob in wanted_globs if fnmatch.fnmatchcase(name, glob)}
        if not hits:
            continue
        matched_globs |= hits

        for module in module_entries:
            # packages are directories, they are listed with a trailing slash
            suffix = "/" if module["type"] == "package" else ""
            files.update(f"{p}{suffix}" for p in module["files"])

    missed = wanted_globs - matched_globs
    if missed:
        missed_text = ", ".join(sorted(missed))
        raise ValueError(f"Globs did not match any module: {missed_text}")

    return sorted(files)
2800b49
2800b49
2800b49
def parse_varargs(varargs):
    """
    Parse varargs from the %pyproject_save_files macro

    Arguments starting with + are treated as flags, everything else is a glob.
    The only known flag is +bindir.

    Returns a set of globs and a boolean flag whether to include executables from bindir

    Raises ValueError for unknown flags and globs with dots (namespace packages).
    The arguments are checked in order, the first bad one raises.

    Good examples:

        >>> parse_varargs(['*'])
        ({'*'}, False)

        >>> mods, bindir = parse_varargs(['requests*', 'kerberos', '+bindir'])
        >>> bindir
        True
        >>> sorted(mods)
        ['kerberos', 'requests*']

        >>> mods, bindir = parse_varargs(['tldr', 'tensorf*'])
        >>> bindir
        False
        >>> sorted(mods)
        ['tensorf*', 'tldr']

        >>> parse_varargs(['+bindir'])
        (set(), True)

    Bad examples:

        >>> parse_varargs(['+kinkdir'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +kinkdir

        >>> parse_varargs(['good', '+bad', '*ugly*'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +bad

        >>> parse_varargs(['+bad', 'my.bad'])
        Traceback (most recent call last):
          ...
        ValueError: Invalid argument: +bad

        >>> parse_varargs(['mod', 'mod.*'])
        Traceback (most recent call last):
          ...
        ValueError: Attempted to use a namespaced package with dot in the glob: mod.*. ...

        >>> parse_varargs(['my.bad', '+bad'])
        Traceback (most recent call last):
          ...
        ValueError: Attempted to use a namespaced package with dot in the glob: my.bad. ...
    """
    want_bindir = False
    module_globs = set()

    for arg in varargs:
        if arg.startswith("+"):
            if arg != "+bindir":
                raise ValueError(f"Invalid argument: {arg}")
            want_bindir = True
            continue

        if "." in arg:
            # suggest the top-level name, i.e. the part before the first dot
            top = arg.split(".", 1)[0]
            raise ValueError(
                f"Attempted to use a namespaced package with dot in the glob: {arg}. "
                f"That is not (yet) supported. Use {top} instead and/or file a Bugzilla explaining your use case."
            )

        module_globs.add(arg)

    return module_globs, want_bindir
2800b49
2800b49
2800b49
def pyproject_save_files(buildroot, sitelib, sitearch, bindir, python_version, varargs):
    """
    Takes arguments from the %{pyproject_save_files} macro

    buildroot is a real path, sitelib, sitearch and bindir are BuildrootPaths,
    varargs are the globs and +flags understood by parse_varargs().

    Returns list of paths for the %files section
    """
    # On 32 bit architectures, sitelib equals to sitearch
    # Going trough a set makes sure we don't browse one directory twice
    sitedirs = sorted({sitelib, sitearch})

    module_globs, want_bindir = parse_varargs(varargs)

    # the RECORD is needed twice: as a real path for reading
    # and as a BuildrootPath for resolving the entries in it
    real_record = locate_record(buildroot, sitedirs)
    record = BuildrootPath.from_real(real_record, root=buildroot)
    record_rows = read_record(real_record)

    classified = classify_paths(
        record,
        parse_record(record, record_rows),
        sitedirs,
        bindir,
        python_version,
    )
    return generate_file_list(classified, module_globs, want_bindir)
2800b49
2800b49
2800b49
def main(cli_args):
    """
    Generate the %files lines from the parsed command line arguments
    and write them to cli_args.output, one per line, UTF-8 encoded.

    cli_args is the namespace produced by argparser().parse_args().
    """
    lines = pyproject_save_files(
        buildroot=cli_args.buildroot,
        sitelib=cli_args.sitelib,
        sitearch=cli_args.sitearch,
        bindir=cli_args.bindir,
        python_version=cli_args.python_version,
        varargs=cli_args.varargs,
    )

    # the output always ends with a newline
    text = "\n".join(lines) + "\n"
    cli_args.output.write_text(text, encoding="utf-8")
2800b49
2800b49
2800b49
def argparser():
    """
    Build the command line parser of this script.

    All the options are required; real disk paths are converted to PosixPath,
    paths inside the buildroot to BuildrootPath.
    At least one positional argument (glob or +flag) is required.
    """
    required_options = (
        ("--output", PosixPath),
        ("--buildroot", PosixPath),
        ("--sitelib", BuildrootPath),
        ("--sitearch", BuildrootPath),
        ("--bindir", BuildrootPath),
        ("--python-version", str),
    )

    parser = argparse.ArgumentParser()
    # options are "optional" to argparse by default, group them for a nicer --help
    group = parser.add_argument_group("required arguments")
    for flag, converter in required_options:
        group.add_argument(flag, type=converter, required=True)
    parser.add_argument("varargs", nargs="+")
    return parser
2800b49
2800b49
2800b49
if __name__ == "__main__":
    # Script entry point: parse the command line, then write the %files list.
    cli_args = argparser().parse_args()
    main(cli_args)