"""Report on the content of a Git tree-ish
The main functionality is provided by the :func:`iter_gittree()` function.
"""
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from functools import cached_property
import logging
from pathlib import (
Path,
PurePosixPath,
)
from typing import (
Iterator,
Generator,
)
from datasalad.itertools import (
decode_bytes,
itemize,
)
from datalad_next.runners import iter_git_subproc
from .utils import PathBasedItem
# Module-level logger. The logger name is hard-coded here, rather than being
# derived from ``__name__``.
lgr = logging.getLogger('datalad.ext.next.iter_collections.gittree')
# TODO: Could be a `StrEnum`, which was introduced with Python 3.11
[docs]
class GitTreeItemType(Enum):
    """Kinds of items that can be found in a Git tree

    The value of every member is a ``str`` that is identical to the
    member's name.
    """
    # items that are reported with mode 100644
    file = 'file'
    # items that are reported with mode 100755
    executablefile = 'executablefile'
    # items that are reported with mode 120000
    symlink = 'symlink'
    # items that are reported with mode 040000
    directory = 'directory'
    # items that are reported with mode 160000
    submodule = 'submodule'
@dataclass
class GitTreeItem(PathBasedItem):
    """``PathBasedItem`` whose name is a relative path (in POSIX conventions)

    Attributes
    ----------
    name: str
      Relative path of the item, in POSIX conventions.
    gitsha: str or None
      Hash of the item as computed by ``git hash-object``. This is not the
      SHA1 of the plain file content.
    gittype: GitTreeItemType or None
      Type of the item.
    """
    name: str
    # NOTE: ``gitsha`` is what `git hash-object` prints, not the sha1 of the
    # file content alone. `git hash-object` does something like
    # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum`
    gitsha: str | None = None
    gittype: GitTreeItemType | None = None

    @cached_property
    def path(self) -> PurePosixPath:
        """Returns the item name as a ``PurePosixPath`` instance

        The instance is created on first access and cached afterwards.
        """
        return PurePosixPath(self.name)
# Lookup table from the mode string of a ``git ls-tree`` record to the
# matching item type. Only the modes listed here are known; looking up any
# other mode raises ``KeyError``.
_mode_type_map: dict[str, GitTreeItemType] = {
    '100644': GitTreeItemType.file,
    '100755': GitTreeItemType.executablefile,
    '040000': GitTreeItemType.directory,
    '120000': GitTreeItemType.symlink,
    '160000': GitTreeItemType.submodule,
}
[docs]
def iter_gittree(
    path: Path,
    treeish: str,
    *,
    recursive: str = 'repository',
) -> Generator[GitTreeItem, None, None]:
    """Report on the items of a tree in a Git repository via ``git ls-tree``

    Parameters
    ----------
    path: Path
      Directory inside a Git repository to report on. It need not be the
      root directory of the repository, but it must be part of the
      repository. If it is not the root directory of a non-bare repository,
      only items underneath that directory are reported.
    treeish: str
      Identifier of the tree to report on. It is handed to ``git ls-tree``
      unmodified.
    recursive: {'repository', 'no'}, optional
      Behavior for recursion into subtrees. By default (``repository``),
      all trees within the repository underneath ``path`` are reported,
      but not trees within submodules. If ``no``, only direct children
      are reported on.

    Yields
    ------
    :class:`GitTreeItem`
      The ``name`` attribute of an item is a ``str`` with the corresponding
      (relative) path, as reported by Git (in POSIX conventions).
    """
    # Coerce to ``Path`` once, up front, for peace of mind. The signature
    # already asks for a ``Path``, but that is easy to forget or ignore, and
    # passing something else leads to non-obvious errors. Doing the
    # conversion once is a cheap safety net.
    repo_dir = Path(path)

    # Notes on ``git ls-tree`` options that are deliberately NOT used:
    # - ``--long`` would also report the object size, but it has a
    #   substantial runtime impact (about 10x slower in test cases; the
    #   main factor for the slowdown is unclear).
    # - a custom format that would allow for a single split by tab, i.e.
    #   ``--format=%(objectmode)%x09%(objectname)%x09%(path)``, makes Git
    #   quote paths with special characters (like tab) again.
    #
    # Only ``recursive='repository'`` enables recursion; every other value
    # results in a non-recursive listing.
    lstree_opts = ['-r'] if recursive == 'repository' else []

    yield from map(
        _get_tree_item,
        _git_ls_tree(repo_dir, treeish, *lstree_opts),
    )
def _get_tree_item(spec: str) -> GitTreeItem:
    """Convert a single ``git ls-tree`` record into a ``GitTreeItem``

    Parameters
    ----------
    spec: str
      One record, with the properties (mode, type name, hash; separated by
      single spaces) and the path separated by a tab.

    Returns
    -------
    GitTreeItem
      Item with ``name``, ``gitsha``, and ``gittype`` set.

    Raises
    ------
    ValueError
      If the record cannot be split into the expected pieces.
    KeyError
      If the mode of the record is not present in ``_mode_type_map``.
    """
    # only the first tab separates the properties from the path, any
    # further tab is part of the path itself
    properties, item_path = spec.split('\t', maxsplit=1)
    fields = properties.split(' ')
    # Every second field, starting with the first: this picks the mode and
    # the hash, and skips the type name (blob/tree etc.) in between. The
    # type name is not needed, because the lookup by mode provides more
    # detail.
    item_mode, item_sha = fields[::2]
    item_type = _mode_type_map[item_mode]
    return GitTreeItem(name=item_path, gitsha=item_sha, gittype=item_type)
def _git_ls_tree(path: Path, *args) -> Iterator[str]:
    """Run ``git ls-tree -z`` and yield its output record by record

    Parameters
    ----------
    path: Path
      Working directory for the Git subprocess.
    *args
      Further command line arguments. They are appended to the command
      as is, after ``-z``.

    Yields
    ------
    str
      One zero-byte-delimited record of the output at a time, without the
      delimiter.
    """
    # ``-z`` is mandatory, the record splitting below relies on zero-byte
    # separators; anything else is taken as it comes in
    cmd = ['ls-tree', '-z', *args]
    with iter_git_subproc(cmd, cwd=path) as proc_out:
        decoded = decode_bytes(proc_out)
        yield from itemize(decoded, sep='\0', keep_ends=False)