Source code for datalad_next.iter_collections.utils

"""Utilities and types for collection iterators"""

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum
import os
from pathlib import (
    Path,
    PurePath,
)
import stat
from typing import (
    TYPE_CHECKING,
    Dict,
    Union,
    Any,
    IO,
    List,
)

from datalad_next.consts import COPY_BUFSIZE
from datalad_next.utils import MultiHash

if TYPE_CHECKING:
    from .annexworktree import AnnexWorktreeFileSystemItem
    from .directory import DirectoryItem
    from .gitworktree import GitWorktreeFileSystemItem
    from io import BufferedReader
    from os import PathLike
    from tarfile import ExFileObject
    from zipfile import ZipExtFile


# TODO Could be `StrEnum`, came with PY3.11
class FileSystemItemType(Enum):
    """Kinds of file system entries reported by collection iterators

    Every member carries a plain ``str`` value. The values were picked so
    that downstream code can pass them on as they are (e.g., as type labels
    in DataLad result records).
    """
    # each value is spelled exactly like its member name
    file = 'file'
    directory = 'directory'
    symlink = 'symlink'
    hardlink = 'hardlink'
    specialfile = 'specialfile'
@dataclass
class NamedItem:
    """Item that is identified by a ``name`` of any type"""
    name: Any


@dataclass
class TypedItem:
    """Item that carries a ``type`` of any type"""
    type: Any


@dataclass
class PathBasedItem(NamedItem):
    """Item whose ``name`` represents a path

    The ``path()`` method converts the native ``name`` value into a
    ``PurePath`` instance. For that purpose ``name`` is handed to the
    ``PurePath`` constructor as its only argument, so it can be, for
    example, a filename, a relative path, or an absolute path -- given
    as a string or as a ``Path`` instance.

    It is recommended to use name/path values that are relative to the
    containing collection (directory, archive, repository, etc.).
    """
    def path(self) -> PurePath:
        """Return the item's ``name`` converted to a ``PurePath``

        This default implementation assumes that ``name`` follows the path
        conventions of the platform the code is running on.
        """
        as_path = PurePath(self.name)
        return as_path
# `kw_only=True` would be nice to have here, but it is PY3.10+ only
@dataclass
class FileSystemItem(PathBasedItem, TypedItem):
    """Path-based, typed item that describes a file system entry

    Only ``type`` and ``size`` are mandatory (next to the inherited
    ``name``), all other properties default to ``None``.
    """
    type: FileSystemItemType
    # `from_path()` fills the next five fields from the `lstat` result
    # (`st_size`, `st_mtime`, `st_mode`, `st_uid`, `st_gid`)
    size: int
    mtime: float | None = None
    mode: int | None = None
    uid: int | None = None
    gid: int | None = None
    # `from_path()` stores the `readlink` result here (symlinks only)
    link_target: str | PathLike[str] | None = None
    # `from_path()` never sets this field
    fp: IO | None = None

    @classmethod
    def from_path(
        cls,
        path: Path,
        *,
        link_target: bool = True,
    ) -> Union[
        DirectoryItem,
        AnnexWorktreeFileSystemItem,
        FileSystemItem,
        GitWorktreeFileSystemItem,
    ]:
        """Create an item from one ``lstat`` call (plus ``readlink``)

        ``path`` has to exist, because it is inspected with ``lstat``.
        The item ``type`` is determined from the reported mode: symlink,
        directory, or -- for anything else -- file.

        When ``link_target`` is enabled and ``path`` is a symlink, the
        outcome of ``readlink`` is recorded in the item's ``link_target``
        property. Otherwise that property keeps its default.
        """
        st = path.lstat()
        mode = st.st_mode
        # whatever is neither a symlink nor a directory is labeled a file;
        # fifos, sockets, etc. are not told apart here
        item_type = (
            FileSystemItemType.symlink if stat.S_ISLNK(mode)
            else FileSystemItemType.directory if stat.S_ISDIR(mode)
            else FileSystemItemType.file
        )
        item = cls(
            name=path,
            type=item_type,
            size=st.st_size,
            mode=mode,
            mtime=st.st_mtime,
            uid=st.st_uid,
            gid=st.st_gid,
        )
        if not link_target or item_type != FileSystemItemType.symlink:
            # no link target requested, or nothing to resolve
            return item
        # `path.readlink()` would be available from PY3.9+,
        # but the performance difference would need checking
        item.link_target = os.readlink(path)
        return item
def compute_multihash_from_fp(
    fp: Union[BufferedReader, ExFileObject, ZipExtFile],
    hash: List[str],
    bufsize: int = COPY_BUFSIZE,
) -> Dict[str, str]:
    """Compute several hashes in a single pass over a file-like object

    The content of ``fp`` is consumed via ``fp.read(bufsize)`` until a read
    yields nothing. Each chunk is fed into a ``MultiHash`` instance that was
    created from ``hash``. ``fp`` is neither rewound nor closed here.

    Returns the outcome of ``MultiHash.get_hexdigest()``.
    """
    hasher = MultiHash(hash)
    chunk = fp.read(bufsize)
    while chunk:
        hasher.update(chunk)
        chunk = fp.read(bufsize)
    return hasher.get_hexdigest()