Source code for datalad_next.types.archivist

"""``dl+archive:`` archive member locator"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import PurePosixPath
import re

from .annexkey import AnnexKey
from .enums import ArchiveType


# Deliberately lenient outer match: whatever sits between the
# ``dl+archive:`` prefix and the (last) ``#`` is taken as the key,
# whatever follows is taken as the archive member properties.
_recognized_urls = re.compile(
    r'^dl\+archive:(?P<key>.*)#(?P<props>.*)'
)

# Properties of an archive member. A member is identified by a (relative)
# path inside the archive; the remaining properties are optional, but
# must appear in exactly this order when present.
_archive_member_props = re.compile(
    # mandatory member path: any run of characters other than '&'
    # TODO check that something in the machinery ensures proper
    # quoting
    'path=(?P<path>[^&]+)'
    # optional size of the member (in bytes), digits only
    '(&size=(?P<size>[0-9]+)|)'
    # optional archive type label, lower-case alphanumeric
    '(&atype=(?P<atype>[a-z0-9]+)|)'
)


@dataclass
class ArchivistLocator:
    """Representation of a ``dl+archive:`` archive member locator

    These locators are used by the ``datalad-archives`` and ``archivist``
    git-annex special remotes. They identify a member of an archive that is
    itself identified by an annex key.

    Each member can be annotated with its size (in bytes). Optionally,
    the file format type of the archive can be annotated too.

    Syntax of ``dl+archive:`` locators
    ----------------------------------

    The locators have the following minimal form::

        dl+archive:<archive-key>#path=<path-in-archive>

    where ``<archive-key>`` is a regular git-annex key of an archive file,
    and ``<path-in-archive>`` is a POSIX-style relative path pointing to a
    member within the archive.

    Two optional, additional attributes ``size`` and ``atype`` are recognized
    (only ``size`` is also understood by the ``datalad-archives``
    special remote).

    ``size`` declares the size of the (extracted) archive member in bytes::

        dl+archive:<archive-key>#path=<path-in-archive>&size=<size-in-bytes>

    ``atype`` declares the type of the containing archive using a label.
    Currently recognized labels are ``tar`` (a TAR archive, compressed or
    not), and ``zip`` (a ZIP archive). See
    :class:`~datalad_next.types.enums.ArchiveType` for all recognized labels.

    If no type information is given, :func:`ArchivistLocator.from_str()` will
    try to determine the archive type from the archive key (via ``*E``-type
    git-annex backends, such as DataLad's default ``MD5E``).

    The order in the fragment part of the URL (after ``#``) is significant.
    ``path`` must come first, followed by ``size`` or ``atype``. If both
    ``size`` and ``atype`` are present, ``size`` must be declared first. A
    complete example of a URL is::

        dl+archive:MD5-s389--e9f624eb778e6f945771c543b6e9c7b2#path=dir/file.csv&size=234&atype=tar
    """
    # annex key of the archive that contains the member
    akey: AnnexKey
    # relative POSIX path of the member inside the archive
    member: PurePosixPath
    # size of the member in bytes, if known
    size: int | None = None
    # datalad-archives did not have the type info, we want to be
    # able to handle those too, make optional
    atype: ArchiveType | None = None

    def __str__(self) -> str:
        """Return the locator in its ``dl+archive:`` URL form.

        The ``size`` and ``atype`` fragments are only included when the
        respective attribute is set, so that the result can be parsed
        again by :meth:`from_str`.
        """
        # an unset size must be left out entirely; rendering it as
        # ``size=None`` would not be parsable and would also hide a
        # subsequent ``atype`` declaration from the parser
        size = '' if self.size is None else f'&size={self.size}'
        atype = f'&atype={self.atype.value}' if self.atype else ''
        # TODO needs quoting? (applies to the member path)
        return f'dl+archive:{self.akey}#path={self.member}{size}{atype}'

    @classmethod
    def from_str(cls, url: str) -> ArchivistLocator:
        """Return ``ArchivistLocator`` from ``str`` form

        Parameters
        ----------
        url: str
          Locator in the ``dl+archive:<key>#path=<path>[&size=..][&atype=..]``
          form.

        Raises
        ------
        ValueError
          If the overall locator syntax is not recognized, if the member
          specification is invalid, if the member path is absolute or
          contains ``..``, or if a declared archive type is unknown.
          Errors from parsing the key via ``AnnexKey.from_str()`` are not
          handled here.
        """
        url_match = _recognized_urls.match(url)
        if not url_match:
            raise ValueError('Unrecognized dl+archives locator syntax')
        url_matched = url_match.groupdict()

        # convert to desired type
        akey = AnnexKey.from_str(url_matched['key'])

        # archive member properties
        props_match = _archive_member_props.match(url_matched['props'])
        if not props_match:
            # without at least a 'path' there is nothing we can do here
            raise ValueError(
                'dl+archives locator contains invalid archive member '
                f'specification: {url_matched["props"]!r}')
        props_matched = props_match.groupdict()

        # a member must be addressed relative to the archive root, and
        # must not be able to point outside of it
        amember_path = PurePosixPath(props_matched['path'])
        if amember_path.is_absolute():
            raise ValueError(
                'dl+archives locator contains absolute archive member path')
        if '..' in amember_path.parts:
            raise ValueError(
                'dl+archives locator archive member path contains ".."')

        # size is optional, regex ensures that it is an int
        size = props_matched.get('size')
        if size is not None:
            size = int(size)

        # archive type, could be None
        atype = props_matched.get('atype')
        if atype is not None:
            # if given, must be known type
            try:
                atype = getattr(ArchiveType, atype)
            except AttributeError as e:
                raise ValueError(
                    'dl+archives locator archive type unrecognized') from e
            # a label could match some other attribute of the class
            # (e.g. a method name); only actual type members are valid
            if not isinstance(atype, ArchiveType):
                raise ValueError(
                    'dl+archives locator archive type unrecognized')

        if atype is None and akey.backend.endswith('E'):
            # try by key name extension
            suf = PurePosixPath(akey.name).suffixes
            # a key name may come without any extension, in which case
            # the type simply remains undetermined
            if suf and suf[-1] == '.zip':
                atype = ArchiveType.zip
            elif '.tar' in suf or '.tgz' in suf:
                atype = ArchiveType.tar

        return cls(
            akey=akey,
            member=amember_path,
            size=size,
            atype=atype,
        )