Source code for datalad_core.create

"""Create repositories and datasets


.. currentmodule:: datalad_core.create
.. autosummary::
   :toctree: generated

   create_annexrepo
   create_dataset
   create_repo
   DatasetInitMode
"""

# Public API of this module; the same four names are listed in the
# autosummary directive of the module docstring.
__all__ = [
    'DatasetInitMode',
    'create_annexrepo',
    'create_dataset',
    'create_repo',
]


import enum
import random
import uuid
from os import environ
from pathlib import (
    Path,
    PurePosixPath,
)

from datalad_core.annex_utils import AnnexInitMode
from datalad_core.git_utils import apply_changeset
from datalad_core.repo import (
    Repo,
    Worktree,
)
from datalad_core.runners import (
    call_git,
    call_git_oneline,
)


class DatasetInitMode(enum.Flag):
    """Selectable aspects of DataLad dataset initialization

    The members are bit flags. They can be OR'ed to request several
    initialization aspects at once.
    """

    # record a unique dataset identifier in ``.datalad/config``
    ID = enum.auto()
    # add a ``.gitattributes`` rule that keeps ``.git*`` files out of the annex
    PROTECTGIT = enum.auto()
    # set one annex key backend for all files via ``.gitattributes``
    BACKEND = enum.auto()
def create_repo(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create a repository in a new or empty directory

    This is a thin frontend for the ``git-init`` command for creating a new
    repository in ``directory``. The parameters ``bare``, ``branch``, and
    ``quiet`` map onto the ``--bare``, ``--initial-branch``, and ``--quiet``
    options of ``git-init``; an option is only passed when its parameter is
    set to a true value.

    If the target directory does not yet exist, it will be created, including
    any missing parent directories.

    Depending on the ``bare`` parameter, a ``Repo`` or ``Worktree`` instance
    is returned.
    """
    # Create first and tolerate "already exists", rather than testing for
    # existence beforehand: a directory that appears between a check and the
    # mkdir() call must not make us fail. Whether an existing path is actually
    # usable (e.g., it is not a regular file) is left for `git init` to judge.
    try:
        directory.mkdir(parents=True)
    except FileExistsError:
        pass

    cmd = ['init']
    if bare:
        cmd.append('--bare')
    if branch:
        cmd.extend(('--initial-branch', branch))
    if quiet:
        cmd.append('--quiet')
    cmd.append(str(directory))
    call_git(cmd)

    return Repo(directory) if bare else Worktree(directory)
def create_annexrepo(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    description: str | None = None,
    annex_init: AnnexInitMode = AnnexInitMode.FULL,
    private: bool = False,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create an annex-repository in a new/empty directory

    The repository itself is created by :func:`create_repo`, which receives
    the parameters ``directory``, ``bare``, ``branch``, and ``quiet``
    unmodified. Its return value is relayed to the caller.

    Unless ``annex_init`` is ``AnnexInitMode.OFF``, the repository annex is
    initialized afterwards (either with standard behavior (default), or
    without enabling special remotes set to auto-enable).

    With ``private`` enabled, the local repository annex will not be recorded
    in the git-annex branch (see `private annexes
    <https://git-annex.branchable.com/tips/cloning_a_repository_privately>`_).

    A ``description`` can be given to label the local repository annex.
    """
    repo = create_repo(directory, bare=bare, branch=branch, quiet=quiet)
    # getting here means create_repo() succeeded, so ``directory`` exists
    # and is usable for the annex initialization
    if annex_init != AnnexInitMode.OFF:
        _init_annex(directory, private=private, description=description)
    return repo
def create_dataset(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    # full datalad dataset init by default
    dataset_init: DatasetInitMode | None = DatasetInitMode.ID
    | DatasetInitMode.PROTECTGIT
    | DatasetInitMode.BACKEND,
    description: str | None = None,
    message: str | None = None,
    annex_init: AnnexInitMode = AnnexInitMode.FULL,
    private: bool = False,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create a DataLad dataset in a new/empty directory

    This function is highly similar to :func:`create_annexrepo`. On top of
    that, it populates the default or given ``branch`` with the basic
    components that define a DataLad dataset, and commits this setup with a
    customizable ``message``.

    Different flavors of dataset initialization are supported, and can be
    individually combined:

    - :data:`DatasetInitMode.ID` sets a unique dataset identifier
    - :data:`DatasetInitMode.PROTECTGIT` configures ``gitattributes`` to
      prevent annexing any ``.git*`` file or directory
    - :data:`DatasetInitMode.BACKEND` sets a fixed annex key backend for all
      annex keys (either indicated by the ``annex.backend`` Git configuration
      setting or ``MD5E``; see
      `backends <https://git-annex.branchable.com/backends>`_)

    The desired selection of initialization modes can be OR'ed and given as
    ``dataset_init`` parameter. By default, all initialization aspects are
    performed. With ``None``, the dataset initialization is skipped.
    """
    repo = create_repo(directory, bare=bare, branch=branch, quiet=quiet)
    # getting here means create_repo() succeeded, so ``directory`` exists
    # and is usable

    # the dataset setup is committed BEFORE any annex init, because annex
    # init can add the complication of adjusted modes
    if dataset_init is not None:
        _init_dataset(repo, branch=branch, message=message, mode=dataset_init)

    if annex_init != AnnexInitMode.OFF:
        _init_annex(directory, private=private, description=description)
    return repo
def _init_annex(
    directory: Path,
    *,
    private: bool = False,
    description: str | None = None,
) -> None:
    """Initialize the annex of the repository at ``directory``

    Runs ``git annex init --quiet`` with ``directory`` as working directory.
    If ``private`` is set, ``annex.private=true`` is added to the Git
    configuration first. A non-empty ``description`` is passed on to
    ``git annex init``.
    """
    # we are not recycling any other implementation here (e.g., from clone()),
    # because we know this is simpler. It is a new repo, no remotes,
    # no special remotes, nothing to enable, no prompting...
    if private:
        call_git(['config', '--add', 'annex.private', 'true'], cwd=directory)
    init_cmd = ['annex', 'init', '--quiet']
    if description:
        init_cmd.append(description)
    call_git(init_cmd, cwd=directory)


def _init_dataset(
    created: Repo | Worktree,
    *,
    branch: str | None,
    message: str | None,
    mode: DatasetInitMode,
) -> None:
    """Commit the dataset components selected by ``mode`` to ``created``

    Assembles the content of up to three files, depending on the flags set
    in ``mode``, and hands them to ``apply_changeset`` together with
    ``branch`` and the commit ``message`` (``'initialized dataset'`` when no
    message is given):

    - ``.datalad/config`` and ``.datalad/.gitattributes``
      (``DatasetInitMode.ID``)
    - ``.gitattributes`` (``DatasetInitMode.BACKEND`` and/or
      ``DatasetInitMode.PROTECTGIT``)

    If ``mode`` selects nothing, no changeset is applied.
    """
    content: dict[PurePosixPath | str, str] = {}

    if DatasetInitMode.ID in mode:
        # dataset id is a random UUID4, but when we have an explicit
        # SEED set, use "random" bits that are made deterministic
        # via setting this seed in `datalad_core.__init__`
        ds_uuid = (
            uuid.UUID(int=random.getrandbits(128))
            if 'DATALAD_SEED' in environ
            else uuid.uuid4()
        )
        # the complete content of the config file
        content['.datalad/config'] = f'[datalad "dataset"]\n\tid = {ds_uuid}\n'
        # attribute rule for the config file written just above
        content['.datalad/.gitattributes'] = 'config annex.largefiles=nothing\n'

    global_attributes = ''
    if DatasetInitMode.BACKEND in mode:
        # honor a configured ``annex.backend``, fall back on MD5E otherwise
        backend = call_git_oneline(
            ['config', '--default', 'MD5E', 'annex.backend'],
            cwd=created.path,
        )
        global_attributes += f'* annex.backend={backend}\n'
    if DatasetInitMode.PROTECTGIT in mode:
        global_attributes += '**/.git* annex.largefiles=nothing\n'
    if global_attributes:
        content['.gitattributes'] = global_attributes

    if not content:
        # an empty flag selection (e.g., ``DatasetInitMode(0)``) passes the
        # caller's ``is not None`` test, but leaves nothing to record.
        # Do not hand an empty changeset to apply_changeset().
        return

    # the rest is about committing the changes made above
    apply_changeset(
        created,
        content,
        message=message or 'initialized dataset',
        branch=branch,
    )