"""Create repositories and datasets

.. currentmodule:: datalad_core.create
.. autosummary::
   :toctree: generated

   create_annexrepo
   create_dataset
   create_repo
   DatasetInitMode
"""
# Public API of this module; the entries match the names listed in the
# module docstring's autosummary. The `_init_*` helpers are private.
__all__ = [
'DatasetInitMode',
'create_annexrepo',
'create_dataset',
'create_repo',
]
import enum
import random
import uuid
from os import environ
from pathlib import (
Path,
PurePosixPath,
)
from datalad_core.annex_utils import AnnexInitMode
from datalad_core.git_utils import apply_changeset
from datalad_core.repo import (
Repo,
Worktree,
)
from datalad_core.runners import (
call_git,
call_git_oneline,
)
[docs]
class DatasetInitMode(enum.Flag):
    """Selectable aspects of DataLad dataset initialization

    Being an :class:`enum.Flag`, individual members can be OR'ed to select
    any combination of initialization aspects.
    """

    # commit a ``.datalad/config`` file that records a unique dataset ID
    ID = enum.auto()
    # commit a ``.gitattributes`` rule (``**/.git* annex.largefiles=nothing``)
    # that keeps any ``.git*`` file out of the annex
    PROTECTGIT = enum.auto()
    # commit a ``.gitattributes`` rule (``* annex.backend=...``) that fixes
    # the annex key backend
    BACKEND = enum.auto()
[docs]
def create_repo(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create a repository in a new or empty directory

    This is a thin frontend for the ``git-init`` command for creating a new
    repository in ``directory``. The ``bare``, ``branch``, and ``quiet``
    parameters are translated to the ``--bare``, ``--initial-branch``, and
    ``--quiet`` options of ``git-init``, respectively. If the target
    directory does not yet exist, it will be created, including any missing
    parent directories.

    A ``Repo`` instance is returned when ``bare`` is enabled, and a
    ``Worktree`` instance otherwise.
    """
    if not directory.exists():
        directory.mkdir(parents=True)

    # only options that were actually requested end up in the command line;
    # the target directory always comes last
    init_args = [
        'init',
        *(['--bare'] if bare else []),
        *(['--initial-branch', branch] if branch else []),
        *(['--quiet'] if quiet else []),
        str(directory),
    ]
    call_git(init_args)

    if bare:
        return Repo(directory)
    return Worktree(directory)
[docs]
def create_annexrepo(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    description: str | None = None,
    annex_init: AnnexInitMode = AnnexInitMode.FULL,
    private: bool = False,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create an annex-repository in a new/empty directory

    This function wraps :func:`create_repo` and relays its return value.
    Consequently, it passes the parameters ``directory``, ``bare``,
    ``branch``, and ``quiet`` unmodified to :func:`create_repo`.

    Unless ``annex_init`` is :data:`AnnexInitMode.OFF`, the repository annex
    is initialized afterwards. Any other mode leads to the same
    ``git annex init`` call here, because a newly created repository has no
    remotes or special remotes that could be enabled. With ``private``
    enabled, ``annex.private`` is set to ``true`` in the Git configuration
    before the annex is initialized (see `private annexes
    <https://git-annex.branchable.com/tips/cloning_a_repository_privately>`_).
    A ``description`` can be given to label the local repository annex.
    """
    repo = create_repo(directory, bare=bare, branch=branch, quiet=quiet)
    # create_repo() returned, hence the directory exists and is usable
    if annex_init != AnnexInitMode.OFF:
        _init_annex(directory, private=private, description=description)
    return repo
[docs]
def create_dataset(
    directory: Path,
    *,
    bare: bool = False,
    branch: str | None = None,
    # full datalad dataset init by default
    dataset_init: DatasetInitMode | None = DatasetInitMode.ID
    | DatasetInitMode.PROTECTGIT
    | DatasetInitMode.BACKEND,
    description: str | None = None,
    message: str | None = None,
    annex_init: AnnexInitMode = AnnexInitMode.FULL,
    private: bool = False,
    quiet: bool = False,
) -> Repo | Worktree:
    """Create a DataLad dataset in a new/empty directory

    This function is highly similar to :func:`create_annexrepo`, but
    additionally initializes the default or given ``branch`` with
    the basic components that define a DataLad dataset, and commits
    this setup with a customizable ``message`` (``initialized dataset``
    when none is given).

    Different flavors of dataset initialization are supported, and can be
    individually combined: :data:`DatasetInitMode.ID` sets a unique dataset
    identifier; :data:`DatasetInitMode.PROTECTGIT` configures
    ``gitattributes`` to prevent annexing any `.git*` file or directory;
    :data:`DatasetInitMode.BACKEND` sets a fixed annex key backend for all
    annex keys (either indicated by the ``annex.backend`` Git configuration
    setting or ``MD5E``; see `backends
    <https://git-annex.branchable.com/backends>`_). The desired selection of
    initialization modes can be OR'ed and given as ``dataset_init``
    parameter. By default, all initialization aspects are performed. With
    ``dataset_init=None`` the dataset initialization is skipped.

    The parameters ``description``, ``annex_init``, and ``private`` control
    the annex initialization in the same way as for
    :func:`create_annexrepo`.
    """
    repo = create_repo(directory, bare=bare, branch=branch, quiet=quiet)
    # create_repo() returned, hence the directory exists and is usable

    # the dataset setup is committed BEFORE any annex init, because the
    # latter can add the complication of adjusted modes
    if dataset_init is not None:
        _init_dataset(
            repo,
            branch=branch,
            message=message,
            mode=dataset_init,
        )

    if annex_init != AnnexInitMode.OFF:
        _init_annex(directory, private=private, description=description)
    return repo
def _init_annex(
    directory: Path,
    *,
    private: bool = False,
    description: str | None = None,
):
    """Run ``git annex init`` for the repository at ``directory``

    With ``private`` enabled, ``annex.private`` is set to ``true`` via
    ``git config`` before the annex is initialized. A non-empty
    ``description`` is passed on to ``git annex init`` as its argument.
    """
    # no other implementation is reused here (e.g., from clone()), because
    # this case is known to be simpler: a new repo without remotes or
    # special remotes, with nothing to enable, and no prompting
    if private:
        call_git(['config', '--add', 'annex.private', 'true'], cwd=directory)

    annex_args = ['annex', 'init', '--quiet']
    annex_args += [description] if description else []
    call_git(annex_args, cwd=directory)
def _init_dataset(
    created: Repo | Worktree,
    *,
    branch: str | None,
    message: str | None,
    mode: DatasetInitMode,
):
    """Commit the dataset components selected by ``mode`` to ``created``

    Depending on the flags set in ``mode``, the following files are
    assembled and committed in a single changeset via ``apply_changeset``:

    - :data:`DatasetInitMode.ID`: ``.datalad/config`` with a newly generated
      dataset ID, plus ``.datalad/.gitattributes`` with a rule that keeps
      that config file out of the annex.
    - :data:`DatasetInitMode.BACKEND`: a ``* annex.backend=<backend>`` rule
      in ``.gitattributes``, where the backend is taken from the
      ``annex.backend`` Git configuration, with ``MD5E`` as the default.
    - :data:`DatasetInitMode.PROTECTGIT`: a
      ``**/.git* annex.largefiles=nothing`` rule in ``.gitattributes``.

    The commit is made with ``message`` (``initialized dataset`` when
    ``message`` is ``None`` or empty) and ``branch`` is passed on to
    ``apply_changeset``. If ``mode`` selects nothing (an empty flag), no
    commit is made.
    """
    content: dict[PurePosixPath | str, str] = {}
    if DatasetInitMode.ID in mode:
        # dataset id is a random UUID4, but when we have an explicit
        # SEED set, use "random" bits that are made deterministic
        # via setting this seed in `datalad_core.__init__`
        ds_uuid = (
            uuid.UUID(int=random.getrandbits(128))
            if 'DATALAD_SEED' in environ
            else uuid.uuid4()
        )
        # the complete content of the config file
        content['.datalad/config'] = f'[datalad "dataset"]\n\tid = {ds_uuid}\n'
        # NOTE(review): grouped with the ID setup, because this rule only
        # concerns the `.datalad/config` file written above -- confirm
        content['.datalad/.gitattributes'] = 'config annex.largefiles=nothing\n'

    global_attributes = ''
    if DatasetInitMode.BACKEND in mode:
        # honor a configured `annex.backend`, fall back on MD5E otherwise
        backend = call_git_oneline(
            ['config', '--default', 'MD5E', 'annex.backend'],
            cwd=created.path,
        )
        global_attributes += f'* annex.backend={backend}\n'
    if DatasetInitMode.PROTECTGIT in mode:
        global_attributes += '**/.git* annex.largefiles=nothing\n'
    if global_attributes:
        content['.gitattributes'] = global_attributes

    if not content:
        # callers only guard against `None`, but an empty flag (e.g.
        # `DatasetInitMode(0)`, or the result of masking flags off) also
        # selects nothing. There is nothing to commit in this case.
        return

    # the rest is about committing the changes made above
    apply_changeset(
        created,
        content,
        message=message or 'initialized dataset',
        branch=branch,
    )