Source code for datalad.distribution.dataset

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Implements class Dataset
"""

import inspect
import logging
from functools import wraps
from os.path import (
    curdir,
    exists,
)
from os.path import join as opj
from os.path import (
    normpath,
    pardir,
)
from weakref import WeakValueDictionary

import datalad.utils as ut
from datalad import cfg
from datalad.config import ConfigManager
from datalad.core.local.repo import repo_from_path
from datalad.dataset.repo import (
    PathBasedFlyweight,
    path_based_str_repr,
)
from datalad.support import path as op
from datalad.support.annexrepo import AnnexRepo
from datalad.support.constraints import Constraint
# DueCredit
from datalad.support.due import due
from datalad.support.due_utils import duecredit_dataset
from datalad.support.exceptions import NoDatasetFound
from datalad.support.gitrepo import GitRepo
from datalad.utils import \
    get_dataset_root  # TODO remove after a while, when external consumers have adjusted; to use get_dataset_root()
from datalad.utils import (
    Path,
    PurePath,
    ensure_list,
    get_sig_param_names,
    getpwd,
    optional_args,
)

# Module-level logger used by everything defined in this file.
lgr = logging.getLogger('datalad.dataset')
# Level 5 is below logging.DEBUG (10): a trace message that is only visible
# at very verbose log levels.
lgr.log(5, "Importing dataset")



[docs]
@path_based_str_repr
class Dataset(object, metaclass=PathBasedFlyweight):
    """Representation of a DataLad dataset/repository

    This is the core data type of DataLad: a representation of a dataset.
    At its core, datasets are (git-annex enabled) Git repositories. This
    class provides all operations that can be performed on a dataset.

    Creating a dataset instance is cheap, all actual operations are
    delayed until they are actually needed. Creating multiple `Dataset`
    class instances for the same Dataset location will automatically
    yield references to the same object.

    A dataset instance comprises two major components: a `repo`
    attribute, and a `config` attribute. The former offers access to
    low-level functionality of the Git or git-annex repository. The
    latter gives access to a dataset's configuration manager.

    Most functionality is available via methods of this class, but also
    as stand-alone functions with the same name in `datalad.api`.
    """
    # Begin Flyweight
    _unique_instances = WeakValueDictionary()

    @classmethod
    def _flyweight_preproc_path(cls, path):
        """Expand the special dataset abbreviations into actual paths

        Recognized abbreviations are:

        - ``^``: the topmost superdataset of the dataset containing the
          current directory
        - ``^.``: the dataset containing the current directory
        - ``///``: the location configured as
          ``datalad.locations.default-dataset``

        Any other value is returned unchanged.

        Raises
        ------
        NoDatasetFound
          If ``^`` or ``^.`` is given, but no dataset contains the current
          directory.
        """
        resolved = path
        if path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            resolved = cfg.obtain('datalad.locations.default-dataset')
        elif path in ('^', '^.'):
            root = get_dataset_root(curdir)
            if root is None:
                raise NoDatasetFound('No dataset contains path: {}'.format(
                    str(Path.cwd())))
            if path == '^.':
                # the dataset containing current directory
                resolved = root
            else:
                # get the topmost dataset from current location. Note that 'zsh'
                # might have its ideas on what to do with ^, so better use as -d^
                topmost_ds = Dataset(root).get_superdataset(topmost=True)
                resolved = topmost_ds.path
        if path != resolved:
            lgr.debug("Resolved dataset alias %r to path %r", path, resolved)
        return resolved

    @classmethod
    def _flyweight_postproc_path(cls, path):
        """Turn `path` into an absolute, normalized path

        Relative paths are anchored at ``op.getpwd()``. Symlinks are not
        resolved.
        """
        # we want an absolute path, but no resolved symlinks
        absolute = path if op.isabs(path) else op.join(op.getpwd(), path)
        # use canonical paths only:
        return op.normpath(absolute)

    def _flyweight_invalid(self):
        """Report whether this Flyweight instance has become invalid

        A Dataset doesn't need to be invalidated during its lifetime at all.
        Instead, the underlying *Repo instances are. A Dataset itself can
        represent a path that does not exist yet.

        Returns
        -------
        bool
          Always False.
        """
        return False
    # End Flyweight

    def __hash__(self):
        """Hash built from the class name and the flyweight key"""
        # the flyweight key is already determining unique instances
        # add the class name to distinguish from strings of a path
        return hash((self.__class__.__name__, self.__weakref__.key))


[docs]
    def __init__(self, path):
        """
        Parameters
        ----------
        path : str or Path
          Path to the dataset location. This location may or may not exist
          yet.
        """
        # a concrete Path instance is kept as is; for anything else the
        # `pathobj` property creates one on first access
        if isinstance(path, ut.Path):
            self._pathobj = path
        else:
            self._pathobj = None
        # `_path` always holds the plain string form of a path object
        self._path = str(path) if isinstance(path, ut.PurePath) else path
        # everything below is populated lazily
        self._repo = None
        self._cfg = None
        self._cfg_bound = None
        self._id = None


    @property
    def pathobj(self):
        """Path object for the dataset location (created on first access)"""
        # XXX this relies on the assumption that self._path as managed
        # by the base class is always a native path
        cached = self._pathobj
        if not cached:
            cached = self._pathobj = ut.Path(self._path)
        return cached

    def __eq__(self, other):
        """Compare two objects by the location their `pathobj` points to

        Objects without a `pathobj` attribute never compare equal. If both
        locations exist, they are compared on the filesystem; if neither
        exists, the comparison is purely lexical.
        """
        if not hasattr(other, 'pathobj'):
            return False
        # Ben: https://github.com/datalad/datalad/pull/4057#discussion_r370153586
        # It's pointing to the same thing, while not being the same object
        # (in opposition to the *Repo classes). So `ds1 == ds2`,
        # `but ds1 is not ds2.` I thought that's a useful distinction. On the
        # other hand, I don't think we use it anywhere outside tests yet.
        mine = self.pathobj
        theirs = other.pathobj
        present = mine.exists()
        if present != theirs.exists():
            # no chance this could be the same
            return False
        if present:
            # check on filesystem
            return mine.samefile(theirs)
        # we can only do lexical comparison.
        # this will fail to compare a long and a shortpath.
        # on windows that could actually point to the same thing
        # if it would exists, but this is how far we go with this.
        return mine == theirs

    def __getattr__(self, attr):
        """Fallback attribute lookup that first loads `datalad.api`

        Python calls `__getattr__` only after regular attribute lookup has
        failed. For names without a leading underscore, `datalad.api` is
        imported before the lookup is repeated, so that methods bound at
        import time (see the comments below) can be found. If the attribute
        is still missing, the repeated lookup raises `AttributeError`.
        """
        # Assure that we are not just missing some late binding @datasetmethod .
        if not attr.startswith('_'):  # do not even consider those
            lgr.debug("Importing datalad.api to possibly discover possibly not yet bound method %r", attr)
            # load entire datalad.api which will also bind datasetmethods
            # from extensions.
            import datalad.api

            # which would bind all known interfaces as well.
            # Although adds overhead, good for UX
        return super(Dataset, self).__getattribute__(attr)

    def close(self):
        """Perform operations which would close any possible process using this Dataset

        The reference to the repository instance held by this dataset is
        dropped.
        """
        repo, self._repo = self._repo, None
        if repo:
            # might take care about lingering batched processes etc
            del repo

    @property
    def path(self):
        """Path to the dataset, as stored by `__init__` (read-only)"""
        return self._path

    @property
    def repo(self):
        """Get an instance of the version control system/repo for this dataset,
        or None if there is none yet (or none anymore).

        If testing the validity of an instance of GitRepo is guaranteed to be
        really cheap this could also serve as a test whether a repo is present.

        Note, that this property is evaluated every time it is used. If used
        multiple times within a function it's probably a good idea to store its
        value in a local variable and use this variable instead.

        Returns
        -------
        GitRepo or AnnexRepo or None
          None is returned when `repo_from_path()` raises `ValueError` for
          the dataset path.
        """

        # If we already got a *Repo instance, check whether it's still valid;
        # Note, that this basically does part of the testing that would
        # (implicitly) be done in the loop below again. So, there's still
        # potential to speed up when we actually need to get a new instance
        # (or none). But it's still faster for the vast majority of cases.
        #
        # TODO: Dig deeper into it and melt with new instance guessing. This
        # should also involve to reduce redundancy of testing such things from
        # within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__!
        #
        # Also note, that this could be forged into a single big condition, but
        # that is hard to read and we should be well aware of the actual
        # criteria here:
        if self._repo is not None and self.pathobj.resolve() == self._repo.pathobj:
            # we got a repo and path references still match
            if isinstance(self._repo, AnnexRepo):
                # it's supposed to be an annex
                # Here we do the same validation that Flyweight would do beforehand if there was a call to AnnexRepo()
                if self._repo is AnnexRepo._unique_instances.get(
                        self._repo.path, None) and not self._repo._flyweight_invalid():
                    # it's still the object registered as flyweight and it's a
                    # valid annex repo
                    return self._repo
            elif isinstance(self._repo, GitRepo):
                # it's supposed to be a plain git
                # same kind of checks as for AnnexRepo above, but additionally check whether it was changed to have an
                # annex now.
                # TODO: Instead of is_with_annex, we might want the cheaper check for an actually initialized annex.
                #       However, that's not completely clear. On the one hand, if it really changed to be an annex
                #       it seems likely that this happened locally and it would also be an initialized annex. On the
                #       other hand, we could have added (and fetched) a remote with an annex, which would turn it into
                #       our current notion of an uninitialized annex. Question is whether or not such a change really
                #       need to be detected. For now stay on the safe side and detect it.
                if self._repo is GitRepo._unique_instances.get(
                        self._repo.path, None) and not self._repo._flyweight_invalid() and not \
                        self._repo.is_with_annex():
                    # it's still the object registered as flyweight, it's a
                    # valid git repo and it hasn't turned into an annex
                    return self._repo

        # Note: self._repo is deliberately NOT reset to None before a new
        # instance is requested below.
        # The *Repo instances are flyweights, not singletons. self._repo might
        # be the last reference, which would lead to those objects being
        # destroyed and therefore the constructor call would result in an
        # actually new instance. This is unnecessarily costly.
        # NOTE(review): the original remark contrasted this with a variable
        # "valid", which does not exist in this method -- presumably a
        # leftover from an earlier implementation.
        try:
            self._repo = repo_from_path(self._path)
        except ValueError:
            lgr.log(5, "Failed to detect a valid repo at %s", self.path)
            self._repo = None
            return

        if due.active:
            # TODO: Figure out, when exactly this is needed. Don't think it
            #       makes sense to do this for every dataset,
            #       no matter what => we want .repo to be as cheap as it gets.
            # Makes sense only on installed dataset - @never_fail'ed
            duecredit_dataset(self)

        return self._repo

    @property
    def id(self):
        """Identifier of the dataset.

        This identifier is supposed to be unique across datasets, but identical
        for different versions of the same dataset (that have all been derived
        from the same original dataset repository).

        Note, that a plain git/git-annex repository doesn't necessarily have
        a dataset id yet. It is created by `Dataset.create()` and stored in
        .datalad/config. If None is returned while there is a valid repository,
        there may have never been a call to `create` in this branch before
        current commit.

        Note, that this property is evaluated every time it is used. If used
        multiple times within a function it's probably a good idea to store its
        value in a local variable and use this variable instead.

        Returns
        -------
        str or None
          The value of the ``datalad.dataset.id`` configuration item, or
          `None` if it is not set.
        """

        # looked up via the `config` property on every access (no caching here)
        return self.config.get('datalad.dataset.id', None)

    @property
    def config(self):
        """Get a ``ConfigManager`` instance for a dataset's configuration

        As long as a dataset has no corresponding repository (not yet, or
        not anymore), the global instance that is also available as
        ``datalad.cfg`` is returned. Otherwise the configuration manager of
        the repository is returned.

        Note, that this property is evaluated every time it is used. If used
        multiple times within a function it's probably a good idea to store its
        value in a local variable and use this variable instead.

        Returns
        -------
        ConfigManager
        """
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        repo = self.repo
        if repo is not None:
            # a repository exists: use its configuration manager
            self._cfg = repo.config
            self._cfg_bound = True
        elif self._cfg_bound in (True, None):
            # if there's no repo (yet or anymore), we can't read/write config at
            # dataset level, but only at user/system level
            # However, if this was the case before as well, we don't want a new
            # instance of ConfigManager, but use the global one.
            # For the sake of uniformity assign datalad.cfg to self._cfg
            self._cfg = cfg
            self._cfg_bound = False

        return self._cfg

    def recall_state(self, whereto):
        """Something that can be used to checkout a particular state
        (tag, commit) to "undo" a change or switch to a otherwise desired
        previous state.

        Parameters
        ----------
        whereto: str
          Passed as is to the `checkout()` method of the dataset's
          repository.

        Raises
        ------
        RuntimeError
          If the dataset is not installed.
        """
        if not self.is_installed():
            raise RuntimeError(
                "cannot remember a state when a dataset is not yet installed")
        self.repo.checkout(whereto)

    def is_installed(self):
        """Returns whether a dataset is installed.

        A dataset counts as installed when its path exists on the filesystem
        and a repository could be obtained for it.

        Returns
        -------
        bool
        """
        location = self.path
        if location is None or not exists(location):
            # nothing on the filesystem -> no need to look for a repo
            return False
        return self.repo is not None

    def get_superdataset(self, datalad_only=False, topmost=False,
                         registered_only=True):
        """Get the dataset's superdataset

        Parameters
        ----------
        datalad_only : bool, optional
          Whether to consider only "datalad datasets" (with non-None
          id), or (if False, which is default) - any git repository
        topmost : bool, optional
          Return the topmost super-dataset. Might then be the current one.
        registered_only : bool, optional
          Test whether any discovered superdataset actually contains the
          dataset in question as a registered subdataset (as opposed to
          just being located in a subdirectory without a formal relationship).

        Returns
        -------
        Dataset or None
        """
        path = self.path
        # with `topmost` the dataset itself is the fallback answer
        sds_path = path if topmost else None

        def res_filter(res):
            return res.get('status') == 'ok' and res.get('type') == 'dataset'

        def subds_contains_path(ds, path):
            # query the dataset that was handed in, instead of silently
            # relying on a variable of the enclosing scope
            return path in ds.subdatasets(recursive=False,
                                          contains=path,
                                          result_filter=res_filter,
                                          on_failure='ignore',
                                          result_xfm='paths',
                                          result_renderer='disabled')

        while path:
            # normalize the path after adding .. so we guaranteed to not
            # follow into original directory if path itself is a symlink
            par_path = normpath(opj(path, pardir))
            sds_path_ = get_dataset_root(par_path)
            if sds_path_ is None:
                # no more parents, use previous found
                break

            sds = Dataset(sds_path_)
            if datalad_only and not sds.id:
                # current git repository is not actually a dataset
                break
            if registered_only and not subds_contains_path(sds, path):
                # candidate does not know `path` as a subdataset
                break

            # That was a good candidate
            sds_path = sds_path_
            path = par_path
            if not topmost:
                # no looping
                break

        if sds_path is None:
            # None was found
            return None

        # No postprocessing now should be necessary since get_toppath
        # tries its best to not resolve symlinks now

        return Dataset(sds_path)



@optional_args
def datasetmethod(f, name=None, dataset_argname='dataset'):
    """Decorator to bind functions to Dataset class.

    The decorated function is still directly callable and additionally serves
    as method `name` of class Dataset.  To achieve this, the first positional
    argument is redirected to original keyword argument 'dataset_argname'. All
    other arguments stay in order (and keep their names, of course). That
    means, that the signature of the bound function is name(self, a, b) if the
    original signature is name(a, dataset, b) for example.

    The decorated function itself is returned; the only change to it is a
    `_dataset_method` attribute that references the wrapper bound to Dataset.

    Parameters
    ----------
    f : callable
      Function to bind.
    name : str, optional
      Name of the method on Dataset. Defaults to ``f.__name__``.
    dataset_argname : str, optional
      Name of the argument of `f` that receives the Dataset instance.

    Raises
    ------
    RuntimeError
      If `f` already has a non-empty `_dataset_method` attribute.
    """

    if not name:
        name = f.__name__

    @wraps(f)
    def apply_func(instance, *args, **kwargs):
        # Wrapper function to assign arguments of the bound function to
        # original function.
        #
        # Note
        # ----
        # This wrapper is NOT returned by the decorator, but only used to bind
        # the function `f` to the Dataset class.
        kwargs = kwargs.copy()

        # due to use of functools.wraps and inability of of getarspec to get
        # those, we use .signature.
        # More information in de-wrapt PR https://github.com/datalad/datalad/pull/6190
        from datalad.utils import get_sig_param_names
        f_args, f_kwonlyargs = get_sig_param_names(f, ('pos_any', 'kw_only'))

        # If bound function is used with wrong signature (especially by
        # explicitly passing a dataset), let's raise a proper exception instead
        # of a 'list index out of range', that is not very telling to the user.
        # The dataset argument only occupies one of the positional slots when
        # it is NOT keyword-only. The mere presence of other keyword-only
        # arguments must not raise the limit, otherwise an excess argument
        # would slip through and fail with an IndexError further down.
        dataset_is_kwonly = dataset_argname in f_kwonlyargs
        max_args = len(f_args) if dataset_is_kwonly else len(f_args) - 1
        if len(args) > max_args:
            non_dataset_args = ["self"] + [a for a in f_args if a != dataset_argname]
            raise TypeError(
                f"{name}() takes at most {len(f_args)} arguments ({len(args)} given): "
                f"{non_dataset_args}")
        if dataset_argname in kwargs:
            raise TypeError(
                f"{name}() got an unexpected keyword argument {dataset_argname}")
        kwargs[dataset_argname] = instance
        if not dataset_is_kwonly:
            # so it is "old" style, where it is a regular kwargs - we pass everything
            # via kwargs
            # TODO: issue a DX oriented warning that we advise to separate out kwargs,
            # dataset included, with * from positional args?
            ds_index = f_args.index(dataset_argname)
            for i, value in enumerate(args):
                # positional values at or after the dataset slot belong to
                # the parameter one position further to the right
                target = f_args[i] if i < ds_index else f_args[i + 1]
                kwargs[target] = value
            args = []
        # otherwise * was used to enforce kwargs, so we just pass things as is
        return f(*args, **kwargs)

    setattr(Dataset, name, apply_func)
    # set the ad-hoc attribute so that @build_doc could also bind built doc
    # to the dataset method
    if getattr(f, '_dataset_method', None):
        raise RuntimeError(f"_dataset_method of {f} is already set to {f._dataset_method}")
    setattr(f, '_dataset_method', apply_func)
    return f


# Note: Cannot be defined within constraints.py, since then dataset.py needs to
# be imported from constraints.py, which needs to be imported from dataset.py
# for another constraint
class EnsureDataset(Constraint):
    """Despite its name, this constraint does not actually ensure that the
    argument is a valid dataset, because for procedural reasons this would
    typically duplicate subsequent checks and processing. However, it can
    be used to achieve uniform documentation of `dataset` arguments."""

    def __call__(self, value):
        """Return `value` unchanged if it is a Dataset, str, or PurePath

        Raises
        ------
        ValueError
          For a value of any other type.
        """
        if not isinstance(value, (Dataset, str, PurePath)):
            raise ValueError("Can't create Dataset from %s." % type(value))
        # a str/path is deliberately NOT converted to a Dataset right here
        # - duplicates require_dataset() later on
        # - we need to be able to distinguish between a bound
        #   dataset method call and a standalone call for
        #   relative path argument disambiguation
        return value

    def short_description(self):
        """Short label used when documenting this constraint"""
        return "Dataset"

    def long_description(self):
        """Longer explanation used when documenting this constraint"""
        return """Value must be a Dataset or a valid identifier of a Dataset
        (e.g. a path)"""


def require_dataset(dataset, check_installed=True, purpose=None):
    """Helper function to resolve a dataset.

    This function tries to resolve a dataset given an input argument,
    or based on the process' working directory, if `None` is given.

    Parameters
    ----------
    dataset : None or path or Dataset
      Some value identifying a dataset or `None`. In the latter case
      a dataset will be searched based on the process working directory.
    check_installed : bool, optional
      If True, an optional check whether the resolved dataset is
      properly installed will be performed.
    purpose : str, optional
      This string will be inserted in error messages to make them more
      informative. The pattern is "... dataset for <STRING>".

    Returns
    -------
    Dataset
      If a dataset could be determined.

    Raises
    ------
    NoDatasetFound
      If no dataset could be determined, or if `check_installed` is set and
      the determined dataset is not installed.
    """
    if dataset is None:
        # possible scenario of cmdline calls: look for a dataset that
        # contains the current working directory
        dspath = get_dataset_root(getpwd())
        if not dspath:
            purpose_note = \
                " for the purpose {!r}".format(purpose) if purpose else ''
            raise NoDatasetFound(
                "No dataset found at '{}'{}.  Specify a dataset to work with "
                "by providing its path via the `dataset` option, "
                "or change the current working directory to be in a "
                "dataset.".format(getpwd(), purpose_note)
            )
        dataset = Dataset(dspath)
    elif not isinstance(dataset, Dataset):
        # some kind of path specification was given
        dataset = Dataset(dataset)

    assert dataset is not None
    purpose_suffix = u' to {}'.format(purpose) if purpose else ''
    lgr.debug(u"Resolved dataset%s: %s", purpose_suffix, dataset.path)

    if check_installed and not dataset.is_installed():
        raise NoDatasetFound(
            f"No installed dataset found at {dataset.path}")

    return dataset


# New helpers, courtesy of datalad-revolution.


# note: not thread safe if threads chdir - uses getpwd
def resolve_path(path, ds=None, ds_resolved=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path
      specifications can be given as a list
    ds : Dataset or PathLike or None
      Dataset instance to resolve relative paths against.
    ds_resolved : Dataset or None
      A dataset instance that was created from `ds` outside can be provided
      to avoid multiple instantiation on repeated calls.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = ds_resolved or require_dataset(
            ds, check_installed=False, purpose='path resolution')

    resolved = []
    pwd_parts = None  # get it upon first use but only once
    for spec in ensure_list(path):
        # Only with a given dataset instance are relative paths resolved
        # against the dataset. With no dataset at all, or no instance
        # provided, CWD is always the reference, and path-conversion and
        # absolutification are done next.
        if got_ds_instance and not Path(spec).is_absolute():
            spec = ds.pathobj / spec

        candidate = ut.Path(spec)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not candidate.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            if not pwd_parts:
                pwd_parts = ut.Path(getpwd()).parts
            parts = candidate.parts
            n_up = 0       # number of leading '..' components
            n_leading = 0  # number of leading '..' and '.' components
            for component in parts:
                if component == op.pardir:
                    n_up += 1
                elif component != op.curdir:
                    break
                # a '.' is discarded too, but without stripping a
                # corresponding parent
                n_leading += 1
            base_parts = pwd_parts[:-n_up] if n_up else pwd_parts
            candidate = ut.Path(op.join(*(base_parts + parts[n_leading:])))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        resolved.append(candidate)

    if isinstance(path, (str, PurePath)):
        return resolved[0]
    return resolved

# Alias of resolve_path() under its former name.
# TODO keep this around for a while so that extensions can be updated
rev_resolve_path = resolve_path


def path_under_rev_dataset(ds, path):
    """Express `path` as a path underneath the location of dataset `ds`

    Parameters
    ----------
    ds : Dataset
      Dataset whose `pathobj` serves as the reference location.
    path : str or PathLike
      Path to test.

    Returns
    -------
    Path-like or None
      `path` itself (unchanged) if it is lexically located underneath the
      dataset path. Otherwise the dataset roots at and above `path` are
      inspected: if one of them is the same file as the dataset path,
      `path` relative to that root is returned, appended to the dataset
      path. If there is no such root, `None` is returned.
    """
    ds_path = ds.pathobj
    try:
        rel_parts = ut.Path(path).relative_to(ds_path).parts
        # look at the first path component, not at a string prefix: an entry
        # like '..foo' is a regular name and not a reference to a parent
        if not rel_parts or rel_parts[0] != op.pardir:
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play safe: deliberately best-effort,
        # fall through to the filesystem-based test below
        pass

    root = get_dataset_root(str(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump in the the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(str(path), root)


# Counterpart of the "Importing dataset" trace message at the top of the module.
lgr.log(5, "Done importing dataset")