# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Implements class Dataset
"""
import inspect
import logging
from functools import wraps
from os.path import (
curdir,
exists,
)
from os.path import join as opj
from os.path import (
normpath,
pardir,
)
from weakref import WeakValueDictionary
import datalad.utils as ut
from datalad import cfg
from datalad.config import ConfigManager
from datalad.core.local.repo import repo_from_path
from datalad.dataset.repo import (
PathBasedFlyweight,
path_based_str_repr,
)
from datalad.support import path as op
from datalad.support.annexrepo import AnnexRepo
from datalad.support.constraints import Constraint
# DueCredit
from datalad.support.due import due
from datalad.support.due_utils import duecredit_dataset
from datalad.support.exceptions import NoDatasetFound
from datalad.support.gitrepo import GitRepo
from datalad.utils import \
    get_dataset_root  # TODO remove after a while, when external consumers have adjusted to use get_dataset_root() from datalad.utils
from datalad.utils import (
Path,
PurePath,
ensure_list,
get_sig_param_names,
getpwd,
optional_args,
)
lgr = logging.getLogger('datalad.dataset')
lgr.log(5, "Importing dataset")
@path_based_str_repr
class Dataset(object, metaclass=PathBasedFlyweight):
"""Representation of a DataLad dataset/repository

    This is the core data type of DataLad: a representation of a dataset.
    At its core, datasets are (git-annex enabled) Git repositories. This
    class provides all operations that can be performed on a dataset.

    Creating a dataset instance is cheap, all actual operations are
    delayed until they are actually needed. Creating multiple `Dataset`
    class instances for the same Dataset location will automatically
    yield references to the same object.

    A dataset instance comprises two major components: a `repo`
    attribute, and a `config` attribute. The former offers access to
    low-level functionality of the Git or git-annex repository. The
    latter gives access to a dataset's configuration manager.

    Most functionality is available via methods of this class, but also
    as stand-alone functions with the same name in `datalad.api`.
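
    Examples
    --------
    A minimal usage sketch (hypothetical POSIX path; the location does not
    need to exist, since instantiation is cheap and lazy)::

        ds = Dataset('/tmp/some-dataset')
        ds.path                             # -> '/tmp/some-dataset'
        ds is Dataset('/tmp/some-dataset')  # -> True, same flyweight instance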
"""
# Begin Flyweight
_unique_instances = WeakValueDictionary()
@classmethod
def _flyweight_preproc_path(cls, path):
"""Custom handling for few special abbreviations for datasets"""
path_ = path
if path in ('^', '^.'):
dsroot = get_dataset_root(curdir)
if dsroot is None:
raise NoDatasetFound('No dataset contains path: {}'.format(
str(Path.cwd())))
if path == '^':
# get the topmost dataset from current location. Note that 'zsh'
# might have its ideas on what to do with ^, so better use as -d^
path_ = Dataset(dsroot).get_superdataset(
topmost=True).path
elif path == '^.':
# the dataset containing current directory
path_ = dsroot
elif path == '///':
# TODO: logic/UI on installing a default dataset could move here
# from search?
path_ = cfg.obtain('datalad.locations.default-dataset')
if path != path_:
lgr.debug("Resolved dataset alias %r to path %r", path, path_)
return path_
@classmethod
def _flyweight_postproc_path(cls, path):
# we want an absolute path, but no resolved symlinks
if not op.isabs(path):
path = op.join(op.getpwd(), path)
# use canonical paths only:
return op.normpath(path)
def _flyweight_invalid(self):
"""Invalidation of Flyweight instance
Dataset doesn't need to be invalidated during its lifetime at all. Instead the underlying *Repo instances are.
Dataset itself can represent a not yet existing path.
"""
return False
# End Flyweight
def __hash__(self):
# the flyweight key is already determining unique instances
# add the class name to distinguish from strings of a path
return hash((self.__class__.__name__, self.__weakref__.key))
def __init__(self, path):
"""
Parameters
----------
path : str or Path
Path to the dataset location. This location may or may not exist
yet.
"""
self._pathobj = path if isinstance(path, ut.Path) else None
if isinstance(path, ut.PurePath):
path = str(path)
self._path = path
self._repo = None
self._id = None
self._cfg = None
self._cfg_bound = None
@property
def pathobj(self):
"""pathobj for the dataset"""
# XXX this relies on the assumption that self._path as managed
# by the base class is always a native path
if not self._pathobj:
self._pathobj = ut.Path(self._path)
return self._pathobj
def __eq__(self, other):
if not hasattr(other, 'pathobj'):
return False
# Ben: https://github.com/datalad/datalad/pull/4057#discussion_r370153586
# It's pointing to the same thing, while not being the same object
        # (in opposition to the *Repo classes). So `ds1 == ds2`,
        # but `ds1 is not ds2`. I thought that's a useful distinction. On the
# other hand, I don't think we use it anywhere outside tests yet.
me_exists = self.pathobj.exists()
other_exists = other.pathobj.exists()
if me_exists != other_exists:
# no chance this could be the same
return False
elif me_exists:
# check on filesystem
return self.pathobj.samefile(other.pathobj)
else:
# we can only do lexical comparison.
            # this will fail to compare a long and a short path,
            # which on Windows could actually point to the same thing
            # if it existed, but this is how far we go with this.
return self.pathobj == other.pathobj
def __getattr__(self, attr):
        # Ensure that we are not just missing some late-binding @datasetmethod.
        if not attr.startswith('_'):  # do not even consider those
            lgr.debug("Importing datalad.api to possibly discover a not yet "
                      "bound method %r", attr)
# load entire datalad.api which will also bind datasetmethods
# from extensions.
import datalad.api
# which would bind all known interfaces as well.
# Although adds overhead, good for UX
return super(Dataset, self).__getattribute__(attr)
def close(self):
"""Perform operations which would close any possible process using this Dataset
"""
repo = self._repo
self._repo = None
if repo:
            # might take care of lingering batched processes etc.
del repo
@property
def path(self):
"""path to the dataset"""
return self._path
@property
def repo(self):
"""Get an instance of the version control system/repo for this dataset,
or None if there is none yet (or none anymore).

        If testing the validity of an instance of GitRepo is guaranteed to be
        really cheap, this could also serve as a test whether a repo is present.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it's probably a good idea to store its
        value in a local variable and use this variable instead.

Returns
-------
GitRepo or AnnexRepo
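
        Examples
        --------
        A sketch of the intended usage pattern (hypothetical path); store the
        property value once and reuse it, since it is re-evaluated on every
        access::

            ds = Dataset('/tmp/some-dataset')
            repo = ds.repo
            if repo is None:
                print('no repository on disk (yet)')
            else:
                print(type(repo).__name__)  # 'GitRepo' or 'AnnexRepo'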
"""
# If we already got a *Repo instance, check whether it's still valid;
# Note, that this basically does part of the testing that would
# (implicitly) be done in the loop below again. So, there's still
# potential to speed up when we actually need to get a new instance
# (or none). But it's still faster for the vast majority of cases.
#
# TODO: Dig deeper into it and melt with new instance guessing. This
        # should also involve reducing the redundancy of testing such things from
# within Flyweight.__call__, AnnexRepo.__init__ and GitRepo.__init__!
#
# Also note, that this could be forged into a single big condition, but
# that is hard to read and we should be well aware of the actual
# criteria here:
if self._repo is not None and self.pathobj.resolve() == self._repo.pathobj:
# we got a repo and path references still match
if isinstance(self._repo, AnnexRepo):
# it's supposed to be an annex
                # Here we do the same validation that Flyweight would do
                # beforehand if there was a call to AnnexRepo()
if self._repo is AnnexRepo._unique_instances.get(
self._repo.path, None) and not self._repo._flyweight_invalid():
# it's still the object registered as flyweight and it's a
# valid annex repo
return self._repo
elif isinstance(self._repo, GitRepo):
# it's supposed to be a plain git
                # same kind of checks as for AnnexRepo above, but additionally
                # check whether it was changed to have an annex now.
# TODO: Instead of is_with_annex, we might want the cheaper check for an actually initialized annex.
# However, that's not completely clear. On the one hand, if it really changed to be an annex
# it seems likely that this happened locally and it would also be an initialized annex. On the
# other hand, we could have added (and fetched) a remote with an annex, which would turn it into
# our current notion of an uninitialized annex. Question is whether or not such a change really
# need to be detected. For now stay on the safe side and detect it.
if self._repo is GitRepo._unique_instances.get(
self._repo.path, None) and not self._repo._flyweight_invalid() and not \
self._repo.is_with_annex():
# it's still the object registered as flyweight, it's a
# valid git repo and it hasn't turned into an annex
return self._repo
# Note: Although it looks like the "self._repo = None" assignments
# could be used instead of variable "valid", that's a big difference!
# The *Repo instances are flyweights, not singletons. self._repo might
# be the last reference, which would lead to those objects being
# destroyed and therefore the constructor call would result in an
# actually new instance. This is unnecessarily costly.
try:
self._repo = repo_from_path(self._path)
except ValueError:
lgr.log(5, "Failed to detect a valid repo at %s", self.path)
self._repo = None
return
if due.active:
# TODO: Figure out, when exactly this is needed. Don't think it
# makes sense to do this for every dataset,
# no matter what => we want .repo to be as cheap as it gets.
# Makes sense only on installed dataset - @never_fail'ed
duecredit_dataset(self)
return self._repo
@property
def id(self):
"""Identifier of the dataset.

        This identifier is supposed to be unique across datasets, but identical
        for different versions of the same dataset (that have all been derived
        from the same original dataset repository).

        Note that a plain git/git-annex repository doesn't necessarily have
        a dataset id yet. It is created by `Dataset.create()` and stored in
        .datalad/config. If None is returned while there is a valid repository,
        there may have never been a call to `create` in this branch before
        the current commit.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it's probably a good idea to store its
        value in a local variable and use this variable instead.

Returns
-------
str
This is either a stored UUID, or `None`.
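
        Examples
        --------
        A sketch of a typical check (hypothetical path; only a dataset
        created via `create()` will report a non-None id)::

            ds = Dataset('/tmp/some-dataset')
            if ds.id is None:
                print('not (yet) a DataLad dataset')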
"""
return self.config.get('datalad.dataset.id', None)
@property
def config(self):
"""Get a ``ConfigManager`` instance for a dataset's configuration

        In case a dataset does not (yet) have an existing corresponding
        repository, the returned ``ConfigManager`` is the global instance
        that is also provided via ``datalad.cfg``.

        Note that this property is evaluated every time it is used. If used
        multiple times within a function, it's probably a good idea to store its
        value in a local variable and use this variable instead.

Returns
-------
ConfigManager
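
        Examples
        --------
        A sketch of reading configuration (hypothetical path; without a
        repository this transparently falls back to the global manager)::

            ds = Dataset('/tmp/some-dataset')
            cfg = ds.config  # store once, reuse below
            dsid = cfg.get('datalad.dataset.id', None)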
"""
# OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
repo = self.repo
if repo is None:
# if there's no repo (yet or anymore), we can't read/write config at
# dataset level, but only at user/system level
# However, if this was the case before as well, we don't want a new
# instance of ConfigManager, but use the global one
if self._cfg_bound in (True, None):
# for the sake of uniformity assign datalad.cfg to self._cfg
self._cfg = cfg
self._cfg_bound = False
else:
self._cfg = repo.config
self._cfg_bound = True
return self._cfg
def recall_state(self, whereto):
"""Something that can be used to checkout a particular state
(tag, commit) to "undo" a change or switch to a otherwise desired
previous state.
Parameters
----------
whereto: str
"""
if not self.is_installed():
raise RuntimeError(
"cannot remember a state when a dataset is not yet installed")
self.repo.checkout(whereto)
def is_installed(self):
"""Returns whether a dataset is installed.
A dataset is installed when a repository for it exists on the filesystem.
Returns
-------
bool
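
        Examples
        --------
        A minimal sketch (hypothetical path)::

            ds = Dataset('/tmp/some-dataset')
            if not ds.is_installed():
                print('no repository at %s yet' % ds.path)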
"""
return self.path is not None and exists(self.path) and \
self.repo is not None
def get_superdataset(self, datalad_only=False, topmost=False,
registered_only=True):
"""Get the dataset's superdataset
Parameters
----------
datalad_only : bool, optional
            Whether to consider only "datalad datasets" (with a non-None
            id), or (if False, which is the default) any git repository
topmost : bool, optional
Return the topmost super-dataset. Might then be the current one.
registered_only : bool, optional
Test whether any discovered superdataset actually contains the
dataset in question as a registered subdataset (as opposed to
just being located in a subdirectory without a formal relationship).
Returns
-------
Dataset or None
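
        Examples
        --------
        A sketch, assuming a hypothetical subdataset registered within a
        superdataset at '/tmp/super'::

            subds = Dataset('/tmp/super/sub')
            sup = subds.get_superdataset()              # Dataset('/tmp/super')
            top = subds.get_superdataset(topmost=True)  # highest registered one
            # returns None if there is no (matching) superdataset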
"""
path = self.path
sds_path = path if topmost else None
def res_filter(res):
return res.get('status') == 'ok' and res.get('type') == 'dataset'

        def subds_contains_path(ds, path):
            return path in ds.subdatasets(recursive=False,
                                          contains=path,
                                          result_filter=res_filter,
                                          on_failure='ignore',
                                          result_xfm='paths',
                                          result_renderer='disabled')

while path:
            # normalize the path after adding .. so we are guaranteed not to
            # follow into the original directory if path itself is a symlink
par_path = normpath(opj(path, pardir))
sds_path_ = get_dataset_root(par_path)
if sds_path_ is None:
# no more parents, use previous found
break
sds = Dataset(sds_path_)
if datalad_only:
                # test whether the current git repository is actually a dataset
if not sds.id:
break
if registered_only:
if not subds_contains_path(sds, path):
break
# That was a good candidate
sds_path = sds_path_
path = par_path
if not topmost:
# no looping
break
if sds_path is None:
# None was found
return None
        # No postprocessing should be necessary now, since get_dataset_root
        # tries its best not to resolve symlinks
return Dataset(sds_path)
@optional_args
def datasetmethod(f, name=None, dataset_argname='dataset'):
"""Decorator to bind functions to Dataset class.

    The decorated function is still directly callable and additionally serves
    as method `name` of class Dataset. To achieve this, the first positional
    argument (the Dataset instance) is redirected to the original keyword
    argument `dataset_argname`. All other arguments stay in order (and keep
    their names, of course). That means that the signature of the bound
    method is name(self, a, b) if the original signature is
    name(a, dataset, b), for example.

The decorator has no effect on the actual function decorated with it.
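
    Examples
    --------
    A sketch with a hypothetical function; after decoration it remains
    callable stand-alone and additionally becomes a Dataset method::

        @datasetmethod(name='do_stuff')
        def do_stuff(option, dataset=None):
            return dataset, option

        do_stuff('value', dataset=Dataset('/tmp/some-dataset'))
        Dataset('/tmp/some-dataset').do_stuff('value')  # dataset bound to self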
"""
if not name:
name = f.__name__
@wraps(f)
def apply_func(instance, *args, **kwargs):
# Wrapper function to assign arguments of the bound function to
# original function.
#
# Note
# ----
# This wrapper is NOT returned by the decorator, but only used to bind
# the function `f` to the Dataset class.
kwargs = kwargs.copy()
        # Due to the use of functools.wraps and the inability of getargspec to
        # get those, we use inspect.signature (via get_sig_param_names).
        # More information in the de-wrapt PR https://github.com/datalad/datalad/pull/6190
from datalad.utils import get_sig_param_names
f_args, f_kwonlyargs = get_sig_param_names(f, ('pos_any', 'kw_only'))
# If bound function is used with wrong signature (especially by
# explicitly passing a dataset), let's raise a proper exception instead
# of a 'list index out of range', that is not very telling to the user.
        # Whenever kwonlyargs are used, 'dataset' would not be listed
        # among args, so we account for it possibly being there.
if len(args) >= len(f_args) + int(bool(f_kwonlyargs)):
non_dataset_args = ["self"] + [a for a in f_args if a != dataset_argname]
raise TypeError(
f"{name}() takes at most {len(f_args)} arguments ({len(args)} given): "
f"{non_dataset_args}")
if dataset_argname in kwargs:
raise TypeError(
f"{name}() got an unexpected keyword argument {dataset_argname}")
kwargs[dataset_argname] = instance
if dataset_argname in f_kwonlyargs:
            # * was used to enforce kwargs, so we would just pass things as is
pass
else:
# so it is "old" style, where it is a regular kwargs - we pass everything
# via kwargs
# TODO: issue a DX oriented warning that we advise to separate out kwargs,
# dataset included, with * from positional args?
ds_index = f_args.index(dataset_argname)
for i in range(0, len(args)):
if i < ds_index:
kwargs[f_args[i]] = args[i]
elif i >= ds_index:
kwargs[f_args[i+1]] = args[i]
args = []
return f(*args, **kwargs)
setattr(Dataset, name, apply_func)
# set the ad-hoc attribute so that @build_doc could also bind built doc
# to the dataset method
if getattr(f, '_dataset_method', None):
raise RuntimeError(f"_dataset_method of {f} is already set to {f._dataset_method}")
setattr(f, '_dataset_method', apply_func)
return f
# Note: Cannot be defined within constraints.py, since then dataset.py needs to
# be imported from constraints.py, which needs to be imported from dataset.py
# for another constraint
class EnsureDataset(Constraint):
"""Despite its name, this constraint does not actually ensure that the
argument is a valid dataset, because for procedural reasons this would
typically duplicate subsequent checks and processing. However, it can
be used to achieve uniform documentation of `dataset` arguments."""
def __call__(self, value):
if isinstance(value, Dataset):
return value
elif isinstance(value, (str, PurePath)):
# we cannot convert to a Dataset class right here
# - duplicates require_dataset() later on
# - we need to be able to distinguish between a bound
# dataset method call and a standalone call for
# relative path argument disambiguation
#return Dataset(path=value)
return value
else:
raise ValueError("Can't create Dataset from %s." % type(value))
def short_description(self):
return "Dataset"
def long_description(self):
return """Value must be a Dataset or a valid identifier of a Dataset
(e.g. a path)"""
def require_dataset(dataset, check_installed=True, purpose=None):
"""Helper function to resolve a dataset.
This function tries to resolve a dataset given an input argument,
or based on the process' working directory, if `None` is given.
Parameters
----------
dataset : None or path or Dataset
Some value identifying a dataset or `None`. In the latter case
a dataset will be searched based on the process working directory.
check_installed : bool, optional
        If True, a check will be performed whether the resolved dataset is
        properly installed.
purpose : str, optional
This string will be inserted in error messages to make them more
informative. The pattern is "... dataset for <STRING>".
Returns
-------
Dataset
If a dataset could be determined.
Raises
------
NoDatasetFound
        If no dataset could be determined.
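
    Examples
    --------
    A sketch of typical use at the start of a command implementation; with
    `None`, the dataset is discovered from the working directory (the purpose
    string is hypothetical)::

        ds = require_dataset(None, check_installed=True,
                             purpose='report on content')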
"""
if dataset is not None and not isinstance(dataset, Dataset):
dataset = Dataset(dataset)
if dataset is None: # possible scenario of cmdline calls
dspath = get_dataset_root(getpwd())
if not dspath:
raise NoDatasetFound(
"No dataset found at '{}'{}. Specify a dataset to work with "
"by providing its path via the `dataset` option, "
"or change the current working directory to be in a "
"dataset.".format(
getpwd(),
" for the purpose {!r}".format(purpose) if purpose else ''
)
)
dataset = Dataset(dspath)
assert(dataset is not None)
lgr.debug(u"Resolved dataset%s: %s",
u' to {}'.format(purpose) if purpose else '',
dataset.path)
if check_installed and not dataset.is_installed():
raise NoDatasetFound(
f"No installed dataset found at {dataset.path}")
return dataset
# New helpers, courtesy of datalad-revolution.
# note: not thread safe if threads chdir - uses getpwd
def resolve_path(path, ds=None, ds_resolved=None):
"""Resolve a path specification (against a Dataset location)
Any path is returned as an absolute path. If, and only if, a dataset
object instance is given as `ds`, relative paths are interpreted as
relative to the given dataset. In all other cases, relative paths are
treated as relative to the current working directory.

    Note, however, that this function is not able to resolve arbitrarily
obfuscated path specifications. All operations are purely lexical, and no
actual path resolution against the filesystem content is performed.
Consequently, common relative path arguments like '../something' (relative
to PWD) can be handled properly, but things like 'down/../under' cannot, as
resolving this path properly depends on the actual target of any
(potential) symlink leading up to '..'.
Parameters
----------
path : str or PathLike or list
        Platform-specific path specification. Multiple path specifications
        can be given as a list.
ds : Dataset or PathLike or None
Dataset instance to resolve relative paths against.
ds_resolved : Dataset or None
        A dataset instance that was already created from `ds` by the caller
        can be provided to avoid instantiating it again on repeated calls.
Returns
-------
`pathlib.Path` object or list(Path)
        When a list was given as input, a list is returned; a Path instance
        otherwise.
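
    Examples
    --------
    A sketch with hypothetical POSIX paths; relative input is resolved
    against the dataset only when a Dataset instance is given::

        ds = Dataset('/tmp/some-dataset')
        resolve_path('file.dat', ds=ds)       # Path('/tmp/some-dataset/file.dat')
        resolve_path(['a', '/abs/b'], ds=ds)  # [Path(.../a), Path('/abs/b')]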
"""
got_ds_instance = isinstance(ds, Dataset)
if ds is not None and not got_ds_instance:
ds = ds_resolved or require_dataset(
ds, check_installed=False, purpose='path resolution')
out = []
pwd_parts = None # get it upon first use but only once
for p in ensure_list(path):
if ds is None or not got_ds_instance:
# no dataset at all or no instance provided -> CWD is always the reference
# nothing needs to be done here. Path-conversion and absolutification
# are done next
pass
        # we have a given dataset instance
        elif not Path(p).is_absolute():
            # we have a dataset and the path is not absolute ->
            # resolve it against the dataset
p = ds.pathobj / p
p = ut.Path(p)
# make sure we return an absolute path, but without actually
# resolving anything
if not p.is_absolute():
# in general it is almost impossible to use resolve() when
# we can have symlinks in the root path of a dataset
# (that we don't want to resolve here), symlinks to annex'ed
# files (that we never want to resolve), and other within-repo
# symlinks that we (sometimes) want to resolve (i.e. symlinked
# paths for addressing content vs adding content)
# CONCEPT: do the minimal thing to catch most real-world inputs
# ASSUMPTION: the only sane relative path input that needs
# handling and can be handled are upward references like
# '../../some/that', whereas stuff like 'down/../someotherdown'
# are intellectual exercises
# ALGORITHM: match any number of leading '..' path components
# and shorten the PWD by that number
# NOT using ut.Path.cwd(), because it has symlinks resolved!!
if not pwd_parts:
pwd_parts = ut.Path(getpwd()).parts
path_parts = p.parts
leading_parents = 0
for pp in p.parts:
if pp == op.pardir:
leading_parents += 1
path_parts = path_parts[1:]
elif pp == op.curdir:
# we want to discard that, but without stripping
# a corresponding parent
path_parts = path_parts[1:]
else:
break
p = ut.Path(
op.join(
*(pwd_parts[:-leading_parents if leading_parents else None]
+ path_parts)))
# note that we will not "normpath()" the result, check the
# pathlib docs for why this is the only sane choice in the
# face of the possibility of symlinks in the path
out.append(p)
return out[0] if isinstance(path, (str, PurePath)) else out
# TODO keep this around for a while so that extensions can be updated
rev_resolve_path = resolve_path
def path_under_rev_dataset(ds, path):
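    """Return `path` if it is underneath dataset `ds`, else None

    If a lexical comparison is not conclusive (e.g. for relative or
    symlinked paths), repository roots containing `path` are probed upwards
    via get_dataset_root(); when one of them is the dataset's own path, the
    input is returned re-anchored under ``ds.pathobj``. Otherwise None is
    returned.
    """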
ds_path = ds.pathobj
try:
rpath = str(ut.Path(path).relative_to(ds_path))
if not rpath.startswith(op.pardir):
# path is already underneath the dataset
return path
except Exception:
        # whatever went wrong, we gotta play safe
pass
root = get_dataset_root(str(path))
while root is not None and not ds_path.samefile(root):
# path and therefore root could be relative paths,
# hence in the next round we cannot use dirname()
        # to jump to the next directory up, but we have
# to use ./.. and get_dataset_root() will handle
# the rest just fine
root = get_dataset_root(op.join(root, op.pardir))
if root is None:
return None
return ds_path / op.relpath(str(path), root)
lgr.log(5, "Done importing dataset")