# Source code for datalad_metalad.extractors.annex
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""MetadataRecord extractor for Git-annex metadata
This extractor only deals with the metadata that can be assigned to annexed
files via git-annex's `metadata` command. It does not deal with other implicit
git-annex metadata, such as file availability information. This is already
handled by the `metalad_core` extractor.
There is no standard way to define a vocabulary that is used for this kind of
metadata.
"""
from .base import MetadataExtractor
from six import text_type
import logging
lgr = logging.getLogger('datalad.metadata.extractors.metalad_annex')
from datalad.log import log_progress
from datalad.utils import (
Path,
PurePosixPath,
)
from datalad.support.annexrepo import AnnexRepo
class AnnexMetadataExtractor(MetadataExtractor):
    """Extractor for metadata assigned to annexed files via git-annex.

    Only files whose status record identifies them as annexed (type 'file'
    with a key) are queried, and only files that actually carry metadata
    are reported.
    """

    def __call__(self, dataset, refcommit, process_type, status):
        """Yield one result record per annexed file that carries metadata.

        Parameters
        ----------
        dataset
          Dataset to extract from; its `repo` and `pathobj` attributes are
          used.
        refcommit
          Not used by this extractor.
        process_type
          Extraction is only performed for 'all' or 'content'; any other
          value yields nothing.
        status
          Iterable of dict-like status records. Records with type 'file'
          and a key that is not None are treated as annexed files; their
          'path' is made relative to the dataset location for the query.

        Yields
        ------
        dict
          With the keys 'path' (a PurePosixPath built from the path that
          the metadata query reported), 'metadata' (dict), 'type'
          ('file'), and 'status' ('ok').
        """
        # OPT: .repo could be relatively expensive, hence look it up once
        annex_repo = dataset.repo
        if not isinstance(annex_repo, AnnexRepo) \
                or process_type not in ('all', 'content'):
            # either there is no annex to query, or content metadata
            # was not requested -- nothing to be done
            return

        # Limit the query to anything that looks like an annexed file.
        # Paths are expressed relative to the dataset in order to minimize
        # the cmdline footprint of the annex call.
        annexed_paths = []
        for record in status:
            if record.get('type') != 'file':
                continue
            if record.get('key') is None:
                continue
            annexed_paths.append(
                text_type(Path(record['path']).relative_to(dataset.pathobj)))

        # all progress reports of this extractor share a single ID
        progress_id = 'extractorannex'
        log_progress(
            lgr.info,
            progress_id,
            'Start annex metadata extraction from %s', dataset,
            total=len(annexed_paths),
            label='Annex metadata extraction',
            unit=' Files',
        )

        # - no timestamps, we are describing the status quo
        # - the query only contains annexed files, hence batch mode can be
        #   used to deal with many files
        metadata_records = annex_repo.get_metadata(
            annexed_paths,
            timestamps=False,
            batch=True,
        )
        for fpath, raw_meta in metadata_records:
            log_progress(
                lgr.info,
                progress_id,
                'Extracted annex metadata from %s', fpath,
                update=1,
                increment=True)

            # report a single-item list as the plain item it contains
            meta = {}
            for field, values in raw_meta.items():
                if isinstance(values, list) and len(values) == 1:
                    meta[field] = values[0]
                else:
                    meta[field] = values

            # only talk about files that actually carry metadata
            if meta:
                yield {
                    # git annex reports the path in POSIX conventions
                    'path': PurePosixPath(fpath),
                    'metadata': meta,
                    'type': 'file',
                    'status': 'ok',
                }

        log_progress(
            lgr.info,
            progress_id,
            'Finished annex metadata extraction from %s', dataset,
        )

    def get_state(self, dataset):
        """Return a dict with the `version` of this extractor's output.

        The git-annex version is deliberately not part of the report: any
        git-annex update would trigger a re-extraction of metadata, most
        likely without any change. Instead, increment/change the version
        below whenever the extractor output would change in the future
        (maybe due to changes in git-annex, or for other reasons).

        Parameters
        ----------
        dataset
          Not used.
        """
        return {'version': 1}