Source code for datalad_metalad.extractors.custom

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""MetadataRecord extractor for custom (JSON-LD) metadata contained in a dataset

One or more source files with metadata can be specified via the
'datalad.metadata.custom-dataset-source' configuration variable.
The content of each file must be a JSON object. A single metadata
dictionary is built by updating it with the content of each JSON
object, in the order in which the source files are given.

By default a single file is read: '.metadata/dataset.json'
"""

from .base import MetadataExtractor

import os.path as op
from six import text_type
import logging
lgr = logging.getLogger('datalad.metadata.extractors.custom')

from datalad.log import log_progress
from datalad.support.json_py import load as jsonload
from datalad.dochelpers import exc_str
from datalad.utils import (
    ensure_list,
    Path,
    PurePosixPath,
)
from .. import get_file_id


class CustomMetadataExtractor(MetadataExtractor):
    def get_required_content(self, dataset, process_type, status):
        if process_type in ('all', 'content'):
            mfile_expr = _get_fmeta_expr(dataset)
            for rec in status:
                # build metadata file path
                meta_fpath = _get_fmeta_objpath(dataset, mfile_expr, rec)
                # use op.lexists to also match broken symlinks
                if meta_fpath is not None and op.lexists(meta_fpath):
                    yield dict(path=meta_fpath)
        if process_type in ('all', 'dataset'):
            srcfiles, _ = _get_dsmeta_srcfiles(dataset)
            for f in srcfiles:
                f = text_type(dataset.pathobj / f)
                if op.lexists(f):
                    yield dict(path=f)
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset
        log_progress(
            lgr.info,
            'extractorcustom',
            'Start custom metadata extraction from %s', ds,
            total=len(status) + 1,
            label='Custom metadata extraction',
            unit=' Files',
        )
        if process_type in ('all', 'content'):
            mfile_expr = _get_fmeta_expr(ds)
            for rec in status:
                log_progress(
                    lgr.info,
                    'extractorcustom',
                    'Extracted custom metadata from %s', rec['path'],
                    update=1,
                    increment=True)
                # build metadata file path
                meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
                if meta_fpath is not None and op.exists(meta_fpath):
                    try:
                        meta = jsonload(text_type(meta_fpath))
                        if isinstance(meta, dict) and meta \
                                and '@id' not in meta:
                            # in case we have a single, top-level
                            # document, and it has no ID: assume that
                            # it describes the file and assign the
                            # datalad file ID
                            meta['@id'] = get_file_id(rec)
                        if meta:
                            yield dict(
                                path=rec['path'],
                                metadata=meta,
                                type=rec['type'],
                                status='ok',
                            )
                    except Exception as e:
                        yield dict(
                            path=rec['path'],
                            type=rec['type'],
                            status='error',
                            message=exc_str(e),
                        )
        if process_type in ('all', 'dataset'):
            for r in _yield_dsmeta(ds):
                yield r
            log_progress(
                lgr.info,
                'extractorcustom',
                'Extracted custom metadata from %s', ds.path,
                update=1,
                increment=True)
        log_progress(
            lgr.info,
            'extractorcustom',
            'Finished custom metadata extraction from %s', ds.path,
        )
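
    # Illustrative example (hypothetical values, not part of the module):
    # for a content file 'sub/one.dat' with a sidecar metadata file
    # '.metadata/content/sub/one.dat.json' containing {"foo": "bar"},
    # __call__() could yield a result record like
    #
    #   dict(path='<dsroot>/sub/one.dat', type='file', status='ok',
    #        metadata={'foo': 'bar', '@id': '<datalad file ID>'})
    #
    # with '@id' filled in via get_file_id(), because the loaded object
    # carried no identifier of its own.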
    def get_state(self, dataset):
        ds = dataset
        return {
            'dataset-source': ds.config.get(
                'datalad.metadata.custom-dataset-source',
                '.metadata/dataset.json'),
            'content-source': _get_fmeta_expr(ds),
        }
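
# Illustrative get_state() output for a dataset without any custom
# configuration (hypothetical, derived from the defaults in get_state()
# and _get_fmeta_expr() below):
#
#   {'dataset-source': '.metadata/dataset.json',
#    'content-source': '.metadata/content/{freldir}/{fname}.json'}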
def _get_dsmeta_srcfiles(ds):
    # which files to look at
    cfg_srcfiles = ds.config.obtain(
        'datalad.metadata.custom-dataset-source',
        [])
    cfg_srcfiles = ensure_list(cfg_srcfiles)
    # OK to be always POSIX
    srcfiles = ['.metadata/dataset.json'] \
        if not cfg_srcfiles and op.lexists(
            text_type(ds.pathobj / '.metadata' / 'dataset.json')) \
        else cfg_srcfiles
    return srcfiles, cfg_srcfiles


def _get_fmeta_expr(ds):
    return ds.config.obtain(
        'datalad.metadata.custom-content-source',
        '.metadata/content/{freldir}/{fname}.json')


def _get_fmeta_objpath(ds, expr, rec):
    fpath = Path(rec['path'])
    if rec.get('type', None) != 'file':  # pragma: no cover
        # nothing else in here
        return
    # build associated metadata file path from POSIX
    # pieces and convert to platform conventions at the end
    return text_type(
        ds.pathobj / PurePosixPath(expr.format(
            freldir=fpath.relative_to(
                ds.pathobj).parent.as_posix(),
            fname=fpath.name)))


def _yield_dsmeta(ds):
    srcfiles, cfg_srcfiles = _get_dsmeta_srcfiles(ds)
    dsmeta = {}
    for srcfile in srcfiles:
        abssrcfile = ds.pathobj / PurePosixPath(srcfile)
        # TODO get annexed files, or do in a central place?
        if not abssrcfile.exists():
            # nothing to load
            # warn if this was configured
            if srcfile in cfg_srcfiles:
                yield dict(
                    path=ds.path,
                    type='dataset',
                    status='impossible',
                    message=(
                        'configured custom metadata source is not '
                        'available in %s: %s',
                        ds, srcfile),
                )
            # no further operation on half-broken metadata
            return
        lgr.debug('Load custom metadata from %s', abssrcfile)
        meta = jsonload(text_type(abssrcfile))
        dsmeta.update(meta)
    if dsmeta:
        if '@id' not in dsmeta:
            dsmeta['@id'] = ds.id
        yield dict(
            path=ds.path,
            metadata=dsmeta,
            type='dataset',
            status='ok',
        )
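

if __name__ == '__main__':  # pragma: no cover
    # Minimal runnable sketch (illustrative only, not part of the module):
    # demonstrate how the default content-source expression maps a content
    # file to its metadata object path, using plain path stand-ins instead
    # of a real dataset. '/tmp/ds' and 'sub/one.dat' are hypothetical.
    _ds_root = Path('/tmp/ds')
    _fpath = _ds_root / 'sub' / 'one.dat'
    _expr = '.metadata/content/{freldir}/{fname}.json'
    print(text_type(
        _ds_root / PurePosixPath(_expr.format(
            freldir=_fpath.relative_to(_ds_root).parent.as_posix(),
            fname=_fpath.name))))
    # on POSIX this prints: /tmp/ds/.metadata/content/sub/one.dat.json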