Source code for datalad_metalad.extractors.legacy.datalad_core

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Metadata extractor for DataLad's own core storage"""

from ..base import BaseMetadataExtractor

import logging
lgr = logging.getLogger('datalad.metadata.extractors.datalad_core')
from datalad.log import log_progress

from os.path import join as opj
from os.path import exists

from datalad.consts import WEB_SPECIAL_REMOTE_UUID
from datalad.support.json_py import load as jsonload
from datalad.support.annexrepo import AnnexRepo
from datalad.coreapi import subdatasets

from . import (
    DATASET_METADATA_FILE,
    DATALAD_DOTDIR,
)

# use the main version as the core version.
# This import must stay: although it appears unused, each extractor defines a
# vocabulary version.
from .definitions import version as vocabulary_version


class DataladCoreMetadataExtractor(BaseMetadataExtractor):

    NEEDS_CONTENT = False

    _unique_exclude = {"url"}

    def _get_dataset_metadata(self):
        """
        Returns
        -------
        dict
          keys are homogenized datalad metadata keys, values are arbitrary
        """
        fpath = opj(self.ds.path, DATASET_METADATA_FILE)
        obj = {}
        if exists(fpath):
            obj = jsonload(fpath, fixup=True)
        if 'definition' in obj:
            obj['@context'] = obj['definition']
            del obj['definition']
        obj['@id'] = self.ds.id
        subdsinfo = [{
            # this version would change anytime we aggregate metadata,
            # let's not do this for now
            #'version': sds['revision'],
            'type': sds['type'],
            'name': sds['gitmodule_name'],
        }
            for sds in subdatasets(
                dataset=self.ds,
                recursive=False,
                return_type='generator',
                result_renderer='disabled',
                on_failure='ignore')
        ]
        if subdsinfo:
            obj['haspart'] = subdsinfo
        superds = self.ds.get_superdataset(registered_only=True, topmost=False)
        if superds:
            obj['ispartof'] = {
                '@id': superds.id,
                'type': 'dataset',
            }
        return obj

    def _get_content_metadata(self):
        """Get ALL metadata for all dataset content.

        Returns
        -------
        generator((location, metadata_dict))
        """
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Start core metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Core metadata extraction',
            unit=' Files',
        )
        if not isinstance(self.ds.repo, AnnexRepo):
            for p in self.paths:
                # this extractor gives a response for ANY file, as it serves
                # as an indicator of file presence (i.e. a file list) in the
                # content metadata, even if we know nothing but the filename
                # about a file
                yield (p, dict())
            log_progress(
                lgr.info,
                'extractordataladcore',
                'Finished core metadata extraction from %s', self.ds
            )
            return
        valid_paths = None
        if self.paths and sum(len(i) for i in self.paths) > 500000:
            valid_paths = set(self.paths)
        # Availability information
        for file, whereis in self.ds.repo.whereis(
                self.paths if self.paths and valid_paths is None else '.',
                output='full').items():
            if file.startswith(DATALAD_DOTDIR) or valid_paths and file not in valid_paths:
                # do not report on our own internal annexed files (e.g. metadata blobs)
                continue
            log_progress(
                lgr.info,
                'extractordataladcore',
                'Extracted core metadata from %s', file,
                update=1,
                increment=True)
            # pull out proper (public) URLs
            # TODO possibly extend with special remote info later on
            meta = {
                'url': whereis[remote].get('urls', [])
                for remote in whereis
                # "web" remote
                if remote == WEB_SPECIAL_REMOTE_UUID and
                whereis[remote].get('urls', None)}
            yield (file, meta)
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Finished core metadata extraction from %s', self.ds
        )
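
For reference, a minimal usage sketch, not part of this module: it assumes the legacy BaseMetadataExtractor convention of an __init__(ds, paths) constructor and a get_metadata(dataset=..., content=...) method returning a (dict, generator) pair; the dataset path and file list below are placeholders.

from datalad.api import Dataset

ds = Dataset('/path/to/dataset')      # hypothetical dataset location
extractor = DataladCoreMetadataExtractor(ds, paths=['README.md'])

# dataset-level record (plain dict with '@id' and optional 'haspart'/'ispartof')
# plus a generator of (path, metadata_dict) pairs for the requested content
dsmeta, contentmeta = extractor.get_metadata(dataset=True, content=True)
print(dsmeta.get('@id'))              # dataset UUID recorded by DataLad
for path, meta in contentmeta:
    # 'url' lists public URLs known to the annex "web" special remote, if any
    print(path, meta.get('url', []))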