Source code for datalad.metadata.extract_metadata

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Run one or more metadata extractors on a dataset or file(s)"""

__docformat__ = 'restructuredtext'

from os import curdir
from os.path import join as opj

from datalad.interface.base import Interface
from datalad.interface.base import build_doc
from datalad.interface.results import get_status_dict
from datalad.interface.utils import eval_results
from datalad.distribution.dataset import datasetmethod
from datalad.distribution.dataset import EnsureDataset
from datalad.distribution.dataset import require_dataset
from datalad.support.param import Parameter
from datalad.support.constraints import EnsureNone, EnsureStr
from datalad.metadata.metadata import _get_metadata
from datalad.metadata.metadata import _get_metadatarelevant_paths


@build_doc
class ExtractMetadata(Interface):
    """Run one or more of DataLad's metadata extractors on a dataset or file.

    The result(s) are structured like the metadata DataLad would extract
    during metadata aggregation. There is one result per dataset/file.

    Examples:

      Extract metadata with two extractors from a dataset in the current directory
      and also from all its files::

        $ datalad extract-metadata -d . --type frictionless_datapackage --type datalad_core

      Extract XMP metadata from a single PDF that is not part of any dataset::

        $ datalad extract-metadata --type xmp Downloads/freshfromtheweb.pdf
    """

    _params_ = dict(
        types=Parameter(
            args=("--type",),
            dest="types",
            metavar=("NAME"),
            action='append',
            required=True,
            doc="""Name of a metadata extractor to be executed.
            [CMD: This option can be given more than once CMD]"""),
        files=Parameter(
            args=("files",),
            metavar="FILE",
            nargs="*",
            doc="Path of a file to extract metadata from.",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""Dataset to extract metadata from. If no `file` is given,
            metadata is extracted from all files of the dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='extract_metadata')
    @eval_results
    def __call__(types, files=None, dataset=None):
        dataset = require_dataset(dataset or curdir,
                                  purpose="extract metadata",
                                  check_installed=not files)
        if not files:
            ds = require_dataset(dataset, check_installed=True)
            subds = ds.subdatasets(recursive=False, result_xfm='relpaths')
            files = list(_get_metadatarelevant_paths(ds, subds))

        dsmeta, contentmeta, error = _get_metadata(
            dataset,
            types,
            global_meta=True,
            content_meta=bool(files),
            paths=files)

        if dataset is not None and dataset.is_installed():
            res = get_status_dict(
                action='metadata',
                ds=dataset,
                refds=dataset.path,
                metadata=dsmeta,
                status='error' if error else 'ok')
            yield res

        for p in contentmeta:
            res = get_status_dict(
                action='metadata',
                path=opj(dataset.path, p) if dataset else p,
                refds=dataset.path,
                metadata=contentmeta[p],
                type='file',
                status='error' if error else 'ok')
            if dataset:
                res['parentds'] = dataset.path
            yield res