Source code for datalad_metalad.extractors.legacy.datalad_rfc822
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Extractor for RFC822-based metadata specifications
This is inspired by (and very similar to) Debian's package metadata format.
The main difference is that information spread across multiple files in Debian
packages, is concentrated in one file.
The main advantage of this format is that it is proven to be hand-editable,
i.e. can be composed from scratch, by hand, in an editor -- with a good
chance of producing syntax-compliant content with the first attempt.
"""
import logging
lgr = logging.getLogger('datalad.metadata.extractors.datalad_rfc822')
from os.path import exists
import email
import email.parser # necessary on Python 2.7.6 (trusty)
from os.path import join as opj
from datalad.interface.base import dedent_docstring
from ..base import BaseMetadataExtractor
def _split_list_field(content):
return [i.strip() for i in content.split(',') if i.strip()]
def _beautify_multiline_field(content):
content = dedent_docstring(content)
lines = content.split('\n')
title = ''
if len(lines):
title = lines[0]
if len(lines) > 1:
content = ''
for l in lines[1:]:
l = l.strip()
content = '{}{}{}'.format(
content,
' ' if len(content) and l != '.' and content[-1] != '\n' else '',
l if l != '.' else '\n')
return title, content
[docs]class DataladRFC822MetadataExtractor(BaseMetadataExtractor):
_metadata_compliance = "http://docs.datalad.org/metadata.html#v0-1"
_core_metadata_filename = opj('.datalad', 'meta.rfc822')
_key2stdkey = {
'name': 'name',
'license': 'license',
'author': 'author',
'maintainer': 'maintainer',
'audience': 'audience',
'homepage': 'homepage',
'version': 'version',
'funding': 'fundedby',
'issue-tracker': 'issuetracker',
'cite-as': 'citation',
'doi': 'sameas',
'description': None,
}
def _get_dataset_metadata(self):
meta = {}
if not exists(opj(self.ds.path, self._core_metadata_filename)):
return meta
spec = email.parser.Parser().parse(
open(opj(self.ds.path, self._core_metadata_filename)),
headersonly=True)
for term in self._key2stdkey:
if term not in spec:
continue
hkey = self._key2stdkey[term]
content = spec[term]
if term == 'description':
short, long = _beautify_multiline_field(content)
meta['shortdescription'] = short
meta['description'] = long
elif term == 'license':
# TODO if title looks like a URL, use it as @id
label, desc = _beautify_multiline_field(content)
if label:
meta[hkey] = [label, desc]
else:
meta[hkey] = desc
elif term in ('maintainer', 'author'):
meta[hkey] = _split_list_field(content)
elif term == 'doi':
meta[hkey] = 'http://dx.doi.org/{}'.format(content)
else:
meta[hkey] = content
meta['conformsto'] = self._metadata_compliance
return meta
def _get_content_metadata(self):
return []