Source code for datalad.plugin.export_to_figshare

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""export a dataset as a TAR/ZIP archive to figshare"""

__docformat__ = 'restructuredtext'

from datalad.utils import unlink
from datalad.interface.base import Interface
from datalad.interface.base import build_doc


class FigshareRESTLaison(object):
    """A little helper to provide a minimal interface to interact with Figshare
    """

    API_URL = 'https://api.figshare.com/v2'

    def __init__(self):
        self._token = None
        from datalad.ui import ui
        self.ui = ui  # we will be chatty here

    @property
    def token(self):
        if self._token is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            provider = providers.get_provider(self.API_URL)
            credential = provider.credential
            self._token = credential().get('token')
        return self._token

    def __call__(self, m, url, data=None, success=None, binary=False,
                 headers=None, return_json=True):
        """A wrapper around requests calls to interpolate the endpoint URL,
        do basic checks and conversion
        """
        import json
        if '://' not in url:
            url_ = self.API_URL + '/' + url
        else:
            url_ = url
        headers = headers or {}
        if data is not None and not binary:
            data = json.dumps(data)
            headers["Content-Type"] = "application/json"
        headers['Authorization'] = "token %s" % self.token
        r = m(url_, data=data, headers=headers)
        status_code = r.status_code
        if (success != "donotcheck") and \
                ((success and status_code not in success) or
                 (not success and status_code >= 400)):
            msg = "Got return code %(status_code)s for %(m)s(%(url_)s)" \
                  % locals()
            raise RuntimeError("Error status: %s" % msg)
        if return_json:
            return r.json() if r.content else {}
        else:
            return r.content
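
    # A sketch (not part of the original module) of what a single wrapped
    # call expands to, assuming a valid token; shown only to illustrate the
    # URL interpolation and header handling done by __call__ above:
    #
    #     import json
    #     import requests
    #     headers = {'Authorization': 'token <TOKEN>',
    #                'Content-Type': 'application/json'}
    #     r = requests.post('https://api.figshare.com/v2/account/articles',
    #                       data=json.dumps({'title': 'demo'}),
    #                       headers=headers)
    #     assert r.status_code < 400  # __call__ raises RuntimeError instead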
    def put(self, *args, **kwargs):
        import requests
        return self(requests.put, *args, **kwargs)
    def post(self, *args, **kwargs):
        import requests
        return self(requests.post, *args, **kwargs)
    def get(self, *args, **kwargs):
        import requests
        return self(requests.get, *args, **kwargs)
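
    # Example use of the helper (a sketch; it assumes a figshare token is
    # already configured for https://api.figshare.com/v2 in the DataLad
    # provider configuration, and that the 'account' endpoint is available
    # as in figshare API v2):
    #
    #     api = FigshareRESTLaison()
    #     me = api.get('account')                                # GET + auth
    #     api.post('account/articles', data={'title': 'demo'})   # JSON POST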
    def upload_file(self, fname, files_url):
        # The v2 API provides no easy way to "just upload": we need to
        # initiate the upload, upload the individual parts, and finalize it.
        # TODO: check if a file with the same name is already available, and
        # offer to remove/prune it
        import os
        from datalad.utils import md5sum
        from datalad.ui import ui
        file_rec = {'md5': md5sum(fname),
                    'name': os.path.basename(fname),
                    'size': os.stat(fname).st_size
                    }
        # Initiate upload
        j = self.post(files_url, file_rec)
        file_endpoint = j['location']
        file_info = self.get(file_endpoint)
        file_upload_info = self.get(file_info['upload_url'])
        pbar = ui.get_progressbar(label=fname,
                                  # fill_text=f.name,
                                  total=file_rec['size'])
        with open(fname, 'rb') as f:
            for part in file_upload_info['parts']:
                udata = dict(file_info, **part)
                if part['status'] == 'PENDING':
                    f.seek(part['startOffset'])
                    data = f.read(part['endOffset'] - part['startOffset'] + 1)
                    url = '{upload_url}/{partNo}'.format(**udata)
                    ok = self.put(url, data=data, binary=True,
                                  return_json=False)
                    assert ok == b'OK'
                pbar.update(part['endOffset'], increment=False)
        pbar.finish()
        # Complete the upload
        self.post(file_endpoint, return_json=False)
        return file_info
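
    # In outline, the part-based upload protocol implemented above (endpoint
    # names taken directly from the calls in upload_file; <ID> and <partNo>
    # are placeholders):
    #
    #     j = api.post('account/articles/<ID>/files', file_rec)  # 1. initiate
    #     info = api.get(j['location'])                          # file record
    #     parts = api.get(info['upload_url'])['parts']           # 2. list parts
    #     # PUT each PENDING byte range to '<upload_url>/<partNo>'
    #     api.post(j['location'], return_json=False)             # 3. finalize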
    def get_article_ids(self):
        articles = self.get('account/articles')
        ids = []
        for item in articles or []:
            self.ui.message(' {id} {url} - {title}'.format(**item))
            ids.append(item['id'])
        return ids
    def create_article(self, title):
        data = {
            'title': title
        }
        # we could prefill more fields interactively if desired
        result = self.post('account/articles', data=data)
        result = self.get(result['location'])
        return result
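
# End-to-end sketch of driving the helper directly (hypothetical article
# title and archive name; assumes a configured figshare token):
#
#     api = FigshareRESTLaison()
#     article = api.create_article(title='my-dataset')
#     info = api.upload_file(
#         'datalad_xyz.zip',
#         files_url='account/articles/%s/files' % article['id'])
#     print(info['download_url'])
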
@build_doc
class ExportToFigshare(Interface):
    """Export the content of a dataset as a ZIP archive to figshare

    Very quick and dirty approach.  Ideally figshare should be supported as
    a proper git annex special remote.  Unfortunately, figshare does not
    support having directories, and can store only a flat list of files.
    That makes it impossible for any sensible publishing of complete
    datasets.

    The only workaround is to publish the dataset as a zip-ball, where the
    entire content is wrapped into a .zip archive for which figshare would
    provide a navigator.
    """

    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureInt,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename",),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated ZIP archive. If no file name is
            given the archive will be generated in the top directory of the
            dataset and will be named: datalad_<dataset_uuid>.zip.""",
            constraints=EnsureStr() | EnsureNone()),
        no_annex=Parameter(
            args=("--no-annex",),
            action="store_true",
            doc="""By default the generated .zip file is added to the annex,
            and all its files are registered in git-annex as available from
            that archive.  Upon upload, the uploaded archive is also
            registered as a possible source in the annex.  Setting this flag
            disables this behavior."""),
        missing_content=Parameter(
            args=("--missing-content",),
            doc="""By default, any discovered file with missing content will
            result in an error and the plugin is aborted.  Setting this to
            'continue' will issue warnings instead of failing on error.  The
            value 'ignore' will only inform about the problem at the 'debug'
            log level.  The latter two can be helpful when generating an
            archive from a dataset where some file content is not available
            locally.""",
            constraints=EnsureChoice("error", "continue", "ignore")),
        # article_id=Parameter(
        #     args=("--project-id",),
        #     metavar="ID",
        #     doc="""If given, article (if article_id is not provided) will be
        #     created in that project.""",
        #     constraints=EnsureInt() | EnsureNone()),
        article_id=Parameter(
            args=("--article-id",),
            metavar="ID",
            doc="""Which article to publish to.""",
            constraints=EnsureInt() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='export_to_figshare')
    @eval_results
    def __call__(dataset,
                 filename=None,
                 missing_content='error',
                 no_annex=False,
                 # TODO: support working with projects and articles within them
                 # project_id=None,
                 article_id=None):
        import os
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            raise ValueError(
                "%s is not an annex repo, so annexification cannot be done"
                % dataset
            )

        if dataset.repo.dirty:
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty "
                "repository"
            )
        if filename is None:
            filename = dataset.path
        lgr.info(
            "Exporting current tree as an archive under %s since figshare "
            "does not support directories",
            filename
        )
        archive_out = next(
            export_archive(
                dataset,
                filename=filename,
                archivetype='zip',
                missing_content=missing_content,
                return_type="generator"
            )
        )
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                    "Would you like to create a new article to upload to?  "
                    "If not - we will list existing articles",
                    title="Article"
                ):
                    article = figshare.create_article(
                        title=os.path.basename(dataset.path)
                    )
                    lgr.info(
                        "Created a new (private) article %(id)s at "
                        "%(url_private_html)s. Please visit it, enter "
                        "additional meta-data and make public",
                        article
                    )
                    article_id = article['id']
                else:
                    article_id = int(ui.question(
                        "Which of the articles should we upload to?",
                        choices=list(map(str, figshare.get_article_ids()))
                    ))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname,
            files_url='account/articles/%s/files' % article_id
        )

        if no_annex:
            lgr.info("Removing generated archive")
            unlink(fname)
        else:
            # I will leave all the complaining etc to the dataset add if path
            # is outside etc
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            repo._annex_custom_command([],
                [
                    "git", "annex", "registerurl", '-c',
                    'annex.alwayscommit=false', key,
                    file_info['download_url']
                ]
            )

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have the archive
                commit=False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated archive, now registered in annex")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the archive

        # if annex in {'delete'}:
        #     dataset.repo.remove(fname)
        # else:
        #     # kinda makes little sense I guess.
        #     # Made more sense if export_archive could export an arbitrary
        #     # treeish so we could create a branch where to dump and export
        #     # to figshare (kinda closer to my idea)
        #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could
        # download-url those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only ID is
            # known)
            message="Published archive {}".format(
                file_info['download_url']),
            file_info=file_info,
            path=dataset,
            action='export_to_figshare',
            logger=lgr
        )

__datalad_plugin__ = ExportToFigshare
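
# Usage sketch (hypothetical ids/paths; with a recent DataLad the plugin is
# typically exposed as the `export-to-figshare` command).  From the shell:
#
#     datalad export-to-figshare --article-id 1234567 myarchive.zip
#
# or via the Python API, where the @datasetmethod decorator above binds the
# command as a Dataset method:
#
#     from datalad.api import Dataset
#     Dataset('/path/to/dataset').export_to_figshare(article_id=1234567)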