Source code for datalad_crawler.crawl

# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Interface for crawling a webpage and push extracted data into a dataset"""

__docformat__ = 'restructuredtext'


from os.path import exists
from datalad.interface.base import Interface
from datalad.interface.base import build_doc

from datalad.support.param import Parameter
from datalad.support.constraints import EnsureStr, EnsureNone
from datalad_crawler.pipeline import initiate_pipeline_config
from datalad.support.stats import ActivityStats
from datalad import utils

from logging import getLogger
lgr = getLogger('datalad.api.crawl')

from datalad import cfg


@build_doc
class Crawl(Interface):
    """Crawl online resource to create or update a dataset.

    Examples:

      $ datalad crawl  # within a dataset having .datalad/crawl/crawl.cfg
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    _params_ = dict(
        # Dry run is untested and largely probably not working in this implementation
        # so let's not expose it for now at all
        # dry_run=Parameter(
        #     args=("-n", "--dry-run"),
        #     action="store_true",
        #     doc="""flag if file manipulations to be invoked (e.g., adding to git/annex).
        #     If not, commands are only printed to the stdout"""),
        is_pipeline=Parameter(
            args=("--is-pipeline",),
            action="store_true",
            doc="""flag if provided file is a Python script which defines pipeline()"""),
        is_template=Parameter(
            args=("-t", "--is-template"),
            action="store_true",
            doc="""flag if provided value is the name of the template to use"""),
        recursive=Parameter(
            args=("-r", "--recursive"),
            action="store_true",
            doc="""flag to crawl subdatasets as well (for now serially)"""),
        chdir=Parameter(
            args=("-C", "--chdir"),
            constraints=EnsureStr() | EnsureNone(),
            doc="""directory to chdir to for crawling"""),
        path=Parameter(
            args=('path',),
            metavar='file',
            nargs='?',
            constraints=EnsureStr() | EnsureNone(),
            doc="""configuration (or pipeline if --is-pipeline) file defining crawling,
            or a directory of a dataset on which to perform crawling using its
            standard crawling specification"""),
    )

    @staticmethod
    def __call__(path=None, is_pipeline=False, is_template=False,
                 recursive=False, chdir=None):  # dry_run=False,
        dry_run = False
        from datalad_crawler.pipeline import (
            load_pipeline_from_config, load_pipeline_from_module,
            get_repo_pipeline_config_path, get_repo_pipeline_script_path
        )
        from datalad_crawler.pipeline import run_pipeline
        from datalad.utils import chpwd  # import late so we could mock during tests

        with chpwd(chdir):

            assert not (is_pipeline and is_template), \
                "it is either a pipeline or a template name, can't be both"

            if is_template:
                # generate a config and overload path with its filename
                path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                                commit=True)

            # TODO: centralize via _params_ handling
            if dry_run:
                dryrun_optlabel = 'datalad.crawl.dryrun'
                if dryrun_optlabel in cfg:
                    cfg.unset(dryrun_optlabel, where='local', reload=False)
                cfg.add(dryrun_optlabel, "True", where='local')

            if path is None:
                # get config from the current repository/dataset
                if is_pipeline:
                    raise ValueError("You must specify the file if --pipeline")

                # Let's see if there is a config or pipeline in this repo
                path = get_repo_pipeline_config_path()
                if not path or not exists(path):
                    # Check if there may be the pipeline provided
                    path = get_repo_pipeline_script_path()
                    if path and exists(path):
                        is_pipeline = True

            stats = ActivityStats()

            if not path:
                raise RuntimeError("Cannot locate crawler config or pipeline file")

            if is_pipeline:
                lgr.info("Loading pipeline definition from %s" % path)
                pipeline = load_pipeline_from_module(path)
            else:
                lgr.info("Loading pipeline specification from %s" % path)
                pipeline = load_pipeline_from_config(path)

            lgr.info("Running pipeline %s" % str(pipeline))
            # TODO: capture the state of all branches so in case of crash
            # we could gracefully reset back
            try:
                output = run_pipeline(pipeline, stats=stats)
            except Exception as exc:
                # TODO: config.crawl.failure = full-reset | last-good-master
                # probably ask via ui which action should be performed unless
                # explicitly specified
                raise
            stats.datasets_crawled += 1

            # TODO: Move gc/clean over here!

            stats_total = stats.get_total()

            if recursive:
                # get all subdatasets, and crawl them too!
                ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
                import os
                from datalad.distribution.dataset import Dataset
                from datalad.api import crawl
                from datalad.utils import swallow_logs
                from datalad.dochelpers import exc_str
                # Note: we could collect all datasets to be crawled here or pass recursive=True
                # into the subdatasets' crawl.  We will collect all of them here so we might later
                # also introduce automatic commits when super-dataset got successfully updated
                subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive,
                                                             result_xfm='relpaths')

                lgr.info("Crawling %d subdatasets", len(subdatasets))
                output = [output]
                # TODO: parallelize
                # TODO: assumes that all sub-datasets are 'crawllable', and if not
                # just adds them to crawl_failed count.  But may be we should make it more
                # explicit, that some sub-datasets might not need to be crawled, so they get
                # skipped explicitly?
                for ds_ in subdatasets:
                    ds_logfile = utils.get_logfilename(ds_, 'crawl')
                    try:
                        # TODO: might be cool to be able to report a 'heart beat' from the
                        # swallow into pbar or smth
                        with swallow_logs(file_=ds_logfile) as cml:
                            output_, stats_ = crawl(chdir=ds_)
                            stats_total += stats_
                            output.append(output_)
                        lgr.info("Crawled %s: %s (log: %s)",
                                 ds_, stats_.as_str(mode='line'), ds_logfile)
                    except Exception as exc:
                        stats_total.datasets_crawl_failed += 1
                        stats_total.datasets_crawled += 1
                        output += [None]
                        lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                    ds_, ds_logfile, exc_str(exc))  # , cml.out)

            lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

            return output, stats_total
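
A minimal usage sketch (not part of the module source above), assuming datalad and the datalad-crawler extension are installed so that the crawl command is exposed via datalad.api; the dataset path used here is hypothetical:

    from datalad.api import crawl

    # Crawl a dataset that already carries .datalad/crawl/crawl.cfg, descending
    # into its subdatasets as well; returns the pipeline output and aggregated stats.
    output, stats_total = crawl(chdir='/path/to/dataset', recursive=True)
    print("Total stats: %s" % stats_total.as_str(mode='line'))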