# Source code for datalad_container.containers_add

"""Add a container environment to a dataset"""

__docformat__ = 'restructuredtext'

import json
import logging
import os
import os.path as op
import re
from pathlib import (
    Path,
    PurePosixPath,
)
from shutil import copyfile

from datalad.cmd import WitlessRunner
from datalad.distribution.dataset import (
    EnsureDataset,
    datasetmethod,
    require_dataset,
)
from datalad.interface.base import (
    Interface,
    build_doc,
    eval_results,
)
from datalad.interface.results import get_status_dict
from datalad.support.constraints import (
    EnsureNone,
    EnsureStr,
)
from datalad.support.exceptions import InsufficientArgumentsError
from datalad.support.param import Parameter

from .utils import get_container_configuration

lgr = logging.getLogger("datalad.containers.containers_add")

# The DataLad special remote has built-in support for Singularity Hub URLs. Let
# it handle shub:// URLs if it's available.
_HAS_SHUB_DOWNLOADER = True
try:
    import datalad.downloaders.shub
except ImportError:
    lgr.debug("DataLad's shub downloader not found. "
              "Custom handling for shub:// will be used")
    _HAS_SHUB_DOWNLOADER = False


def _resolve_img_url(url):
    """Resolve a URL to an actual download URL that `annex addurl` can handle.

    Only 'shub://' URLs are rewritten, and only when DataLad's own shub
    downloader is unavailable (`_HAS_SHUB_DOWNLOADER` is false). In that case
    the Singularity Hub API is queried and the value of the 'image' field of
    its JSON response is returned. Any other URL is returned unchanged.

    Parameters
    ----------
    url : str
      URL as given by the caller.

    Returns
    -------
    str
      The download URL (identical to `url` when no resolution took place).

    Raises
    ------
    requests.exceptions.RequestException
      If the Singularity Hub query times out, fails, or is answered with an
      HTTP error status.
    KeyError
      If the Singularity Hub response carries no 'image' field.
    """
    shub_scheme = 'shub://'
    if not _HAS_SHUB_DOWNLOADER and url.startswith(shub_scheme):
        # TODO: Remove this handling once the minimum DataLad version is at
        # least 0.14.
        lgr.debug('Query singularity-hub for image download URL')
        import requests
        req = requests.get(
            'https://www.singularity-hub.org/api/container/{}'.format(
                url[len(shub_scheme):]),
            # requests never times out unless told to; do not let an
            # unresponsive server hang the command forever
            timeout=30)
        # report an HTTP failure as such, rather than as an obscure JSON
        # decoding error or a missing 'image' key further down
        req.raise_for_status()
        shub_info = json.loads(req.text)
        url = shub_info['image']
    return url


def _guess_call_fmt(ds, name, url):
    """Helper to guess a container exec setup based on
    - a name (to be able to look up more config
    - a plain url to make inference based on the source location

    Should return `None` is no guess can be made.
    """
    if url is None:
        return None
    elif url.startswith('shub://') or url.startswith('docker://'):
        return 'singularity exec {img} {cmd}'
    elif url.startswith('dhub://'):
        # {python} is replaced with sys.executable on *execute*
        return '{python} -m datalad_container.adapters.docker run {img} {cmd}'


def _ensure_datalad_remote(repo):
    """Make sure the datalad special remote is initialized and enabled.

    Parameters
    ----------
    repo
      Repository object offering `get_special_remotes()`,
      `is_special_annex_remote()` and `enable_remote()` (presumably an
      AnnexRepo -- confirm against the caller).
    """
    # name of the first special remote that is of external type "datalad"
    remote_name = next(
        (spec["name"]
         for spec in repo.get_special_remotes().values()
         if spec.get("externaltype") == "datalad"),
        None)

    if not remote_name:
        # no such remote is known at all: set one up from scratch
        from datalad.consts import DATALAD_SPECIAL_REMOTE
        from datalad.customremotes.base import init_datalad_remote

        init_datalad_remote(repo, DATALAD_SPECIAL_REMOTE, autoenable=True)
        return

    if repo.is_special_annex_remote(remote_name, check_if_known=False):
        lgr.debug("datalad special remote '%s' is already enabled",
                  remote_name)
        return

    # known, but not enabled in this repository
    lgr.debug("datalad special remote '%s' found. Enabling",
              remote_name)
    repo.enable_remote(remote_name)


@build_doc
# all commands must be derived from Interface
class ContainersAdd(Interface):
    # first docstring line is used a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Add a container to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to add the container to. If no dataset
            is given, an attempt is made to identify the dataset based on the
            current working directory""",
            constraints=EnsureDataset() | EnsureNone()
        ),
        name=Parameter(
            args=("name",),
            doc="""The name to register the container under. This also
            determines the default location of the container image
            within the dataset.""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        url=Parameter(
            args=("-u", "--url"),
            doc="""A URL (or local path) to get the container image from. If
            the URL scheme is one recognized by Singularity (e.g.,
            'shub://neurodebian/dcm2niix:latest' or
            'docker://debian:stable-slim'), a command format string for
            Singularity-based execution will be auto-configured when
            [CMD: --call-fmt CMD][PY: call_fmt PY] is not specified.
            For Docker-based container execution with the URL scheme 'dhub://',
            the rest of the URL will be interpreted as the argument to
            'docker pull', the image will be saved to a location
            specified by `name`, and the call format will be auto-configured
            to run docker, unless overwritten. The auto-configured call to docker
            run mounts the CWD to '/tmp' and sets the working directory to '/tmp'.""",
            metavar="URL",
            constraints=EnsureStr() | EnsureNone(),
        ),
        # TODO: The "prepared command stuff should ultimately go somewhere else
        # (probably datalad-run). But first figure out, how exactly to address
        # container datasets
        call_fmt=Parameter(
            args=("--call-fmt",),
            doc="""Command format string indicating how to execute a command in
            this container, e.g. "singularity exec {img} {cmd}". Where '{img}'
            is a placeholder for the path to the container image and '{cmd}' is
            replaced with the desired command. Additional placeholders:
            '{img_dspath}' is relative path to the dataset containing the image,
            '{img_dirpath}' is the directory containing the '{img}'.
            '{python}' expands to the path of the Python executable that is
            running the respective DataLad session, for example a
            'datalad containers-run' command.
            """,
            metavar="FORMAT",
            constraints=EnsureStr() | EnsureNone(),
        ),
        extra_input=Parameter(
            args=("--extra-input",),
            doc="""Additional file the container invocation depends on (e.g.
            overlays used in --call-fmt). Can be specified multiple times.
            Similar to --call-fmt, the placeholders {img_dspath} and
            {img_dirpath} are available. Will be stored in the dataset config and
            later added alongside the container image to the `extra_inputs`
            field in the run-record and thus automatically be fetched when
            needed.
            """,
            action="append",
            default=[],
            metavar="FILE",
            # Can't use EnsureListOf(str) yet as it handles strings as iterables...
            # See this PR: https://github.com/datalad/datalad/pull/7267
            # constraints=EnsureListOf(str) | EnsureNone(),
        ),
        image=Parameter(
            args=("-i", "--image"),
            doc="""Relative path of the container image within the dataset. If not
            given, a default location will be determined using the
            `name` argument.""",
            metavar="IMAGE",
            constraints=EnsureStr() | EnsureNone(),
        ),
        update=Parameter(
            args=("--update",),
            action="store_true",
            doc="""Update the existing container for `name`. If no other
            options are specified, URL will be set to 'updateurl', if
            configured. If a container with `name` does not already exist, this
            option is ignored."""
        )
    )

    # Generator yielding DataLad result records. In order, it
    #   1. validates `name` and looks up any existing configuration for it,
    #   2. determines the image location inside the dataset,
    #   3. obtains the image from `url` (docker pull + save, singularity
    #      build, local copy, or `annex addurl`),
    #   4. records image path, call format, update URL and extra inputs in
    #      the dataset configuration, and
    #   5. saves image and configuration in a single commit.
    # Raises InsufficientArgumentsError when `name` is empty and ValueError
    # when `name` contains characters other than alphanumerics and '-'.
    # (Deliberately documented in comments rather than a docstring, so no
    # new __doc__ is attached to __call__ before @build_doc processes it.)
    @staticmethod
    @datasetmethod(name='containers_add')
    @eval_results
    def __call__(name, url=None, dataset=None, call_fmt=None, image=None,
                 update=False, extra_input=None):
        if not name:
            raise InsufficientArgumentsError("`name` argument is required")

        ds = require_dataset(dataset, check_installed=True,
                             purpose='add container')
        runner = WitlessRunner()

        # prevent madness in the config file
        # (fullmatch, because '$' would also accept a trailing newline)
        if not re.fullmatch(r'[0-9a-zA-Z-]+', name):
            raise ValueError(
                "Container names can only contain alphanumeric characters "
                "and '-', got: '{}'".format(name))

        container_cfg = get_container_configuration(ds, name)
        if 'image' in container_cfg:
            if not update:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="impossible",
                    message=("Container named %r already exists. "
                             "Use --update to reconfigure.",
                             name))
                return

            if not (url or image or call_fmt):
                # No updated values were provided. See if an update url is
                # configured (currently relevant only for Singularity Hub).
                url = container_cfg.get("updateurl")
                if not url:
                    yield get_status_dict(
                        action="containers_add", ds=ds, logger=lgr,
                        status="impossible",
                        message="No values to update specified")
                    return

            # anything not given explicitly is inherited from the existing
            # configuration
            call_fmt = call_fmt or container_cfg.get("cmdexec")
            image = image or container_cfg.get("image")

        if not image:
            loc_cfg_var = "datalad.containers.location"
            container_loc = \
                ds.config.obtain(
                    loc_cfg_var,
                    # if not False it would actually modify the
                    # dataset config file -- undesirable
                    store=False,
                )
            image = op.join(ds.path, container_loc, name, 'image')
        else:
            image = op.join(ds.path, image)

        # this one record is updated in place and yielded with the outcome
        result = get_status_dict(
            action="containers_add",
            path=image,
            type="file",
            logger=lgr,
        )

        if call_fmt is None:
            # maybe built in knowledge can help
            call_fmt = _guess_call_fmt(ds, name, url)

        # collect bits for a final and single save() call
        to_save = []
        imgurl = url
        was_updated = False
        if url:
            if update and op.lexists(image):
                was_updated = True
                # XXX: check=False is used to avoid dropping the image. It
                # should use drop=False if remove() gets such an option (see
                # DataLad's gh-2673).
                for r in ds.remove(image, save=False, check=False,
                                   return_type="generator"):
                    yield r

            imgurl = _resolve_img_url(url)
            lgr.debug('Attempt to obtain container image from: %s', imgurl)
            if url.startswith("dhub://"):
                from .adapters import docker

                docker_image = url[len("dhub://"):]

                lgr.debug(
                    "Running 'docker pull %s' and saving image to %s",
                    docker_image, image)
                runner.run(["docker", "pull", docker_image])
                docker.save(docker_image, image)
            elif url.startswith("docker://"):
                image_dir, image_basename = op.split(image)
                if not image_basename:
                    raise ValueError("No basename in path {}".format(image))
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)

                lgr.info("Building Singularity image for %s "
                         "(this may take some time)",
                         url)
                # build inside the target directory, addressing the image by
                # its basename only
                runner.run(["singularity", "build", image_basename, url],
                           cwd=image_dir or None)
            elif op.exists(url):
                lgr.info("Copying local file %s to %s", url, image)
                image_dir = op.dirname(image)
                if image_dir and not op.exists(image_dir):
                    os.makedirs(image_dir)
                copyfile(url, image)
            else:
                if _HAS_SHUB_DOWNLOADER and url.startswith('shub://'):
                    _ensure_datalad_remote(ds.repo)

                try:
                    ds.repo.add_url_to_file(image, imgurl)
                except Exception as e:
                    # deliberately best-effort: report the failure and carry
                    # on, the existence check below decides how to proceed
                    result["status"] = "error"
                    result["message"] = str(e)
                    yield result
            # TODO do we have to take care of making the image executable
            # if --call_fmt is not provided?
            to_save.append(image)
        # continue despite a remote access failure, the following config
        # setting will enable running the command again with just the name
        # given to ease a re-run
        if not op.lexists(image):
            result["status"] = "error"
            result["message"] = ('no image at %s', image)
            yield result
            return

        # store configs
        cfgbasevar = "datalad.containers.{}".format(name)
        if imgurl != url:
            # store originally given URL, as it resolves to something
            # different and maybe can be used to update the container
            # at a later point in time
            ds.config.set("{}.updateurl".format(cfgbasevar), url)
        # force store the image, and prevent multiple entries
        ds.config.set(
            "{}.image".format(cfgbasevar),
            # always store a POSIX path, relative to dataset root
            str(PurePosixPath(Path(image).relative_to(ds.pathobj))),
            force=True)
        if call_fmt:
            ds.config.set(
                "{}.cmdexec".format(cfgbasevar),
                call_fmt,
                force=True)
        # --extra-input sanity check
        # TODO: might also want to do that for --call-fmt above?
        # NOTE(review): this runs after the image and call format have been
        # written to the config above; a rejected --extra-input therefore
        # leaves those modifications unsaved in the dataset.
        extra_input_placeholders = dict(img_dirpath="", img_dspath="")
        for xi in (extra_input or []):
            try:
                xi.format(**extra_input_placeholders)
            except KeyError as exc:
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="error",
                    message=("--extra-input %r contains unknown placeholder %s. "
                             "Available placeholders: %s",
                             repr(xi), exc, ', '.join(extra_input_placeholders)))
                return
            except (IndexError, ValueError) as exc:
                # positional fields ('{}', '{0}') raise IndexError, unbalanced
                # braces or bad conversions raise ValueError -- report them
                # as a regular error result instead of crashing
                yield get_status_dict(
                    action="containers_add", ds=ds, logger=lgr,
                    status="error",
                    message=("--extra-input %r is not a valid format string "
                             "(%s). Available placeholders: %s",
                             xi, exc, ', '.join(extra_input_placeholders)))
                return

        # actually setting --extra-input config
        cfgextravar = "{}.extra-input".format(cfgbasevar)
        # drop previous values first, so the setting is replaced rather than
        # extended
        if ds.config.get(cfgextravar) is not None:
            ds.config.unset(cfgextravar)
        for xi in (extra_input or []):
            ds.config.add(cfgextravar, xi)

        # store changes
        to_save.append(op.join(".datalad", "config"))
        for r in ds.save(
                path=to_save,
                message="[DATALAD] {do} containerized environment '{name}'".format(
                    do="Update" if was_updated else "Configure",
                    name=name)):
            yield r
        result["status"] = "ok"
        yield result