# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Proxy basic file operations (e.g. open) to auto-obtain files upon I/O
"""
import sys
# OPT delay import for expensive mock until used
#from unittest.mock import patch
import builtins
import lzma
import logging
import io
import os
from os.path import dirname, lexists, realpath, sep as pathsep
from os.path import exists
from os.path import isabs
from os.path import join as opj
from datalad.support.exceptions import InvalidGitRepositoryError
from datalad.utils import getpwd
from datalad.dochelpers import exc_str
from datalad.support.annexrepo import AnnexRepo
from datalad.cmdline.helpers import get_repo_instance
from datalad.consts import DATALAD_DOTDIR
# Substring used for a quick detection of a path being under .git/
_DOT_GIT_DIR = "".join((pathsep, '.git', pathsep))

lgr = logging.getLogger("datalad.auto")

# h5py is an optional dependency: the name stays bound to None whenever the
# import does not succeed, so the rest of the module can simply test `if h5py`
try:
    import h5py
except ImportError:
    h5py = None
except Exception as exc:
    # could happen due to misbehaving handlers provided by git module
    # see https://github.com/gitpython-developers/GitPython/issues/600
    # we could overload the handler by providing a blank one, but I do not
    # think it is worthwhile at this point. So let's just issue a warning
    h5py = None
    lgr.warning(
        "Failed to import h5py, so no automagic handling for it atm: %s",
        exc_str(exc)
    )
# TODO: RF to reduce code duplication among cases, also RF tests for the same reason
class _EarlyExit(Exception):
    """Control-flow exception to bail out of the try/except logic in wrapped open

    ``msg`` is a %-style format string and any further positional arguments
    are its values.  They are kept apart (``.msg`` and ``.args``) so that the
    handler can hand both over to the logger for lazy formatting.
    """

    def __init__(self, msg, *args):
        # Exception.__init__ stores exactly the arguments it is given as
        # ``self.args``; ``msg`` is deliberately not among them
        super().__init__(*args)
        self.msg = msg
class AutomagicIO(object):
    """Class to proxy commonly used API for accessing files so they get automatically fetched

    While active, ``builtins.open``, ``io.open``, ``os.stat``,
    ``os.path.exists``, ``os.path.isfile``, ``lzma.LZMAFile`` and (if h5py
    could be imported) ``h5py.File`` are replaced with proxies.  The
    originals are put back by `deactivate()`.  An instance can also be used
    as a context manager, which activates on entry and deactivates on exit.
    """

    # Keyword names under which the proxied callables accept their path:
    # open()/io.open() -> file, lzma.LZMAFile -> filename, h5py.File -> name,
    # os.stat -> path
    _PATH_KWARGS = ("file", "filename", "name", "path")

    def __init__(self, autoget=True, activate=False, check_once=False):
        """
        Parameters
        ----------
        autoget: bool, optional
          If False, proxied calls never attempt to retrieve content.
        activate: bool, optional
          If True, proxying is activated right away.
        check_once: bool, optional
          To speed things up and avoid unnecessary repeated checks,
          if True, paths considered for proxying and corresponding repositories
          are remembered, and are not subject to datalad checks on subsequent calls.
          This option is to be used if you do not expect new git repositories
          to be created or files to get dropped while operating under
          AutomagicIO supervision.
        """
        self._active = False
        # remember the callables we are going to replace, so they could be
        # called by the proxies and restored upon deactivation
        self._builtin_open = builtins.open
        self._io_open = io.open
        self._os_stat = os.stat
        self._builtin_exists = os.path.exists
        self._builtin_isfile = os.path.isfile
        if h5py:
            self._h5py_File = h5py.File
        else:
            self._h5py_File = None
        self._lzma_LZMAFile = lzma.LZMAFile
        self._autoget = autoget
        self._in_open = False
        self._log_online = True
        # OPT: import of mock is delayed until an instance is actually created
        from unittest.mock import patch
        self._patch = patch
        self._paths_cache = set() if check_once else None
        self._repos_cache = {} if check_once else None
        self._getting_repo_instance = False

        if activate:
            self.activate()

    def __enter__(self):
        self.activate()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.deactivate()

    @property
    def autoget(self):
        return self._autoget

    @property
    def active(self):
        return self._active

    @staticmethod
    def _path_as_str(file):
        """Return `file` as a `str` path, or None if it is not a path at all

        `str`, `bytes` and `os.PathLike` objects are accepted.  Anything else
        (e.g. an already opened file object) results in None, so the caller
        can leave such an argument completely untouched.
        """
        if isinstance(file, (str, bytes, os.PathLike)):
            return os.fsdecode(file)
        return None

    def _proxy_open_name_mode(self, origname, origfunc, *args, **kwargs):
        """Proxy for various "open" which have first argument name and 2nd - mode

        Parameters
        ----------
        origname: str
          Dotted name of the proxied callable (e.g. 'builtins.open').  It is
          used to temporarily put the original back in place while the
          request is being handled.
        origfunc: callable
          The original callable.  It is always invoked in the end with the
          unmodified `*args` and `**kwargs`.

        Returns
        -------
        Whatever ``origfunc(*args, **kwargs)`` returns.
        """
        # Only the call which has set the re-entrancy guard may clear it.
        # Otherwise a nested proxied call, which merely bails out early,
        # would disarm the guard of the outer call still in progress.
        owns_guard = False
        # wrap it all for resilience to errors -- proxying must do no harm!
        try:
            if self._in_open:
                raise _EarlyExit("within open already")
            if self._getting_repo_instance:
                raise _EarlyExit("we are opening an annex repo - no files must be annexed")
            self._in_open = owns_guard = True  # just in case someone kept alias/assignment
            # return stock open for the duration of handling so that
            # logging etc could workout correctly
            with self._patch(origname, origfunc):
                lgr.log(3, "Proxying open with %r %r", args, kwargs)

                # deduce the path argument: first positional one, or one of
                # the keywords the proxied callables use for it
                if args:
                    file = args[0]
                else:
                    for filearg in self._PATH_KWARGS:
                        if filearg in kwargs:
                            file = kwargs[filearg]
                            break
                    else:
                        # so the name was missing etc, just proxy into
                        # original open call and let it puke
                        raise _EarlyExit("no name/file was given")

                if isinstance(file, int):
                    raise _EarlyExit("already a file descriptor")

                # Path and bytes do not support the substring test below, and
                # for a file object `in` would iterate over (and so consume)
                # its content -- hence normalize, or leave it alone entirely
                path = self._path_as_str(file)
                if path is None:
                    raise _EarlyExit("not a path but %s", type(file).__name__)

                if self._paths_cache is not None:
                    filefull = path if isabs(path) else os.path.abspath(path)
                    if filefull in self._paths_cache:
                        raise _EarlyExit("considered before")
                    self._paths_cache.add(filefull)

                if _DOT_GIT_DIR in path:
                    raise _EarlyExit("we ignore paths under .git/")

                if len(args) > 1:
                    mode = args[1]
                else:
                    mode = kwargs.get('mode', 'r')

                # a non-string mode (e.g. None) cannot be inspected
                if isinstance(mode, str) and 'r' in mode:
                    self._dataset_auto_get(path)
                else:
                    raise _EarlyExit("mode=%r", mode)
        except _EarlyExit as e:
            lgr.log(2, " skipping since " + e.msg, *e.args,
                    extra={'notraceback': True})
        except Exception as e:
            # If anything goes wrong -- we should complain and proceed
            with self._patch(origname, origfunc):
                lgr.warning("Failed proxying open with %r, %r: %s", args, kwargs, exc_str(e))
        finally:
            if owns_guard:
                self._in_open = False
        # finally give it back to stock open
        return origfunc(*args, **kwargs)

    def _proxy_open(self, *args, **kwargs):
        return self._proxy_open_name_mode('builtins.open', self._builtin_open,
                                          *args, **kwargs)

    def _proxy_io_open(self, *args, **kwargs):
        return self._proxy_open_name_mode('io.open', self._io_open,
                                          *args, **kwargs)

    def _proxy_os_stat(self, *args, **kwargs):
        return self._proxy_open_name_mode('os.stat', self._os_stat,
                                          *args, **kwargs)

    def _proxy_h5py_File(self, *args, **kwargs):
        return self._proxy_open_name_mode('h5py.File', self._h5py_File,
                                          *args, **kwargs)

    def _proxy_lzma_LZMAFile(self, *args, **kwargs):
        return self._proxy_open_name_mode('lzma.LZMAFile', self._lzma_LZMAFile,
                                          *args, **kwargs)

    def _proxy_exists(self, path):
        """Report a path as existing also if it is a symlink resolving into annex/objects

        No content is retrieved here.
        """
        # TODO: decide either it should may be retrieved right away.
        # For now, as long as it is a symlink pointing to under .git/annex
        if exists(path):
            return True
        return lexists(path) and 'annex/objects' in str(realpath(path))

    def _proxy_isfile(self, path):
        return self._proxy_open_name_mode(
            'os.path.isfile', self._builtin_isfile, path
        )

    def _dataset_auto_get(self, filepath):
        """Verify that filepath is under annex, and if so and not present - get it

        Does nothing if `autoget` is off, if `filepath` does not exist even
        as a (dangling) symlink, or if no annex repository is found for its
        directory.

        Parameters
        ----------
        filepath: str
          Path of the file about to be accessed.
        """
        if not self._autoget:
            return
        # if filepath is not there at all (program just "checked" if it could access it
        if not lexists(filepath):
            lgr.log(2, " skipping %s since it is not there", filepath)
            return
        # deduce directory for filepath
        filedir = dirname(filepath)
        annex = None
        if self._repos_cache is not None:
            filedir_parts = filedir.split(pathsep)
            # ATM we do not expect subdatasets under .datalad, so we could take the top
            # level dataset for that
            try:
                filedir = pathsep.join(
                    filedir_parts[:filedir_parts.index(DATALAD_DOTDIR)]
                )
            except ValueError:
                # would happen if no .datalad
                pass
            annex = self._repos_cache.get(filedir)

        if annex is None:
            try:
                # TODO: verify logic for create -- we shouldn't 'annexify' non-annexified
                # see https://github.com/datalad/datalad/issues/204
                #
                # While getting a repo instance, we should not bother to proxy any call
                # since there should be no file we are to get. This should address
                # circular call-in and causing a lockdown of https://github.com/datalad/datalad/issues/5379
                self._getting_repo_instance = True
                annex = get_repo_instance(filedir)
                lgr.log(2, "Got the repository %s id:%s containing %s", annex, id(annex), filedir)
            except (RuntimeError, InvalidGitRepositoryError):
                # must be not under annex etc
                return
            finally:
                self._getting_repo_instance = False
            if self._repos_cache is not None:
                self._repos_cache[filedir] = annex

        if not isinstance(annex, AnnexRepo):
            # not an annex -- can do nothing
            lgr.log(2, " skipping %s since the repo is not annex", filepath)
            return

        # since Git/AnnexRepo functionality treats relative paths relative to the
        # top of the repository and might be outside, get a full path
        if not isabs(filepath):
            filepath = opj(getpwd(), filepath)

        # "quick" check first if under annex at all
        try:
            # might fail. TODO: troubleshoot when it does e.g.
            # datalad/tests/test_auto.py:test_proxying_open_testrepobased
            under_annex = annex.is_under_annex(filepath, batch=True)
        except Exception as exc:  # MIH: really? what if MemoryError
            lgr.log(5, " cannot determine if %s under annex: %s", filepath, exc_str(exc))
            under_annex = None

        # attempt retrieval if it is (or might be) under annex and has no content
        if (under_annex or under_annex is None) and not annex.file_has_content(filepath):
            lgr.info("AutomagicIO: retrieving file content of %s", filepath)
            # NOTE(review): assumes annex.get() returns a dict-like record with
            # 'success', 'note' and 'file' keys -- confirm against AnnexRepo.get
            out = annex.get(filepath)
            if out and not out.get('success', False):
                # to assure that it is present and without trailing/leading new lines
                out['note'] = out.get('note', '').strip()
                lgr.error("Failed to retrieve %(file)s: %(note)s", out)

    def activate(self):
        """Replace the proxied callables with the proxies of this instance

        Does nothing (besides a debug message) if already active.
        """
        # we should stay below info for this message. With PR #1630 we
        # start to use this functionality internally, and this will show
        # up frequently even in cases where it does nothing at all
        lgr.debug("Activating DataLad's AutoMagicIO")
        # Some beasts (e.g. tornado used by IPython) override outputs, and
        # provide fileno which throws exception. In such cases we should not log online
        self._log_online = hasattr(sys.stdout, 'fileno') and hasattr(sys.stderr, 'fileno')
        try:
            if self._log_online:
                sys.stdout.fileno()
                sys.stderr.fileno()
        except Exception:  # whatever a replaced stream decides to throw
            self._log_online = False
        if self.active:
            # this is not a warning, because there is nothing going
            # wrong or being undesired. Nested invocation could happen
            # caused by independent pieces of code, e.g. user code
            # that invokes our own metadata handling.
            lgr.debug("%s already active. No action taken", self)
            return
        # overloads
        builtins.open = self._proxy_open
        io.open = self._proxy_io_open
        os.stat = self._proxy_os_stat
        os.path.exists = self._proxy_exists
        os.path.isfile = self._proxy_isfile
        if h5py:
            h5py.File = self._proxy_h5py_File
        lzma.LZMAFile = self._proxy_lzma_LZMAFile
        self._active = True

    def deactivate(self):
        """Put back the callables which were remembered when the instance was created

        Only issues a warning if the instance is not active.
        """
        # just debug level -- see activate()
        lgr.debug("Deactivating DataLad's AutoMagicIO")
        if not self.active:
            lgr.warning("%s is not active, can't deactivate", self)
            return
        builtins.open = self._builtin_open
        io.open = self._io_open
        os.stat = self._os_stat
        if h5py:
            h5py.File = self._h5py_File
        lzma.LZMAFile = self._lzma_LZMAFile
        os.path.exists = self._builtin_exists
        os.path.isfile = self._builtin_isfile
        self._active = False

    def __del__(self):
        # Best effort: must work on a partially initialized instance too,
        # and must never raise
        try:
            if getattr(self, "_active", False):
                self.deactivate()
        except Exception:
            pass
        # `object` defines no __del__, so only delegate if some other base
        # does.  Zero-argument super() is used on purpose:
        # super(self.__class__, self) would recurse endlessly in a subclass.
        parent_del = getattr(super(), "__del__", None)
        if parent_del is not None:
            try:
                parent_del()
            except Exception:
                pass