# Source code for datalad_core.tests.fixtures

"""Collection of fixtures for facilitating test implementations

.. currentmodule:: datalad_core.tests.fixtures
.. autosummary::
   :toctree: generated

   annexrepo
   baregitrepo
   bareannexrepo
   cfgman
   gitrepo
   http_fileserver
   modify_dataset
   progress_handler
   skip_when_symlinks_not_supported
   symlinks_supported
   test_datasets
   verify_pristine_gitconfig_global
"""

from __future__ import annotations

__all__ = [
    'annexrepo',
    'bareannexrepo',
    'baregitrepo',
    'cfgman',
    'gitrepo',
    'http_fileserver',
    'modify_dataset',
    'progress_handler',
    'skip_when_symlinks_not_supported',
    'symlinks_supported',
    'test_datasets',
    'verify_pristine_gitconfig_global',
]


import base64
import contextlib
import os
import socket
import threading
from dataclasses import dataclass
from http.server import (
    SimpleHTTPRequestHandler,
    ThreadingHTTPServer,
)
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import (
    TYPE_CHECKING,
)

import pytest

if TYPE_CHECKING:
    from collections.abc import (
        Generator,
        Iterator,
    )

from datalad_core.config import get_manager
from datalad_core.git_utils import (
    apply_changeset,
)
from datalad_core.git_utils import (
    types as gt,
)
from datalad_core.repo import Repo
from datalad_core.runners import (
    CommandError,
    call_git,
    call_git_lines,
    call_git_oneline,
)
from datalad_core.tests import AuditProgressHandler
from datalad_core.tests.utils import (
    call_git_commit,
    create_populated_dataset_annex_wt,
    modify_dataset,
    patched_env,
)

# Unique marker value that is written into the test-specific global Git
# configuration below. The `cfgman` fixture compares the loaded
# `datalad.magic.test-marker` setting against this value to confirm that
# the isolated configuration is actually in effect.
magic_marker = 'c4d0de12-8008-11ef-86ea-3776083add61'
# Content of the temporary global Git config file that the `cfgman`
# fixture writes and points `GIT_CONFIG_GLOBAL` to.
standard_gitconfig = f"""\
[datalad "magic"]
    test-marker = {magic_marker}
[user]
    name = DataLad Tester
    email = test@example.com
[extensions]
    worktreeConfig = false
[annex "security"]
    # from annex 6.20180626 file:/// and http://localhost access isn't
    # allowed by default
    allowed-url-schemes = http https
    allowed-http-addresses = all
    allowed-ip-addresses = all
"""

# Git identity used via environment variables (see `test_committer_env`).
# NOTE(review): the name is spelled `Datalad` here, but `DataLad` in
# `standard_gitconfig` -- confirm whether this difference is intentional.
git_name = 'Datalad Tester'
git_email = 'test@example.com'
# this is different from the base configuration above.
# the `cfgman` fixture is function-scope, hence it cannot
# be used for higher-scope fixtures. This environment
# is for those higher-level fixtures that must make commits.
test_committer_env = {
    'GIT_AUTHOR_NAME': git_name,
    'GIT_AUTHOR_EMAIL': git_email,
    'GIT_COMMITTER_NAME': git_name,
    'GIT_COMMITTER_EMAIL': git_email,
    # a single extra Git configuration item is injected via the
    # GIT_CONFIG_COUNT/GIT_CONFIG_KEY_<n>/GIT_CONFIG_VALUE_<n> variables:
    # a non-standard name for the default branch
    'GIT_CONFIG_COUNT': '1',
    'GIT_CONFIG_KEY_0': 'init.defaultbranch',
    'GIT_CONFIG_VALUE_0': 'unusualdefault',
}


@dataclass(kw_only=True)
class HttpFileServerInstance:
    url: str
    path: Path


@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def cfgman(monkeypatch):
    """Yield a configuration manager with a test-specific global scope

    A temporary file is populated with ``standard_gitconfig`` and declared
    to be the global Git configuration via the ``GIT_CONFIG_GLOBAL``
    environment variable. The ``git-global`` source of the configuration
    manager is reloaded to pick this up, and the manager is yielded.

    When the fixture is torn down -- including when setup is aborted by a
    skip or an error -- the environment is restored first and the
    ``git-global`` source is reloaded afterwards, such that the previous
    configuration is in effect again.

    Any test using this fixture will be skipped for Git versions earlier
    than 2.32, because the `GIT_CONFIG_GLOBAL` environment variable used
    here was only introduced with that version.
    """
    manager = get_manager()
    ggc = manager.sources['git-global']
    try:
        with TemporaryDirectory(
            prefix='datalad_gitcfg_global_',
        ) as tmpdir:
            cfgpath = Path(tmpdir) / 'gitconfig'
            cfgpath.write_text(standard_gitconfig)
            with monkeypatch.context() as m:
                m.setenv('GIT_CONFIG_GLOBAL', str(cfgpath))
                ggc.reinit()
                ggc.load()
                if (
                    ggc['datalad.magic.test-marker'].pristine_value
                    != magic_marker
                ):  # pragma: no cover
                    pytest.skip(
                        'Cannot establish isolated global Git config scope '
                        '(possibly Git too old (needs v2.32)'
                    )
                yield manager
    finally:
        # reload to put the previous config in effect again. This must
        # happen after the environment patch was undone (i.e. outside the
        # `monkeypatch.context()` block), and must also happen when the
        # setup above was left via `pytest.skip()` or an exception,
        # otherwise the manager would keep the temporary configuration
        ggc.reinit()
        ggc.load()
@pytest.fixture(autouse=True, scope='function')  # noqa: PT003
def verify_pristine_gitconfig_global():
    """No test must modify a user's global Git config.

    If such modifications are needed, a custom configuration setup
    limited to the scope of the test requiring it must be arranged.

    The fixture is used automatically for every test. It records all
    items of the global Git configuration before the test runs, and
    compares them with the state after the test.

    Raises
    ------
    AssertionError
      When the global Git configuration differs after the test.
    """
    from datalad_core.config import GlobalGitConfig  # noqa PLC0415

    def get_ggc_state():
        # a fresh instance is used for each snapshot
        ggc = GlobalGitConfig()
        return {k: ggc[k].pristine_value for k in ggc}

    pre = get_ggc_state()
    yield
    if pre != get_ggc_state():  # pragma: no cover
        # this is hard to test, because we are inside an autoused fixture.
        # FWIW: this has been tested manually when it was implemented
        # originally
        msg = (
            'Global Git config modification detected. '
            'Test must be modified to use a temporary configuration target. '
            # point to the fixture of this module that provides an
            # isolated global configuration scope
            'Hint: use the `cfgman` fixture.'
        )
        raise AssertionError(msg)
@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def gitrepo(tmp_path_factory) -> Path:
    """Provide the path to an initialized Git repository

    A new temporary directory is created, and ``git init`` is executed
    in it.

    Returns
    -------
    Path
      Location of the repository.
    """
    # must use the factory to get a unique path even when a concrete
    # test also uses `tmp_path`
    path = tmp_path_factory.mktemp('gitrepo')
    call_git(
        ['init'],
        cwd=path,
        capture_output=True,
    )
    # the fixture has no teardown, hence a plain return value (not a
    # generator) is produced -- the annotation reflects that
    return path
@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def baregitrepo(tmp_path_factory) -> Path:
    """Provide the path to an initialized, bare Git repository

    A new temporary directory is created, and ``git init --bare`` is
    executed in it.

    Returns
    -------
    Path
      Location of the bare repository.
    """
    # must use the factory to get a unique path even when a concrete
    # test also uses `tmp_path`
    path = tmp_path_factory.mktemp('gitrepo')
    call_git(
        ['init', '--bare'],
        cwd=path,
        capture_output=True,
    )
    # the fixture has no teardown, hence a plain return value (not a
    # generator) is produced -- the annotation reflects that
    return path
@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def bareannexrepo(baregitrepo) -> Path:
    """Provide the path to a bare Git repository with an initialized annex

    ``git annex init`` is executed in the repository provided by the
    ``baregitrepo`` fixture.

    Returns
    -------
    Path
      Location of the bare repository (same as ``baregitrepo``).
    """
    call_git(
        ['annex', 'init'],
        cwd=baregitrepo,
        capture_output=True,
    )
    # the fixture has no teardown, hence a plain return value (not a
    # generator) is produced -- the annotation reflects that
    return baregitrepo
@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def annexrepo(gitrepo) -> Path:
    """Provide the path to a Git repository with an initialized annex

    ``git annex init`` is executed in the repository provided by the
    ``gitrepo`` fixture.

    Returns
    -------
    Path
      Location of the repository (same as ``gitrepo``).
    """
    call_git(
        ['annex', 'init'],
        cwd=gitrepo,
        capture_output=True,
    )
    # the fixture has no teardown, hence a plain return value (not a
    # generator) is produced -- the annotation reflects that
    return gitrepo
@pytest.fixture(scope='session')
def modified_dataset(tmp_path_factory):
    """Yield the path of a Git repository with a variety of modifications

    The fixture is session-scope, and is meant to be shared by many tests
    that focus on reporting. Consuming tests must not modify it any
    further. On teardown, the fixture raises an ``AssertionError`` when
    the ``git status`` output no longer matches the one recorded at setup
    time. Tests that need to modify such a dataset can apply the same
    modifications to a repository of their own with the helper
    ``modify_dataset()``.

    ``git status`` will report::

        > git status -uall
        On branch dl-test-branch
        Changes to be committed:
          (use "git restore --staged <file>..." to unstage)
            new file:   dir_m/file_a
            new file:   file_a
            new file:   file_am

        Changes not staged for commit:
          (use "git add/rm <file>..." to update what will be committed)
          (use "git restore <file>..." to discard changes in working directory)
          (commit or discard the untracked or modified content in submodules)
            deleted:    dir_d/file_d
            deleted:    dir_m/file_d
            modified:   dir_m/file_m
            deleted:    dir_sm/sm_d
            modified:   dir_sm/sm_m (modified content)
            modified:   dir_sm/sm_mu (modified content, untracked content)
            modified:   dir_sm/sm_n (new commits)
            modified:   dir_sm/sm_nm (new commits, modified content)
            modified:   dir_sm/sm_nmu (new commits, modified content, untracked content)
            modified:   dir_sm/sm_u (untracked content)
            modified:   file_am
            deleted:    file_d
            modified:   file_m

        Untracked files:
          (use "git add <file>..." to include in what will be committed)
            dir_m/dir_u/file_u
            dir_m/file_u
            dir_u/file_u
            file_u

    The suffix of a name indicates the ought-to state (multiple possible):

        a - added
        c - clean
        d - deleted
        n - new commits
        m - modified
        u - untracked content

    The prefix of a name indicates the item type:

        file - file
        sm - submodule
        dir - directory
    """
    repo_path = tmp_path_factory.mktemp('gitrepo')
    call_git(['init'], cwd=repo_path, capture_output=True)

    # commits require a Git identity. The function-scope `cfgman` fixture
    # cannot be used from this higher-scope fixture, hence the identity
    # is provided via the environment
    with patched_env(**test_committer_env):
        # `modify_dataset()` returns the status output that serves as the
        # reference for the teardown check below
        reference_status = modify_dataset(repo_path).splitlines()

    yield repo_path

    # any deviation from the initial git-status output could invalidate
    # the assumptions of consuming tests. The modifying code must be
    # found and fixed
    current_status = call_git_lines(
        ['status', '-uall', '--porcelain=v1'], cwd=repo_path
    )
    if current_status == reference_status:
        return
    msg = 'Unexpected modification of the testbed'
    raise AssertionError(msg)
@pytest.fixture(scope='session')
def test_datasets(http_fileserver) -> Iterator[HttpFileServerInstance]:
    """Populate the test HTTP file server with a set of linked datasets

    The following repositories are created in the directory served by
    ``http_fileserver``:

    - ds1-wt: repo with annex and checkout, no submodules
    - ds1.git: bare clone of ds1-wt
    - ds2.git: bare clone of ds1.git with submodules (also ds1) added
    - ds3.git: bare clone of ds2.git with ds2 as a submodule added

    In addition, a file ``protected/files/secret.txt`` is placed in the
    served directory, and registered by URL in ds1-wt as ``secret.txt``.

    The ``http_fileserver`` instance is yielded. On teardown, a
    ``RuntimeError`` is raised when ``HEAD`` of any of the repositories
    has changed, or when any annex key other than ``secret.txt`` is not
    present in a repository.
    """
    serve_root = http_fileserver.path
    base_url = http_fileserver.url

    secret_path = serve_root / 'protected' / 'files' / 'secret.txt'
    secret_path.parent.mkdir(parents=True)
    secret_path.write_text('supersecret')

    def bareclone(src, dest):
        call_git(['clone', '-q', '--bare', src, dest])
        call_git(['annex', 'init', '--quiet'], cwd=dest)
        # we clone from origin to not pull secret.txt via its
        # URL
        call_git(['annex', 'get', '--quiet', '-f', 'origin'], cwd=dest)

    ds1_worktree = serve_root / 'ds1-wt'
    ds1_bare = serve_root / 'ds1.git'
    ds2_bare = serve_root / 'ds2.git'
    ds3_bare = serve_root / 'ds3.git'

    # commits require a Git identity. The function-scope `cfgman` fixture
    # cannot be used from this session-scope fixture, hence the identity
    # is provided via the environment
    with patched_env(**test_committer_env):
        create_populated_dataset_annex_wt(ds1_worktree)
        addurl_cmd = [
            '-c',
            'annex.security.allowed-ip-addresses=all',
            'annex',
            'addurl',
            '--quiet',
            '--relaxed',
            '--raw',
            '--fast',
            '--file',
            'secret.txt',
            f'{base_url}protected/files/secret.txt',
        ]
        call_git(addurl_cmd, cwd=ds1_worktree)
        call_git_commit(ds1_worktree, msg='add remote secret by URL')

        bareclone(ds1_worktree, ds1_bare)
        ds1_head = call_git_oneline(['rev-parse', 'HEAD'], cwd=ds1_bare)
        if ds1_head is None:
            msg = 'Cannot determine repo HEAD'
            raise RuntimeError(msg)

        bareclone(ds1_bare, ds2_bare)
        # NOTE(review): `http_fileserver` builds its base URL with a
        # trailing slash, hence the submodule URLs below contain `//`
        # -- confirm whether that is intended
        ds2_changes = {
            'subdir1/subm1': (gt.GitObjectMode.SUBMODULE, ds1_head),
            'subm2': (gt.GitObjectMode.SUBMODULE, ds1_head),
            '.gitmodules': f"""\
[submodule "subdir1/subm1"]
    path = subdir1/subm1
    url = {base_url}/ds1.git
[submodule "subm2"]
    path = subm2
    url = {base_url}/ds1.git
""",
        }
        ds2_head = apply_changeset(
            Repo(ds2_bare),
            ds2_changes,
            message='register submodules',
        )
        if ds2_head is None:
            msg = 'Cannot determine repo HEAD'
            raise RuntimeError(msg)

        bareclone(ds2_bare, ds3_bare)
        ds3_changes = {
            'subm3': (gt.GitObjectMode.SUBMODULE, ds2_head),
            '.gitmodules': f'[submodule "subm3"]\n\tpath = subm3\n\turl = {base_url}/ds2.git\n',
        }
        apply_changeset(
            Repo(ds3_bare),
            ds3_changes,
            message='register submodule',
        )

        # finalize, and record the state of each repository as the
        # reference for the teardown checks
        recorded_heads = []
        for repo_path in (ds1_worktree, ds1_bare, ds2_bare, ds3_bare):
            # more efficient access from here on
            call_git(['gc'], capture_output=True, cwd=repo_path)
            call_git(['update-server-info'], cwd=repo_path)
            head = call_git_oneline(['rev-parse', 'HEAD'], cwd=repo_path)
            recorded_heads.append((repo_path, head))

    yield http_fileserver

    for ds, state in recorded_heads:
        current_head = call_git_oneline(['rev-parse', 'HEAD'], cwd=ds)
        if current_head != state:
            msg = 'Undesired modification of `test_datasets` fixture'
            raise RuntimeError(msg)
        # we do --branch HEAD to also function in bare repos
        missing_keys = call_git(
            ['annex', 'find', '--not', '--in', 'here', '--branch', 'HEAD'],
            capture_output=True,
            cwd=ds,
        )
        if TYPE_CHECKING:
            assert isinstance(missing_keys, bytes)
        # secret.txt is served from a dedicated URL, need not be in the annex
        remaining = missing_keys.replace(b'secret.txt', b'')
        if not remaining.strip():
            continue
        msg = (
            'Undesired missing annex key in `test_datasets` fixture: '
            f'{ds}\n{remaining.decode()}'
        )
        raise RuntimeError(msg)
@pytest.fixture(scope='session')
def http_fileserver(
    tmp_path_factory,
    host: str = '127.0.0.1',
    port: int | None = None,
) -> Iterator[HttpFileServerInstance]:
    """Starts a ThreadingHTTPServer serving a temporary directory

    The server runs in a daemon thread for the duration of the test
    session, and is shut down on teardown. Any request for a path inside
    the ``protected`` subdirectory of the served directory requires HTTP
    basic authentication (user ``datalad``, password ``supersecret``),
    and is answered with a ``401`` response otherwise.

    Parameters
    ----------
    tmp_path_factory:
      pytest fixture used to create the served directory.
    host: str
      Address to bind the server to.
    port: int, optional
      Port to bind the server to. With ``None``, a free port is assigned
      by the operating system.

    Yields the base URL and served path.
    """
    tmp_path = tmp_path_factory.mktemp('http_serve')
    # create a password-protected partition
    protected_root = 'protected'
    protected_dir = tmp_path / protected_root
    protected_dir.mkdir()

    # a custom handler that serves the TMP directory
    class _ServeDirHandler(SimpleHTTPRequestHandler):
        USERNAME = 'datalad'
        PASSWORD = 'supersecret'  # noqa: S105

        def __init__(self, *args):
            super().__init__(*args, directory=str(tmp_path))

        def log_message(self, *args):
            # we do not want the noise of the file server log
            # disable completely here. Maybe a use case for
            # a debug switch at some point
            pass

        def is_protected(self) -> bool:
            # decide based on the filesystem location the request maps
            # to, rather than on the raw request path. This way, query
            # strings, percent-encoding, and redundant path components
            # (`/./`, `//`, `/x/../`) cannot be used to get around the
            # check, and a sibling like `/protected-not` is not matched
            target = Path(self.translate_path(self.path))
            return target.is_relative_to(protected_dir)

        def send_unauthorized(self):
            self.send_response(401)
            self.send_header('WWW-Authenticate', 'Basic realm="Protected"')
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            # a response to a HEAD request must not carry a body
            if self.command != 'HEAD':
                self.wfile.write(b'401 Unauthorized\n')

        def _check_basic_auth(self, header_value: str | None) -> bool:
            if not header_value:
                return False
            try:
                scheme, cred = header_value.split(' ', 1)
                if scheme.lower() != 'basic':
                    return False
                decoded = base64.b64decode(cred).decode('utf-8')
                user, pwd = decoded.split(':', 1)
            except ValueError:
                # covers all ways in which a header can be malformed:
                # missing separators, invalid base64 (`binascii.Error`),
                # and undecodable bytes (`UnicodeDecodeError`)
                return False
            return (
                user == _ServeDirHandler.USERNAME
                and pwd == _ServeDirHandler.PASSWORD
            )

        def _is_authorized(self) -> bool:
            # anything outside the protected partition is public
            if not self.is_protected():
                return True
            return self._check_basic_auth(self.headers.get('Authorization'))

        def do_GET(self):
            if not self._is_authorized():
                return self.send_unauthorized()
            return super().do_GET()

        def do_HEAD(self):
            if not self._is_authorized():
                return self.send_unauthorized()
            return super().do_HEAD()

    # with port 0 the operating system assigns a free port at bind time.
    # Probing for a free port with a separate socket first would leave a
    # window in which another process could grab that port
    server = ThreadingHTTPServer(
        (host, 0 if port is None else port),
        _ServeDirHandler,
    )
    bound_port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()

    base_url = f'http://{host}:{bound_port}/'
    try:
        yield HttpFileServerInstance(url=base_url, path=tmp_path)
    finally:
        server.shutdown()
        server.server_close()
        thread.join(timeout=1)
@pytest.fixture(autouse=False, scope='function')  # noqa: PT003
def progress_handler():
    """Provide a new :class:`~datalad_core.tests.AuditProgressHandler`

    The fixture is function-scope, hence each test receives an instance
    of its own.
    """
    return AuditProgressHandler()