From 5895bc2b7542385e84b2370bfae102412ad0fafd Mon Sep 17 00:00:00 2001 From: Wessel Dankers Date: Fri, 17 Jan 2025 11:36:22 +0100 Subject: [PATCH 1/2] Automatic repository detection - Add wsgi_autodetect(ing).py with usage like the existing wsgi_autoreload(ing).py scripts. - Can handle directories with and without .git suffix. - Factor out the repository container functionality from the Klaus object into its own class hierarchy (RepoContainer). - Certain aspects of the automatic detection are configurable (specifically, the path that determines whether a subdirectory is a valid repo, whether it should detect removed repos and what are acceptable suffixes). --- klaus/__init__.py | 53 +++++---- klaus/contrib/wsgi_autodetect.py | 43 +++++++ klaus/contrib/wsgi_autodetecting.py | 178 ++++++++++++++++++++++++++++ klaus/repo.py | 31 ++++- klaus/utils.py | 26 ++++ 5 files changed, 307 insertions(+), 24 deletions(-) create mode 100644 klaus/contrib/wsgi_autodetect.py create mode 100644 klaus/contrib/wsgi_autodetecting.py diff --git a/klaus/__init__.py b/klaus/__init__.py index 96caddb2..becad7a1 100644 --- a/klaus/__init__.py +++ b/klaus/__init__.py @@ -12,7 +12,7 @@ from dulwich.errors import NotGitRepository from klaus import utils, views -from klaus.repo import FancyRepo, InvalidRepo +from klaus.repo import DefaultRepoContainer KLAUS_VERSION = utils.guess_git_revision() or "3.0.1" @@ -23,20 +23,40 @@ class Klaus(flask.Flask): "undefined": jinja2.StrictUndefined, } - def __init__(self, repo_paths, site_name, use_smarthttp, ctags_policy="none"): + def __init__( + self, + repo_paths, + site_name, + use_smarthttp, + ctags_policy="none", + repo_container_factory=None, + ): """(See `make_app` for parameter descriptions.)""" self.site_name = site_name self.use_smarthttp = use_smarthttp self.ctags_policy = ctags_policy - valid_repos, invalid_repos = self.load_repos(repo_paths) - self.valid_repos = {repo.namespaced_name: repo for repo in valid_repos} - self.invalid_repos = 
{repo.namespaced_name: repo for repo in invalid_repos} + if repo_container_factory is None: + repo_container_factory = DefaultRepoContainer + + self.repo_container = repo_container_factory(repo_paths) flask.Flask.__init__(self, __name__) self.setup_routes() + @property + def valid_repos(self): + """Repositories that are considered valid by the repository manager""" + + return self.repo_container.valid + + @property + def invalid_repos(self): + """Repositories that were declined by the repository manager""" + + return self.repo_container.invalid + def create_jinja_environment(self): """Called by Flask.__init__""" env = super().create_jinja_environment() @@ -95,17 +115,6 @@ def should_use_ctags(self, git_repo, git_commit): else: raise ValueError("Unknown ctags policy %r" % self.ctags_policy) - def load_repos(self, repo_paths): - valid_repos = [] - invalid_repos = [] - for namespace, paths in repo_paths.items(): - for path in paths: - try: - valid_repos.append(FancyRepo(path, namespace)) - except NotGitRepository: - invalid_repos.append(InvalidRepo(path, namespace)) - return valid_repos, invalid_repos - def make_app( repo_paths, @@ -116,6 +125,7 @@ def make_app( disable_push=False, unauthenticated_push=False, ctags_policy="none", + repo_container_factory=None, ): """ Returns a WSGI app with all the features (smarthttp, authentication) @@ -145,6 +155,8 @@ def make_app( - 'tags-and-branches': use ctags for revisions that are the HEAD of a tag or branc - 'ALL': use ctags for all revisions, may result in high server load! + :param repo_container_factory: A callable that is passed repo_paths and returns the repository container (see klaus.repo.BaseRepoContainer), or None, + in which case klaus.repo.DefaultRepoContainer will be used. 
""" if unauthenticated_push: if not use_smarthttp: @@ -159,25 +171,20 @@ def make_app( raise ValueError( "'htdigest_file' set without 'use_smarthttp' or 'require_browser_auth'" ) - if not isinstance(repo_paths, dict): - # If repos is given as a flat list, put all repos under the "no namespace" namespace - repo_paths = {None: repo_paths} app = Klaus( repo_paths, site_name, use_smarthttp, ctags_policy, + repo_container_factory, ) app.wsgi_app = utils.ProxyFix(app.wsgi_app) if use_smarthttp: # `path -> Repo` mapping for Dulwich's web support dulwich_backend = dulwich.server.DictBackend( - { - "/" + namespaced_name: repo - for namespaced_name, repo in app.valid_repos.items() - } + utils.SlashDictProxy(app.valid_repos) ) # Dulwich takes care of all Git related requests/URLs # and passes through everything else to klaus diff --git a/klaus/contrib/wsgi_autodetect.py b/klaus/contrib/wsgi_autodetect.py new file mode 100644 index 00000000..5aed71fe --- /dev/null +++ b/klaus/contrib/wsgi_autodetect.py @@ -0,0 +1,43 @@ +import os +import warnings +import contextlib + +from .app_args import get_args_from_env, strtobool +from .wsgi_autodetecting import make_autodetecting_app + +try: + repos_root = os.environ['KLAUS_REPOS_ROOT'] +except KeyError: + repos_root = os.environ['KLAUS_REPOS'] + warnings.warn( + "use KLAUS_REPOS_ROOT instead of KLAUS_REPOS for the autodecting apps", + DeprecationWarning, + ) + +args, kwargs = get_args_from_env() +args = (repos_root,) + args[1:] + +with contextlib.suppress(KeyError): + kwargs['detect_removals'] = bool(strtobool(os.environ['KLAUS_DETECT_REMOVALS'])) + +with contextlib.suppress(KeyError): + kwargs['export_ok_path'] = os.environ['KLAUS_EXPORT_OK_PATH'] + +with contextlib.suppress(KeyError): + # How to deal with repository directories named "foo" and/or "foo.git". + # This is a list of potential suffixes, with your operating system's + # directory separator as a separator. 
Examples: + # + # KLAUS_EXPORT_OK_PATH="/.git" + # Directories with and without .git are accepted + # (the first entry is the empty string). Default. + # + # KLAUS_EXPORT_OK_PATH=".git" + # Only .git directories are accepted. + # + # KLAUS_EXPORT_OK_PATH="" + # The .git suffix is not considered. + + kwargs['directory_suffixes'] = os.environ['KLAUS_DIRECTORY_SUFFIXES'].split(os.sep) + +application = make_autodetecting_app(*args, **kwargs) diff --git a/klaus/contrib/wsgi_autodetecting.py b/klaus/contrib/wsgi_autodetecting.py new file mode 100644 index 00000000..f53334e6 --- /dev/null +++ b/klaus/contrib/wsgi_autodetecting.py @@ -0,0 +1,178 @@ +""" +Alternative take on the "automatically discovered repositories" concept +that requires no threads, polling or inotify. Instead the filesystem is +consulted whenever a repository name is looked up. + +Since Path.exists() and Path.iterdir() are fairly quick filesystem +operations, performance should be good for small to medium sites. +FancyRepo() objects are cached. + +Repositories are identified by the existence of a + + /git-daemon-export-ok + +file (for compatibility with gitweb). You can customize this path using +the export_ok_path parameter. Setting it to '.' will cause every +subdirectory to be considered a git repository. + +For large sites this approach may be hard on the filesystem when listing +repositories, because the process of enumerating the git repositories +causes the git-daemon-export-ok file to be checked in every repository. +This can be mitigated by setting detect_removals to False. 
+""" + +import collections.abc +import functools +import os +import pathlib + +import klaus +import klaus.repo + +_bad_names = frozenset([os.curdir, os.pardir]) +_bad_chars = frozenset(['\0', os.sep, os.altsep]) +_default_directory_suffixes = ['', '.git'] + + +def coalesce(*args): + """Return the first argument that is not None""" + + return next(arg for arg in args if arg is not None) + + +class AutodetectingRepoDict(collections.abc.Mapping): + """ + Maintain a virtual read-only dictionary whose contents represent + the presence of git repositories in the given root directory. + + :param root: The path to a directory containing repositories, each + a direct subdirectory of the root. + :param namespace: A namespace that will be applied to all detected + repositories. + :param detect_removals: Detect if repositories have been removed. + Defaults to True. Setting it to False can improve performance + for repository listings in very large sites. + :param export_ok_path: The filesystem path to check (relative to + the candidate repository root) to see if it is a valid servable + git repository. Defaults to 'git-daemon-export-ok'. Set to '.' + if every directory is known to be a valid repository root. + :param directory_suffixes: A list of suffixes that your git directories + may have. The default is ['', '.git']. + """ + + def __init__( + self, + root, + namespace=None, + detect_removals=None, + export_ok_path=None, + directory_suffixes=None, + ): + self._root = pathlib.Path(root) + self._cache = {} + self._namespace = namespace + self._detect_removals = coalesce(detect_removals, True) + self._export_ok_path = coalesce(export_ok_path, 'git-daemon-export-ok') + # Use the keys of a dict in reverse order so that we can create a sort + # of "poor man's splay tree": the suffixes are always tried in reverse + # order. If a suffix was matched succesfully it is moved to the end by + # removing and readding it so that it is tried as the first option for + # the next repository. 
+ self._suffixes = dict.fromkeys( + reversed(list(coalesce(directory_suffixes, _default_directory_suffixes))) + ) + + def __getitem__(self, name): + if ( + not name + or name.startswith('.') + or name in _bad_names + or not _bad_chars.isdisjoint(name) + ): + raise KeyError(name) + + if not self._detect_removals: + # Try returning a cached version first, to avoid filesystem access + try: + return self._cache[name] + except KeyError: + pass + + for suffix in reversed(self._suffixes): + # Bare git repositories may have a .git suffix on the directory name: + path = self._root / (name + suffix) + if (path / self._export_ok_path).exists(): + # Reorder suffix test order on the assumption that most repos will + # have the same suffix: + del self._suffixes[suffix] + self._suffixes[suffix] = None + break + else: + self._cache.pop(name, None) + raise KeyError(name) + + if self._detect_removals: + try: + return self._cache[name] + except KeyError: + pass + + repo = klaus.repo.FancyRepo(str(path), self._namespace) + self._cache[name] = repo + return repo + + def __iter__(self): + def is_valid_repo(path): + if not self._detect_removals and path.name in self._cache: + return True + return (path / self._export_ok_path).exists() + + suffixes = sorted(self._suffixes, key=len, reverse=True) + + def removesuffixes(string): + for suffix in suffixes: + attempt = string.removesuffix(suffix) + if attempt != string: + return attempt + return string + + return ( + removesuffixes(path.name) + for path in self._root.iterdir() + if is_valid_repo(path) + ) + + def __len__(self): + return sum(1 for _ in self) + + +class AutodetectingRepoContainer(klaus.repo.BaseRepoContainer): + """ + RepoContainer based on AutodetectingRepoDict. + See AutodetectingRepoDict for parameter descriptions. 
+ """ + + def __init__(self, repos_root, *args, **kwargs): + super().__init__(repos_root) + self.valid = AutodetectingRepoDict(repos_root, *args, **kwargs) + + +def make_autodetecting_app( + repos_root, + *args, + detect_removals=None, + export_ok_path=None, + directory_suffixes=None, + **kwargs, +): + return klaus.make_app( + repos_root, + *args, + repo_container_factory=functools.partial( + AutodetectingRepoContainer, + detect_removals=detect_removals, + export_ok_path=export_ok_path, + directory_suffixes=directory_suffixes, + ), + **kwargs, + ) diff --git a/klaus/repo.py b/klaus/repo.py index d2298e63..47955177 100644 --- a/klaus/repo.py +++ b/klaus/repo.py @@ -7,7 +7,7 @@ import dulwich import dulwich.patch -from dulwich.errors import NotTreeError +from dulwich.errors import NotGitRepository, NotTreeError from dulwich.object_store import tree_lookup_path from dulwich.objects import S_ISGITLINK, Blob @@ -435,3 +435,32 @@ def namespaced_name(self): return f"~{self.namespace}/{self.name}" else: return self.name + + +class BaseRepoContainer: + """Abstract base class for repository containers.""" + + def __init__(self, repo_paths): + self._repo_paths = repo_paths + self.valid = {} + self.invalid = {} + + +class DefaultRepoContainer(BaseRepoContainer): + """Default repository container that holds a preset list of repositories""" + + def __init__(self, repo_paths): + if not isinstance(repo_paths, dict): + # If repos is given as a flat list, put all repos under the "no namespace" namespace + repo_paths = {None: repo_paths} + + super().__init__(repo_paths) + + for namespace, paths in repo_paths.items(): + for path in paths: + try: + repo = FancyRepo(path, namespace) + self.valid[repo.namespaced_name] = repo + except NotGitRepository: + repo = InvalidRepo(path, namespace) + self.invalid[repo.namespaced_name] = repo diff --git a/klaus/utils.py b/klaus/utils.py index 10d7e208..e3703040 100644 --- a/klaus/utils.py +++ b/klaus/utils.py @@ -1,4 +1,5 @@ import binascii +import 
collections.abc import datetime import locale import mimetypes @@ -102,6 +103,31 @@ def __call__(self, environ, start_response): return self.app(environ, start_response) +class SlashDictProxy(collections.abc.Mapping): + """ + Proxy for dicts that makes keys start with a '/' character. + + The slash is stripped from keys on lookup and prepended to keys on + iteration; the proxy itself is a read-only mapping. + + Needed for dulwich.server.DictBackend. + """ + + def __init__(self, base): + self._base = base + + def __getitem__(self, path): + if not path or path[0] != '/': + raise KeyError(path) + return self._base[path[1:]] + + def __iter__(self): + return ('/' + name for name in self._base) + + def __len__(self): + return len(self._base) + + def timesince(when, now=time.time): """Return the difference between `when` and `now` in human readable form.""" return naturaltime(now() - when) From 6978e73a903ae70a895fa72fb1d51ade7bc6fe0b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Jan 2025 12:58:12 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- klaus/__init__.py | 1 - klaus/contrib/wsgi_autodetect.py | 12 ++++++------ klaus/contrib/wsgi_autodetecting.py | 8 ++++---- klaus/utils.py | 4 ++-- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/klaus/__init__.py b/klaus/__init__.py index becad7a1..28c4e244 100644 --- a/klaus/__init__.py +++ b/klaus/__init__.py @@ -9,7 +9,6 @@ import dulwich.web import flask import httpauth -from dulwich.errors import NotGitRepository from klaus import utils, views from klaus.repo import DefaultRepoContainer diff --git a/klaus/contrib/wsgi_autodetect.py b/klaus/contrib/wsgi_autodetect.py index 5aed71fe..19d6095d 100644 --- a/klaus/contrib/wsgi_autodetect.py +++ b/klaus/contrib/wsgi_autodetect.py @@ -1,14 +1,14 @@ +import contextlib import os import warnings -import contextlib from .app_args import 
get_args_from_env, strtobool from .wsgi_autodetecting import make_autodetecting_app try: - repos_root = os.environ['KLAUS_REPOS_ROOT'] + repos_root = os.environ["KLAUS_REPOS_ROOT"] except KeyError: - repos_root = os.environ['KLAUS_REPOS'] + repos_root = os.environ["KLAUS_REPOS"] warnings.warn( "use KLAUS_REPOS_ROOT instead of KLAUS_REPOS for the autodecting apps", DeprecationWarning, @@ -18,10 +18,10 @@ args = (repos_root,) + args[1:] with contextlib.suppress(KeyError): - kwargs['detect_removals'] = bool(strtobool(os.environ['KLAUS_DETECT_REMOVALS'])) + kwargs["detect_removals"] = bool(strtobool(os.environ["KLAUS_DETECT_REMOVALS"])) with contextlib.suppress(KeyError): - kwargs['export_ok_path'] = os.environ['KLAUS_EXPORT_OK_PATH'] + kwargs["export_ok_path"] = os.environ["KLAUS_EXPORT_OK_PATH"] with contextlib.suppress(KeyError): # How to deal with repository directories named "foo" and/or "foo.git". @@ -38,6 +38,6 @@ # KLAUS_EXPORT_OK_PATH="" # The .git suffix is not considered. - kwargs['directory_suffixes'] = os.environ['KLAUS_DIRECTORY_SUFFIXES'].split(os.sep) + kwargs["directory_suffixes"] = os.environ["KLAUS_DIRECTORY_SUFFIXES"].split(os.sep) application = make_autodetecting_app(*args, **kwargs) diff --git a/klaus/contrib/wsgi_autodetecting.py b/klaus/contrib/wsgi_autodetecting.py index f53334e6..711d5e62 100644 --- a/klaus/contrib/wsgi_autodetecting.py +++ b/klaus/contrib/wsgi_autodetecting.py @@ -30,8 +30,8 @@ import klaus.repo _bad_names = frozenset([os.curdir, os.pardir]) -_bad_chars = frozenset(['\0', os.sep, os.altsep]) -_default_directory_suffixes = ['', '.git'] +_bad_chars = frozenset(["\0", os.sep, os.altsep]) +_default_directory_suffixes = ["", ".git"] def coalesce(*args): @@ -72,7 +72,7 @@ def __init__( self._cache = {} self._namespace = namespace self._detect_removals = coalesce(detect_removals, True) - self._export_ok_path = coalesce(export_ok_path, 'git-daemon-export-ok') + self._export_ok_path = coalesce(export_ok_path, "git-daemon-export-ok") # 
Use the keys of a dict in reverse order so that we can create a sort # of "poor man's splay tree": the suffixes are always tried in reverse # order. If a suffix was matched succesfully it is moved to the end by @@ -85,7 +85,7 @@ def __init__( def __getitem__(self, name): if ( not name - or name.startswith('.') + or name.startswith(".") or name in _bad_names or not _bad_chars.isdisjoint(name) ): diff --git a/klaus/utils.py b/klaus/utils.py index e3703040..6576c2c3 100644 --- a/klaus/utils.py +++ b/klaus/utils.py @@ -117,12 +117,12 @@ def __init__(self, base): self._base = base def __getitem__(self, path): - if not path or path[0] != '/': + if not path or path[0] != "/": raise KeyError(path) return self._base[path[1:]] def __iter__(self): - return ('/' + name for name in self._base) + return ("/" + name for name in self._base) def __len__(self): return len(self._base)