diff --git a/klaus/__init__.py b/klaus/__init__.py index 96caddb..28c4e24 100644 --- a/klaus/__init__.py +++ b/klaus/__init__.py @@ -9,10 +9,9 @@ import dulwich.web import flask import httpauth -from dulwich.errors import NotGitRepository from klaus import utils, views -from klaus.repo import FancyRepo, InvalidRepo +from klaus.repo import DefaultRepoContainer KLAUS_VERSION = utils.guess_git_revision() or "3.0.1" @@ -23,20 +22,40 @@ class Klaus(flask.Flask): "undefined": jinja2.StrictUndefined, } - def __init__(self, repo_paths, site_name, use_smarthttp, ctags_policy="none"): + def __init__( + self, + repo_paths, + site_name, + use_smarthttp, + ctags_policy="none", + repo_container_factory=None, + ): """(See `make_app` for parameter descriptions.)""" self.site_name = site_name self.use_smarthttp = use_smarthttp self.ctags_policy = ctags_policy - valid_repos, invalid_repos = self.load_repos(repo_paths) - self.valid_repos = {repo.namespaced_name: repo for repo in valid_repos} - self.invalid_repos = {repo.namespaced_name: repo for repo in invalid_repos} + if repo_container_factory is None: + repo_container_factory = DefaultRepoContainer + + self.repo_container = repo_container_factory(repo_paths) flask.Flask.__init__(self, __name__) self.setup_routes() + @property + def valid_repos(self): + """Repositories that are considered valid by the repository manager""" + + return self.repo_container.valid + + @property + def invalid_repos(self): + """Repositories that were declined by the repository manager""" + + return self.repo_container.invalid + def create_jinja_environment(self): """Called by Flask.__init__""" env = super().create_jinja_environment() @@ -95,17 +114,6 @@ def should_use_ctags(self, git_repo, git_commit): else: raise ValueError("Unknown ctags policy %r" % self.ctags_policy) - def load_repos(self, repo_paths): - valid_repos = [] - invalid_repos = [] - for namespace, paths in repo_paths.items(): - for path in paths: - try: - valid_repos.append(FancyRepo(path, 
"""
WSGI entry point for klaus with automatically detected repositories.

Configuration is read from environment variables:

KLAUS_REPOS_ROOT
    Directory whose direct subdirectories are the repositories to serve.
    KLAUS_REPOS is still accepted as a deprecated fallback.
KLAUS_DETECT_REMOVALS
    Boolean string (parsed by ``strtobool``); forwarded as
    ``detect_removals``.
KLAUS_EXPORT_OK_PATH
    Forwarded as ``export_ok_path``.
KLAUS_DIRECTORY_SUFFIXES
    List of directory name suffixes, separated by the operating system's
    directory separator; forwarded as ``directory_suffixes``.

All remaining settings come from ``get_args_from_env()``.
"""

import contextlib
import os
import warnings

from .app_args import get_args_from_env, strtobool
from .wsgi_autodetecting import make_autodetecting_app

try:
    repos_root = os.environ["KLAUS_REPOS_ROOT"]
except KeyError:
    # Fall back to the old variable name; a KeyError still propagates if
    # neither variable is set.
    repos_root = os.environ["KLAUS_REPOS"]
    warnings.warn(
        "use KLAUS_REPOS_ROOT instead of KLAUS_REPOS for the autodetecting apps",
        DeprecationWarning,
    )

args, kwargs = get_args_from_env()
# Replace the first positional argument with the repository root.
# NOTE(review): presumably args[0] is the repository list derived from
# KLAUS_REPOS -- confirm against app_args.get_args_from_env.
args = (repos_root,) + args[1:]

# Each optional setting is only passed on when its variable is present, so
# that make_autodetecting_app can apply its own defaults otherwise.
with contextlib.suppress(KeyError):
    kwargs["detect_removals"] = bool(strtobool(os.environ["KLAUS_DETECT_REMOVALS"]))

with contextlib.suppress(KeyError):
    kwargs["export_ok_path"] = os.environ["KLAUS_EXPORT_OK_PATH"]

with contextlib.suppress(KeyError):
    # How to deal with repository directories named "foo" and/or "foo.git".
    # This is a list of potential suffixes, with your operating system's
    # directory separator as a separator. Examples (using "/"):
    #
    # KLAUS_DIRECTORY_SUFFIXES="/.git"
    #     Directories with and without .git are accepted
    #     (the first entry is the empty string). Default.
    #
    # KLAUS_DIRECTORY_SUFFIXES=".git"
    #     Only .git directories are accepted.
    #
    # KLAUS_DIRECTORY_SUFFIXES=""
    #     The .git suffix is not considered.

    kwargs["directory_suffixes"] = os.environ["KLAUS_DIRECTORY_SUFFIXES"].split(os.sep)

application = make_autodetecting_app(*args, **kwargs)
class AutodetectingRepoDict(collections.abc.Mapping):
    """
    Maintain a virtual read-only dictionary whose contents represent
    the presence of git repositories in the given root directory.

    Keys are repository names (the directory name with any configured
    suffix removed); values are cached klaus.repo.FancyRepo objects.
    Every key produced by iteration can be looked up with the same
    suffix configuration, and each key is produced at most once.

    :param root: The path to a directory containing repositories, each
        a direct subdirectory of the root.
    :param namespace: A namespace that will be applied to all detected
        repositories.
    :param detect_removals: Detect if repositories have been removed.
        Defaults to True. Setting it to False can improve performance
        for repository listings in very large sites.
    :param export_ok_path: The filesystem path to check (relative to
        the candidate repository root) to see if it is a valid servable
        git repository. Defaults to 'git-daemon-export-ok'. Set to '.'
        if every directory is known to be a valid repository root.
    :param directory_suffixes: A list of suffixes that your git directories
        may have. The default is ['', '.git'].
    """

    def __init__(
        self,
        root,
        namespace=None,
        detect_removals=None,
        export_ok_path=None,
        directory_suffixes=None,
    ):
        self._root = pathlib.Path(root)
        self._cache = {}
        self._namespace = namespace
        self._detect_removals = coalesce(detect_removals, True)
        self._export_ok_path = coalesce(export_ok_path, "git-daemon-export-ok")
        # Suffixes in the order in which they are tried. A suffix that was
        # matched successfully is moved to the front (see _find_repo_path) on
        # the assumption that most repositories share the same suffix. The
        # tuple is never mutated, only rebound, so a lookup that is already
        # iterating over it is not disturbed.
        self._suffixes = tuple(
            dict.fromkeys(coalesce(directory_suffixes, _default_directory_suffixes))
        )

    def _find_repo_path(self, name):
        """Return the directory that currently holds repository `name`, or None."""
        suffixes = self._suffixes
        for index, suffix in enumerate(suffixes):
            # Bare git repositories may have a .git suffix on the directory name:
            path = self._root / (name + suffix)
            if (path / self._export_ok_path).exists():
                if index:
                    # Move the matching suffix to the front for the next lookup.
                    self._suffixes = (
                        suffix,
                        *suffixes[:index],
                        *suffixes[index + 1 :],
                    )
                return path
        return None

    def __getitem__(self, name):
        """
        Return the repository called `name`.

        :raises KeyError: if `name` is not a usable repository name or no
            matching directory containing the export-ok path exists.
        """
        if (
            not isinstance(name, str)
            or not name
            or name.startswith(".")
            or name in _bad_names
            or not _bad_chars.isdisjoint(name)
        ):
            raise KeyError(name)

        if not self._detect_removals:
            # Try returning a cached version first, to avoid filesystem access
            try:
                return self._cache[name]
            except KeyError:
                pass

        path = self._find_repo_path(name)
        if path is None:
            # The repository is gone (or never existed): forget any cached object.
            self._cache.pop(name, None)
            raise KeyError(name)

        repo = self._cache.get(name)
        if repo is None:
            repo = klaus.repo.FancyRepo(str(path), self._namespace)
            self._cache[name] = repo
        return repo

    def __iter__(self):
        """Yield the name of every repository currently found in the root."""
        # Longest suffix first, so that "foo.git" is reported as "foo" even
        # when the empty suffix is configured as well.
        suffixes = sorted(self._suffixes, key=len, reverse=True)
        seen = set()

        for path in self._root.iterdir():
            entry = path.name
            if entry.startswith("."):
                # __getitem__ refuses such names, so do not list them either.
                continue

            # Strip the first configured suffix that leaves a non-empty name.
            # Entries that match no configured suffix cannot be looked up and
            # are therefore not listed.
            name = next(
                (
                    entry.removesuffix(suffix)
                    for suffix in suffixes
                    if entry.endswith(suffix) and len(entry) > len(suffix)
                ),
                None,
            )
            if name is None or name in seen:
                continue

            # With detect_removals disabled a cached repository is trusted
            # without consulting the filesystem.
            if (not self._detect_removals and name in self._cache) or (
                path / self._export_ok_path
            ).exists():
                seen.add(name)
                yield name

    def __len__(self):
        return sum(1 for _ in self)
class SlashDictProxy(collections.abc.Mapping):
    """
    Read-only view of a mapping that makes its keys start with a '/' character.

    The slash is added to the keys when iterating and removed from the key
    when an item is looked up. The underlying mapping is consulted on every
    access, so later changes to it are visible through the proxy.

    Needed for dulwich.server.DictBackend.
    """

    def __init__(self, base):
        self._base = base

    def __getitem__(self, path):
        # Anything that is not a string starting with a slash cannot be one of
        # our keys. Raising KeyError (rather than letting a TypeError escape
        # for unsubscriptable keys) keeps `in` and `.get()` working.
        if not isinstance(path, str) or not path.startswith("/"):
            raise KeyError(path)
        return self._base[path[1:]]

    def __iter__(self):
        return ("/" + name for name in self._base)

    def __len__(self):
        return len(self._base)