Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add wsgi_autodetect.py to autodetect repositories with low complexity #267

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 30 additions & 24 deletions klaus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
import dulwich.web
import flask
import httpauth
from dulwich.errors import NotGitRepository

from klaus import utils, views
from klaus.repo import FancyRepo, InvalidRepo
from klaus.repo import DefaultRepoContainer

KLAUS_VERSION = utils.guess_git_revision() or "3.0.1"

Expand All @@ -23,20 +22,40 @@ class Klaus(flask.Flask):
"undefined": jinja2.StrictUndefined,
}

def __init__(self, repo_paths, site_name, use_smarthttp, ctags_policy="none"):
def __init__(
self,
repo_paths,
site_name,
use_smarthttp,
ctags_policy="none",
repo_container_factory=None,
):
"""(See `make_app` for parameter descriptions.)"""
self.site_name = site_name
self.use_smarthttp = use_smarthttp
self.ctags_policy = ctags_policy

valid_repos, invalid_repos = self.load_repos(repo_paths)
self.valid_repos = {repo.namespaced_name: repo for repo in valid_repos}
self.invalid_repos = {repo.namespaced_name: repo for repo in invalid_repos}
if repo_container_factory is None:
repo_container_factory = DefaultRepoContainer

self.repo_container = repo_container_factory(repo_paths)

flask.Flask.__init__(self, __name__)

self.setup_routes()

@property
def valid_repos(self):
    """Repositories that the repository container considers valid.

    Delegates to the ``valid`` attribute of ``self.repo_container``.
    """

    return self.repo_container.valid

@property
def invalid_repos(self):
    """Repositories that were declined by the repository container.

    Delegates to the ``invalid`` attribute of ``self.repo_container``.
    """

    return self.repo_container.invalid

def create_jinja_environment(self):
"""Called by Flask.__init__"""
env = super().create_jinja_environment()
Expand Down Expand Up @@ -95,17 +114,6 @@ def should_use_ctags(self, git_repo, git_commit):
else:
raise ValueError("Unknown ctags policy %r" % self.ctags_policy)

def load_repos(self, repo_paths):
    """Open every configured path and sort the results by validity.

    :param repo_paths: Mapping of namespace to an iterable of repository
        paths.
    :return: A ``(valid_repos, invalid_repos)`` tuple of lists. Paths that
        raise ``NotGitRepository`` when opened are wrapped in
        ``InvalidRepo``; all others become ``FancyRepo`` objects.
    """
    valid_repos, invalid_repos = [], []
    for namespace, paths in repo_paths.items():
        for path in paths:
            try:
                repo = FancyRepo(path, namespace)
            except NotGitRepository:
                invalid_repos.append(InvalidRepo(path, namespace))
            else:
                valid_repos.append(repo)
    return valid_repos, invalid_repos


def make_app(
repo_paths,
Expand All @@ -116,6 +124,7 @@ def make_app(
disable_push=False,
unauthenticated_push=False,
ctags_policy="none",
repo_container_factory=None,
):
"""
Returns a WSGI app with all the features (smarthttp, authentication)
Expand Down Expand Up @@ -145,6 +154,8 @@ def make_app(
- 'tags-and-branches': use ctags for revisions that are the HEAD of
a tag or branch
- 'ALL': use ctags for all revisions, may result in high server load!
:param repo_container_factory: A callable that is called with `repo_paths` and
returns a klaus.repo.BaseRepoContainer instance, or None, in which case
klaus.repo.DefaultRepoContainer will be used.
"""
if unauthenticated_push:
if not use_smarthttp:
Expand All @@ -159,25 +170,20 @@ def make_app(
raise ValueError(
"'htdigest_file' set without 'use_smarthttp' or 'require_browser_auth'"
)
if not isinstance(repo_paths, dict):
# If repos is given as a flat list, put all repos under the "no namespace" namespace
repo_paths = {None: repo_paths}

app = Klaus(
repo_paths,
site_name,
use_smarthttp,
ctags_policy,
repo_container_factory,
)
app.wsgi_app = utils.ProxyFix(app.wsgi_app)

if use_smarthttp:
# `path -> Repo` mapping for Dulwich's web support
dulwich_backend = dulwich.server.DictBackend(
{
"/" + namespaced_name: repo
for namespaced_name, repo in app.valid_repos.items()
}
utils.SlashDictProxy(app.valid_repos)
)
# Dulwich takes care of all Git related requests/URLs
# and passes through everything else to klaus
Expand Down
43 changes: 43 additions & 0 deletions klaus/contrib/wsgi_autodetect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import contextlib
import os
import warnings

from .app_args import get_args_from_env, strtobool
from .wsgi_autodetecting import make_autodetecting_app

# Prefer KLAUS_REPOS_ROOT; fall back to the legacy KLAUS_REPOS variable (with a
# deprecation warning) so that existing deployments keep working.
try:
    repos_root = os.environ["KLAUS_REPOS_ROOT"]
except KeyError:
    repos_root = os.environ["KLAUS_REPOS"]
    warnings.warn(
        "use KLAUS_REPOS_ROOT instead of KLAUS_REPOS for the autodetecting apps",
        DeprecationWarning,
    )

args, kwargs = get_args_from_env()
# The first positional argument from the environment is replaced by the
# repositories root directory.
args = (repos_root,) + args[1:]

# Each optional setting is only passed on when its environment variable is set,
# so that make_autodetecting_app() can apply its own defaults otherwise.
with contextlib.suppress(KeyError):
    kwargs["detect_removals"] = bool(strtobool(os.environ["KLAUS_DETECT_REMOVALS"]))

with contextlib.suppress(KeyError):
    kwargs["export_ok_path"] = os.environ["KLAUS_EXPORT_OK_PATH"]

with contextlib.suppress(KeyError):
    # How to deal with repository directories named "foo" and/or "foo.git".
    # This is a list of potential suffixes, with your operating system's
    # directory separator as a separator. Examples:
    #
    # KLAUS_DIRECTORY_SUFFIXES="/.git"
    #     Directories with and without .git are accepted
    #     (the first entry is the empty string). Default.
    #
    # KLAUS_DIRECTORY_SUFFIXES=".git"
    #     Only .git directories are accepted.
    #
    # KLAUS_DIRECTORY_SUFFIXES=""
    #     The .git suffix is not considered.

    kwargs["directory_suffixes"] = os.environ["KLAUS_DIRECTORY_SUFFIXES"].split(os.sep)

application = make_autodetecting_app(*args, **kwargs)
178 changes: 178 additions & 0 deletions klaus/contrib/wsgi_autodetecting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
Alternative take on the "automatically discovered repositories" concept
that requires no threads, polling or inotify. Instead the filesystem is
consulted whenever a repository name is looked up.

Since Path.exists() and Path.iterdir() are fairly quick filesystem
operations, performance should be good for small to medium sites.
FancyRepo() objects are cached.

Repositories are identified by the existence of a

<reponame>/git-daemon-export-ok

file (for compatibility with gitweb). You can customize this path using
the export_ok_path parameter. Setting it to '.' will cause every
subdirectory to be considered a git repository.

For large sites this approach may be hard on the filesystem when listing
repositories, because the process of enumerating the git repositories
causes the git-daemon-export-ok file to be checked in every repository.
This can be mitigated by setting detect_removals to False.
"""

import collections.abc
import functools
import os
import pathlib

import klaus
import klaus.repo

# Directory-entry names that must never be accepted as repository names.
_bad_names = frozenset([os.curdir, os.pardir])
# Characters that may not appear in a repository name. os.altsep is None on
# platforms without an alternative path separator, so it is filtered out
# instead of being stored as a member that can never match a character.
_bad_chars = frozenset(
    char for char in ("\0", os.sep, os.altsep) if char is not None
)
# Suffixes tried when the caller does not configure any: "name" and "name.git".
_default_directory_suffixes = ["", ".git"]


def coalesce(*args):
    """Return the first argument that is not None.

    Falsy values such as ``0``, ``""`` or ``False`` count as set and are
    returned as-is. If every argument is None (or no arguments are given),
    None is returned instead of leaking a ``StopIteration`` from the
    underlying generator.
    """

    return next((arg for arg in args if arg is not None), None)


class AutodetectingRepoDict(collections.abc.Mapping):
    """
    Maintain a virtual read-only dictionary whose contents represent
    the presence of git repositories in the given root directory.

    Keys are repository names (directory names with any configured suffix
    removed); values are cached FancyRepo objects. The filesystem is
    consulted on lookup and on iteration, so the mapping always reflects
    the current directory contents.

    :param root: The path to a directory containing repositories, each
        a direct subdirectory of the root.
    :param namespace: A namespace that will be applied to all detected
        repositories.
    :param detect_removals: Detect if repositories have been removed.
        Defaults to True. Setting it to False can improve performance
        for repository listings in very large sites.
    :param export_ok_path: The filesystem path to check (relative to
        the candidate repository root) to see if it is a valid servable
        git repository. Defaults to 'git-daemon-export-ok'. Set to '.'
        if every directory is known to be a valid repository root.
    :param directory_suffixes: A list of suffixes that your git directories
        may have. The default is ['', '.git'].
    """

    def __init__(
        self,
        root,
        namespace=None,
        detect_removals=None,
        export_ok_path=None,
        directory_suffixes=None,
    ):
        self._root = pathlib.Path(root)
        self._cache = {}
        self._namespace = namespace
        self._detect_removals = coalesce(detect_removals, True)
        self._export_ok_path = coalesce(export_ok_path, "git-daemon-export-ok")
        # Use the keys of a dict in reverse order so that we can create a sort
        # of "poor man's splay tree": the suffixes are always tried in reverse
        # order. If a suffix was matched successfully it is moved to the end by
        # removing and readding it so that it is tried as the first option for
        # the next repository.
        self._suffixes = dict.fromkeys(
            reversed(list(coalesce(directory_suffixes, _default_directory_suffixes)))
        )

    @staticmethod
    def _is_valid_name(name):
        """Return True if *name* may be used as a repository name.

        Rejects non-strings, the empty string, hidden names (leading dot),
        the current/parent directory names and names containing a NUL or
        path separator, so that a lookup can never escape the root directory.
        """
        return (
            isinstance(name, str)
            and bool(name)
            and not name.startswith(".")
            and name not in _bad_names
            and _bad_chars.isdisjoint(name)
        )

    def __getitem__(self, name):
        """Return the FancyRepo for *name*.

        :raises KeyError: if the name is not acceptable or no directory
            ``<root>/<name><suffix>`` containing the export-ok path exists.
        """
        if not self._is_valid_name(name):
            raise KeyError(name)

        if not self._detect_removals:
            # Try returning a cached version first, to avoid filesystem access
            try:
                return self._cache[name]
            except KeyError:
                pass

        # NOTE(review): the reordering below mutates shared state; confirm
        # whether lookups can run concurrently in the deployment.
        for suffix in reversed(self._suffixes):
            # Bare git repositories may have a .git suffix on the directory name:
            path = self._root / (name + suffix)
            if (path / self._export_ok_path).exists():
                # Reorder suffix test order on the assumption that most repos will
                # have the same suffix. Safe because we stop iterating right away.
                del self._suffixes[suffix]
                self._suffixes[suffix] = None
                break
        else:
            # The repository is gone (or never existed): forget any cached object.
            self._cache.pop(name, None)
            raise KeyError(name)

        if self._detect_removals:
            # The filesystem check passed, so a cached object is still valid.
            try:
                return self._cache[name]
            except KeyError:
                pass

        repo = klaus.repo.FancyRepo(str(path), self._namespace)
        self._cache[name] = repo
        return repo

    def __iter__(self):
        """Yield the name of every repository currently below the root.

        Each name is yielded once and only if ``__getitem__`` can resolve
        it: the directory must end in a configured suffix and the stripped
        name must be acceptable.
        """
        # Longest suffix first, so that "foo.git" is reported as "foo" even
        # when the empty suffix is configured as well.
        suffixes = sorted(self._suffixes, key=len, reverse=True)
        seen = set()

        for path in self._root.iterdir():
            entry = path.name
            suffix = next((s for s in suffixes if entry.endswith(s)), None)
            if suffix is None:
                # No configured suffix matches; a lookup could never find it.
                continue

            name = entry.removesuffix(suffix)
            # "foo" and "foo.git" map to the same key; report it only once.
            if name in seen or not self._is_valid_name(name):
                continue

            # The cache is keyed by the stripped name, so compare against that.
            known = not self._detect_removals and name in self._cache
            if known or (path / self._export_ok_path).exists():
                seen.add(name)
                yield name

    def __len__(self):
        """Return the number of repositories found by a full iteration."""
        return sum(1 for _ in self)


class AutodetectingRepoContainer(klaus.repo.BaseRepoContainer):
    """
    Repository container whose ``valid`` mapping is an AutodetectingRepoDict
    rooted at ``repos_root``.

    Any further positional and keyword arguments are handed to
    AutodetectingRepoDict unchanged; see that class for their meaning.
    """

    def __init__(self, repos_root, *args, **kwargs):
        # The base initialiser assigns plain dicts first; ``valid`` is then
        # replaced by the filesystem-backed mapping.
        super().__init__(repos_root)
        detected_repos = AutodetectingRepoDict(repos_root, *args, **kwargs)
        self.valid = detected_repos


def make_autodetecting_app(
    repos_root,
    *args,
    detect_removals=None,
    export_ok_path=None,
    directory_suffixes=None,
    **kwargs,
):
    """Build a klaus app whose repositories are autodetected below *repos_root*.

    ``detect_removals``, ``export_ok_path`` and ``directory_suffixes`` are
    bound into the repository container factory (see AutodetectingRepoDict
    for their meaning). All remaining positional and keyword arguments are
    forwarded to ``klaus.make_app``.
    """
    container_factory = functools.partial(
        AutodetectingRepoContainer,
        detect_removals=detect_removals,
        export_ok_path=export_ok_path,
        directory_suffixes=directory_suffixes,
    )
    return klaus.make_app(
        repos_root,
        *args,
        repo_container_factory=container_factory,
        **kwargs,
    )
31 changes: 30 additions & 1 deletion klaus/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import dulwich
import dulwich.patch
from dulwich.errors import NotTreeError
from dulwich.errors import NotGitRepository, NotTreeError
from dulwich.object_store import tree_lookup_path
from dulwich.objects import S_ISGITLINK, Blob

Expand Down Expand Up @@ -435,3 +435,32 @@ def namespaced_name(self):
return f"~{self.namespace}/{self.name}"
else:
return self.name


class BaseRepoContainer:
    """Common base for repository containers.

    A container exposes two mappings, ``valid`` and ``invalid``; this base
    class initialises both to empty dicts and remembers the paths it was
    given.
    """

    def __init__(self, repo_paths):
        # Subclasses fill in (or replace) these two mappings.
        self.valid, self.invalid = {}, {}
        self._repo_paths = repo_paths


class DefaultRepoContainer(BaseRepoContainer):
    """Repository container for a fixed, preconfigured set of paths.

    Every path is opened once at construction time. Paths that open as git
    repositories are stored in ``valid``, the rest in ``invalid``, both
    keyed by the repository's namespaced name.
    """

    def __init__(self, repo_paths):
        # A flat list of paths is shorthand for "everything lives in the
        # 'no namespace' namespace".
        if isinstance(repo_paths, dict):
            namespaced_paths = repo_paths
        else:
            namespaced_paths = {None: repo_paths}

        super().__init__(namespaced_paths)

        for namespace, paths in namespaced_paths.items():
            for path in paths:
                try:
                    repo = FancyRepo(path, namespace)
                except NotGitRepository:
                    repo = InvalidRepo(path, namespace)
                    bucket = self.invalid
                else:
                    bucket = self.valid
                bucket[repo.namespaced_name] = repo
Loading
Loading