From b865244ea9fdbd0231fffd9f6972bd76a1c135e3 Mon Sep 17 00:00:00 2001 From: rgaudin Date: Mon, 13 Jan 2025 14:47:36 +0000 Subject: [PATCH] Initial seeder version --- .github/workflows/bittorrent-seeder_ci.yaml | 28 ++ bittorrent-seeder/Dockerfile | 40 +++ bittorrent-seeder/README.md | 1 + bittorrent-seeder/entrypoint.sh | 97 ++++++ bittorrent-seeder/gen-password.py | 21 ++ bittorrent-seeder/get-pbkdf2.py | 38 +++ bittorrent-seeder/pyproject.toml | 237 ++++++++++++++ .../src/kiwixseeder/__about__.py | 1 + bittorrent-seeder/src/kiwixseeder/__main__.py | 3 + bittorrent-seeder/src/kiwixseeder/context.py | 185 +++++++++++ bittorrent-seeder/src/kiwixseeder/download.py | 148 +++++++++ .../src/kiwixseeder/entrypoint.py | 305 ++++++++++++++++++ bittorrent-seeder/src/kiwixseeder/library.py | 273 ++++++++++++++++ .../src/kiwixseeder/qbittorrent.py | 136 ++++++++ bittorrent-seeder/src/kiwixseeder/runner.py | 252 +++++++++++++++ bittorrent-seeder/src/kiwixseeder/utils.py | 44 +++ bittorrent-seeder/tasks.py | 110 +++++++ 17 files changed, 1919 insertions(+) create mode 100644 .github/workflows/bittorrent-seeder_ci.yaml create mode 100644 bittorrent-seeder/Dockerfile create mode 100644 bittorrent-seeder/README.md create mode 100755 bittorrent-seeder/entrypoint.sh create mode 100755 bittorrent-seeder/gen-password.py create mode 100755 bittorrent-seeder/get-pbkdf2.py create mode 100644 bittorrent-seeder/pyproject.toml create mode 100644 bittorrent-seeder/src/kiwixseeder/__about__.py create mode 100644 bittorrent-seeder/src/kiwixseeder/__main__.py create mode 100644 bittorrent-seeder/src/kiwixseeder/context.py create mode 100644 bittorrent-seeder/src/kiwixseeder/download.py create mode 100644 bittorrent-seeder/src/kiwixseeder/entrypoint.py create mode 100644 bittorrent-seeder/src/kiwixseeder/library.py create mode 100644 bittorrent-seeder/src/kiwixseeder/qbittorrent.py create mode 100644 bittorrent-seeder/src/kiwixseeder/runner.py create mode 100644 bittorrent-seeder/src/kiwixseeder/utils.py create mode 100644 bittorrent-seeder/tasks.py diff --git a/.github/workflows/bittorrent-seeder_ci.yaml b/.github/workflows/bittorrent-seeder_ci.yaml new file mode 100644 index 00000000..d1c56dc9 --- /dev/null +++ b/.github/workflows/bittorrent-seeder_ci.yaml @@ -0,0 +1,28 @@ +name: BitTorrent Seeder + +on: + push: + branches: + - 'main' + paths: + - 'bittorrent-seeder/**' + workflow_dispatch: + +jobs: + + bittorrent-tracker: + name: Deploy BitTorrent Seeder Image + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3.4.0 + - name: Publish Docker Image + uses: openzim/docker-publish-action@v10 + with: + image-name: kiwix/bittorrent-seeder + on-master: latest + restrict-to: kiwix/container-images + context: bittorrent-seeder + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} diff --git a/bittorrent-seeder/Dockerfile b/bittorrent-seeder/Dockerfile new file mode 100644 index 00000000..2a8a3310 --- /dev/null +++ b/bittorrent-seeder/Dockerfile @@ -0,0 +1,40 @@ +FROM alpine:edge + +ENV SHELL=bash + +RUN set -e \ + # add testing repo for qbittorrent-cli + && echo "https://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories \ + && apk update \ + && apk --no-cache add dumb-init bash \ + && apk --no-cache add python3 py3-pip \ + # the daemon with webui \ + && apk --no-cache add qbittorrent-nox \ + # for convenience + && apk --no-cache add qbittorrent-cli + +ENV NO_DAEMON="" +ENV QBT_TORRENTING_PORT=6901 +ENV QBT_HOST=localhost +ENV 
QBT_PORT=80
ENV QBT_USERNAME=admin
ENV QBT_PASSWORD=


COPY entrypoint.sh /usr/local/bin/entrypoint
COPY pyproject.toml README.md tasks.py /src/
COPY src/ /src/src
COPY gen-password.py /usr/local/bin/gen-password
COPY get-pbkdf2.py /usr/local/bin/get-pbkdf2

RUN set -e \
    && pip install --break-system-packages /src/ \
    && kiwix-seeder --help

EXPOSE 80
EXPOSE 6901
VOLUME /data
WORKDIR /data

ENTRYPOINT ["/usr/bin/dumb-init", "--", "/usr/local/bin/entrypoint"]
CMD ["kiwix-seeder", "--loop"]
diff --git a/bittorrent-seeder/README.md b/bittorrent-seeder/README.md
new file mode 100644
index 00000000..964c08d9
--- /dev/null
+++ b/bittorrent-seeder/README.md
@@ -0,0 +1 @@
## Kiwix Seeder
diff --git a/bittorrent-seeder/entrypoint.sh b/bittorrent-seeder/entrypoint.sh
new file mode 100755
index 00000000..c2cebc2c
--- /dev/null
+++ b/bittorrent-seeder/entrypoint.sh
@@ -0,0 +1,97 @@
#!/bin/sh

function start_qbt {
    echo "Starting a qbittorrent-nox process (set NO_DAEMON if you don't want to)"

    QBT_HOST="${QBT_HOST:-localhost}"
    QBT_PORT="${QBT_PORT:-80}"
    QBT_USERNAME="${QBT_USERNAME:-admin}"
    QBT_TORRENTING_PORT="${QBT_TORRENTING_PORT:-6901}"

    QBT_MAX_CONNECTIONS="${QBT_MAX_CONNECTIONS:-500}"
    QBT_MAX_CONNECTIONS_PER_TORRENT="${QBT_MAX_CONNECTIONS_PER_TORRENT:-100}"
    QBT_MAX_UPLOADS="${QBT_MAX_UPLOADS:-20}"
    QBT_MAX_UPLOADS_PER_TORRENT="${QBT_MAX_UPLOADS_PER_TORRENT:-5}"

    if [ "x${QBT_PASSWORD}" = "x" ]; then
        QBT_PASSWORD=$(gen-password)
        echo "Generated web-ui password: ${QBT_PASSWORD}"
    fi
    PBKDF2_PASSWORD=$(get-pbkdf2 "${QBT_PASSWORD}")

    QBITTORRENT_CONFIG_FILE=/root/.config/qBittorrent/qBittorrent.conf
    mkdir -p $(dirname $QBITTORRENT_CONFIG_FILE)
    cat <<EOF > $QBITTORRENT_CONFIG_FILE
[BitTorrent]
MergeTrackersEnabled=true
Session\DefaultSavePath=/data
Session\AddExtensionToIncompleteFiles=true
Session\MaxConnections=${QBT_MAX_CONNECTIONS}
Session\MaxConnectionsPerTorrent=${QBT_MAX_CONNECTIONS_PER_TORRENT}
Session\MaxUploads=${QBT_MAX_UPLOADS}
Session\MaxUploadsPerTorrent=${QBT_MAX_UPLOADS_PER_TORRENT}
Session\Port=${QBT_TORRENTING_PORT}
Session\Preallocation=true
Session\QueueingSystemEnabled=false
Session\SSL\Port=30154

[LegalNotice]
Accepted=true

[Meta]
MigrationVersion=8

[Preferences]
General\Locale=en
WebUI\Enabled=true
WebUI\Port=${QBT_PORT}
WebUI\Username=${QBT_USERNAME}
WebUI\Password_PBKDF2="@ByteArray(${PBKDF2_PASSWORD})"
WebUI\LocalHostAuth=true
WebUI\HostHeaderValidation=false
WebUI\CSRFProtection=false

[Core]
AutoDeleteAddedTorrentFile=Always

[Application]
FileLogger\Age=5
FileLogger\AgeType=0
FileLogger\Backup=true
FileLogger\DeleteOld=true
FileLogger\Enabled=true
FileLogger\MaxSizeBytes=1048576
FileLogger\Path=/var/log
GUI\Notifications\TorrentAdded=false

EOF

    QBT_CONFIG_FILE=/root/.config/qbt/.qbt.toml
    mkdir -p $(dirname $QBT_CONFIG_FILE)
    cat <<EOF > $QBT_CONFIG_FILE
[qbittorrent]
addr = "http://${QBT_HOST}:${QBT_PORT}" # qbittorrent webui-api hostname/ip
login = "${QBT_USERNAME}" # qbittorrent webui-api user
password = "${QBT_PASSWORD}" # qbittorrent webui-api password
#basicUser = "user" # qbittorrent webui-api basic auth user
#basicPass = "password" # qbittorrent webui-api basic auth password

[rules]
enabled = true # enable or disable rules
max_active_downloads = 2 # set max active downloads
EOF

    mkdir -p ~/.config/qbt && touch ~/.config/qbt/.qbt.toml


    # qbittorrent-nox --help
    qbittorrent-nox --save-path=/data --daemon
    rc=$?
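
    # Illustrative readiness check (a sketch, not wired in): qbittorrent-nox may
    # need a moment before the Web UI answers. With curl (not installed in this
    # image) one could poll the WebUI API v2 login endpoint until it responds:
    #
    #   until curl -fs --data "username=${QBT_USERNAME}&password=${QBT_PASSWORD}" \
    #       "http://localhost:${QBT_PORT}/api/v2/auth/login" >/dev/null; do
    #       sleep 1
    #   done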
+ echo "rc=${rc}" +} + +if [ "x${NO_DAEMON}" = "x" ]; then + start_qbt +fi + +exec "$@" diff --git a/bittorrent-seeder/gen-password.py b/bittorrent-seeder/gen-password.py new file mode 100755 index 00000000..e8f0c0cb --- /dev/null +++ b/bittorrent-seeder/gen-password.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +"""command line to generate a short (8chars) password from alphanum""" + +import secrets +import string +import sys + + +def gen_password() -> str: + alphabet = string.ascii_letters + string.digits + return "".join(secrets.choice(alphabet) for _ in range(8)) + + +def main() -> int: + print(gen_password()) # noqa: T201 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bittorrent-seeder/get-pbkdf2.py b/bittorrent-seeder/get-pbkdf2.py new file mode 100755 index 00000000..06d3421c --- /dev/null +++ b/bittorrent-seeder/get-pbkdf2.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +""" command line to generate a base64-encoded PBKDF2 version of a password + + similar to how qBittorrent stores web-ui password in its config file. + Format is salt:HMAC""" + +import base64 +import hashlib +import secrets +import sys + + +def asb64(data: bytes) -> str: + return base64.b64encode(data).decode("ASCII") + + +def get_pbkdf2_for(password: str) -> str: + salt = secrets.token_bytes(16) + hmac = hashlib.pbkdf2_hmac( + hash_name="sha512", + password=password.encode("UTF-8"), + salt=salt, + iterations=100000, + ) + return f"{asb64(salt)}:{asb64(hmac)}" + + +def main() -> int: + if len(sys.argv) != 2: # noqa: PLR2004 + print(f"Usage: {sys.argv[0]} CLEAR_PASSWORD") # noqa: T201 + return 1 + print(get_pbkdf2_for(sys.argv[1])) # noqa: T201 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bittorrent-seeder/pyproject.toml b/bittorrent-seeder/pyproject.toml new file mode 100644 index 00000000..9585cf98 --- /dev/null +++ b/bittorrent-seeder/pyproject.toml @@ -0,0 +1,237 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "kiwix-seeder" +requires-python = ">=3.12" +description = "Bittorrent seeder for Kiwix ZIM Catalog" +readme = "README.md" +dependencies = [ + "requests==2.32.3", + "humanfriendly==10.0", + "iso639-lang==2.5.1", + "xmltodict==0.14.2", + "qbittorrent-api==2024.12.71", + "rich-argparse==1.6.0", +] +authors = [ + { name = "Kiwix", email = "dev@kiwix.org" }, +] +keywords = ["offspot"] +license = {text = "GPL-3.0-or-later"} +classifiers = [ + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Operating System :: POSIX", +] +dynamic = ["version"] + +[project.optional-dependencies] +scripts = [ + "invoke==2.2.0", +] +lint = [ + "black==24.10.0", + "ruff==0.8.4", +] +check = [ + "pyright==1.1.391", +] +test = [ + "pytest==8.3.4", + "coverage==7.6.10", +] +dev = [ + "pre-commit==4.0.1", + "ipython==8.31.0", + "kiwix-seeder[scripts]", + "kiwix-seeder[lint]", + "kiwix-seeder[test]", + "kiwix-seeder[check]", +] + +[project.scripts] +kiwix-seeder = "kiwixseeder.entrypoint:entrypoint" + +[tool.hatch.version] +path = "src/kiwixseeder/__about__.py" + +[tool.hatch.build] +exclude = [ + "/.github", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/kiwixseeder"] + +[tool.hatch.envs.default] +features = ["dev"] + +[tool.hatch.envs.test] +features = ["scripts", "test"] + +[[tool.hatch.envs.test.matrix]] +python = ["3.12", "3.13"] + +[tool.hatch.envs.test.scripts] +run = "inv test --args '{args}'" 
+run-cov = "inv test-cov --args '{args}'" +report-cov = "inv report-cov" +coverage = "inv coverage --args '{args}'" +html = "inv coverage --html --args '{args}'" + +[tool.hatch.envs.lint] +template = "lint" +skip-install = false +features = ["scripts", "lint"] + +[tool.hatch.envs.lint.scripts] +black = "inv lint-black --args '{args}'" +ruff = "inv lint-ruff --args '{args}'" +all = "inv lintall --args '{args}'" +fix-black = "inv fix-black --args '{args}'" +fix-ruff = "inv fix-ruff --args '{args}'" +fixall = "inv fixall --args '{args}'" + +[tool.hatch.envs.check] +features = ["scripts", "check"] + +[tool.hatch.envs.check.scripts] +pyright = "inv check-pyright --args '{args}'" +all = "inv checkall --args '{args}'" + +[tool.black] +line-length = 88 +target-version = ['py311'] + +[tool.ruff] +target-version = "py312" +line-length = 88 +src = ["src"] + +[tool.ruff.lint] +select = [ + "A", # flake8-builtins + # "ANN", # flake8-annotations + "ARG", # flake8-unused-arguments + # "ASYNC", # flake8-async + "B", # flake8-bugbear + # "BLE", # flake8-blind-except + "C4", # flake8-comprehensions + "C90", # mccabe + # "COM", # flake8-commas + # "D", # pydocstyle + # "DJ", # flake8-django + "DTZ", # flake8-datetimez + "E", # pycodestyle (default) + "EM", # flake8-errmsg + # "ERA", # eradicate + # "EXE", # flake8-executable + "F", # Pyflakes (default) + # "FA", # flake8-future-annotations + "FBT", # flake8-boolean-trap + # "FLY", # flynt + # "G", # flake8-logging-format + "I", # isort + "ICN", # flake8-import-conventions + # "INP", # flake8-no-pep420 + # "INT", # flake8-gettext + "ISC", # flake8-implicit-str-concat + "N", # pep8-naming + # "NPY", # NumPy-specific rules + # "PD", # pandas-vet + # "PGH", # pygrep-hooks + # "PIE", # flake8-pie + # "PL", # Pylint + "PLC", # Pylint: Convention + "PLE", # Pylint: Error + "PLR", # Pylint: Refactor + "PLW", # Pylint: Warning + # "PT", # flake8-pytest-style + # "PTH", # flake8-use-pathlib + # "PYI", # flake8-pyi + "Q", # flake8-quotes + # "RET", # flake8-return + # "RSE", # flake8-raise + "RUF", # Ruff-specific rules + "S", # flake8-bandit + # "SIM", # flake8-simplify + # "SLF", # flake8-self + "T10", # flake8-debugger + "T20", # flake8-print + # "TCH", # flake8-type-checking + # "TD", # flake8-todos + "TID", # flake8-tidy-imports + # "TRY", # tryceratops + "UP", # pyupgrade + "W", # pycodestyle + "YTT", # flake8-2020 +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Remove flake8-errmsg since we consider they bloat the code and provide limited value + "EM", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore warnings on subprocess.run / popen + "S603", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", + # implicit string concatenation + "ISC001", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.lint.isort] +known-first-party = ["kiwixseeder"] + +[tool.ruff.lint.flake8-bugbear] +# add exceptions to B008 for fastapi. 
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"

[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]

[tool.pytest.ini_options]
minversion = "7.3"
testpaths = ["tests"]
pythonpath = [".", "src"]

[tool.coverage.paths]
kiwixseeder = ["src/kiwixseeder"]
tests = ["tests"]

[tool.coverage.run]
source_pkgs = ["kiwixseeder"]
branch = true
parallel = true
omit = [
  "src/kiwixseeder/__about__.py",
]

[tool.coverage.report]
exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
]

[tool.pyright]
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.12"
typeCheckingMode="strict"
disableBytesTypePromotions = true
diff --git a/bittorrent-seeder/src/kiwixseeder/__about__.py b/bittorrent-seeder/src/kiwixseeder/__about__.py
new file mode 100644
index 00000000..5becc17c
--- /dev/null
+++ b/bittorrent-seeder/src/kiwixseeder/__about__.py
@@ -0,0 +1 @@
__version__ = "1.0.0"
diff --git a/bittorrent-seeder/src/kiwixseeder/__main__.py b/bittorrent-seeder/src/kiwixseeder/__main__.py
new file mode 100644
index 00000000..91da0eaa
--- /dev/null
+++ b/bittorrent-seeder/src/kiwixseeder/__main__.py
@@ -0,0 +1,3 @@
from kiwixseeder.entrypoint import entrypoint

entrypoint()
diff --git a/bittorrent-seeder/src/kiwixseeder/context.py b/bittorrent-seeder/src/kiwixseeder/context.py
new file mode 100644
index 00000000..e73404a9
--- /dev/null
+++ b/bittorrent-seeder/src/kiwixseeder/context.py
@@ -0,0 +1,185 @@
import logging
import os
import platform
from dataclasses import dataclass, field
from typing import Any, Self
from urllib.parse import ParseResult, urlparse

import humanfriendly
import qbittorrentapi

NAME = "kiwix-seeder"  # must be filesystem-friendly (technical)
CLI_NAME = "kiwix-seeder"
HUMAN_NAME = "Kiwix Seeder"
QBT_CAT_NAME = "kiwix-seeder"  # name of category to group our torrents in

CATALOG_URL = os.getenv("CATALOG_URL", "https://library.kiwix.org/catalog/v2")
DOWNLOAD_URL = os.getenv("DOWNLOAD_URL", "https://download.kiwix.org")
DEFAULT_QBT_USERNAME: str | None = os.getenv("QBT_USERNAME")
DEFAULT_QBT_PASSWORD: str | None = os.getenv("QBT_PASSWORD")
DEFAULT_QBT_HOST: str = os.getenv("QBT_HOST") or "localhost"
DEFAULT_QBT_PORT: int = int(os.getenv("QBT_PORT") or "8080")
DEFAULT_MAX_STORAGE: int = humanfriendly.parse_size(os.getenv("MAX_STORAGE") or "10GiB")
DEFAULT_KEEP_DURATION: float = humanfriendly.parse_timespan(
    os.getenv("KEEP_DURATION") or "12w"
)
DEFAULT_SLEEP_INTERVAL: float = humanfriendly.parse_timespan(
    os.getenv("SLEEP_INTERVAL") or "5m"
)

DEFAULT_DB_PATH = os.getenv(
    "KIWIXSEEDER_DB",
)


# avoid debug-level logs of 3rd party deps
for module in ("urllib3", "qbittorrentapi.request"):
    logging.getLogger(module).setLevel(logging.INFO)


def format_size(value: int) -> str:
    """human friendly representation of size (in binary form)"""
    return humanfriendly.format_size(value, binary=True)


@dataclass(kw_only=True)
class QbtConnection:
    username: str | None
    password: str | None
    host: str
    port: int

    @classmethod
    def using(cls, string: str) -> Self:
        uri = urlparse(string)
        if uri.scheme != "qbt":
            raise ValueError(f"Malformed qbt:// URI: {string}")
        return cls(
            username=uri.username,
            password=uri.password,
            host=uri.hostname or "localhost",
            port=uri.port or 80,
        )

    def __str__(self) -> str:
        return ParseResult(
            scheme="qbt",
            netloc=f"{self.username or ''}"
            f"{':' if self.password else ''}{self.password or ''}"
            f"@{self.host}:{self.port}",
            path="",
            params="",
            query="",
            fragment="",
        ).geturl()


DEFAULT_QBT_CONN = str(
    QbtConnection.using(str(os.getenv("QBT_URI")))
    if os.getenv("QBT_URI")
    else QbtConnection(
        username=DEFAULT_QBT_USERNAME,
        password=DEFAULT_QBT_PASSWORD,
        host=DEFAULT_QBT_HOST,
        port=DEFAULT_QBT_PORT,
    )
)


@dataclass(kw_only=True)
class SizeRange:
    minimum: int = -1
    maximum: int = -1

    def is_valid(self) -> bool:
        if self.minimum == self.maximum == -1:
            return True
        # maximum is either not set or positive
        if self.maximum != -1:
            return max(self.maximum, 0) >= max(self.minimum, 0)
        return True

    def is_above_min(self, value: int) -> bool:
        return value >= max(self.minimum, 0)

    def is_below_max(self, value: int) -> bool:
        if self.maximum == -1:
            return True
        return value <= self.maximum

    def match(self, value: int) -> bool:
        # not valid, not matching.
        if not self.is_valid():
            return False
        # no bound, always OK
        if self.minimum == self.maximum == -1:
            return True
        return self.is_above_min(value) and self.is_below_max(value)

    def __str__(self) -> str:
        if not self.is_valid():
            return f"Invalid range: min={self.minimum}, max={self.maximum}"
        if self.minimum == self.maximum == -1:
            return "all"
        if self.minimum == self.maximum:
            return f"exactly {format_size(self.maximum)}"
        if self.minimum == -1:
            return f"below {format_size(self.maximum)}"
        if self.maximum == -1:
            return f"above {format_size(self.minimum)}"
        return f"between {format_size(self.minimum)} and {format_size(self.maximum)}"


@dataclass(kw_only=True)
class Context:

    # singleton instance
    _instance: "Context | None" = None

    # debug flag
    debug: bool = False

    run_forever: bool = False
    sleep_interval: float = DEFAULT_SLEEP_INTERVAL

    is_mac: bool = platform.system() == "Darwin"
    is_win: bool = platform.system() == "Windows"
    is_nix: bool = platform.system() not in ("Darwin", "Windows")

    catalog_url: str = CATALOG_URL
    download_url: str = DOWNLOAD_URL
    qbt: qbittorrentapi.Client

    filenames: set[str] = field(default_factory=set)
    languages: set[str] = field(default_factory=set)
    categories: set[str] = field(default_factory=set)
    flavours: set[str] = field(default_factory=set)
    tags: set[str] = field(default_factory=set)
    authors: set[str] = field(default_factory=set)
    publishers: set[str] = field(default_factory=set)
    filesizes: SizeRange = field(default_factory=SizeRange)
    max_storage: int = DEFAULT_MAX_STORAGE
    keep_for: float = DEFAULT_KEEP_DURATION
    all_good: bool = False

    logger: logging.Logger = logging.getLogger(NAME)  # noqa: RUF009
    max_direct_online_resource_payload_size: int = 2048

    @classmethod
    def setup(cls, **kwargs: Any):
        if cls._instance:
            raise OSError("Already inited Context")
        cls._instance = cls(**kwargs)
        cls._instance.logger.setLevel(
            logging.DEBUG if cls._instance.debug else logging.INFO
        )
        logging.basicConfig(
            level=logging.DEBUG if cls._instance.debug else logging.INFO,
            format="%(asctime)s %(levelname)s | %(message)s",
        )

    @classmethod
    def get(cls) -> "Context":
        if not cls._instance:
            raise OSError("Uninitialized context")  # pragma: no cover
        return cls._instance
diff --git a/bittorrent-seeder/src/kiwixseeder/download.py b/bittorrent-seeder/src/kiwixseeder/download.py
new
file mode 100644 index 00000000..c6397c31 --- /dev/null +++ b/bittorrent-seeder/src/kiwixseeder/download.py @@ -0,0 +1,148 @@ +import io +import re +from http import HTTPStatus +from pathlib import Path +from urllib.parse import urlparse + +import requests +import requests.adapters +from urllib3.util.retry import Retry + +from kiwixseeder.context import Context + +session = requests.Session() +# basic urllib retry mechanism. +# Sleep (seconds): {backoff factor} * (2 ** ({number of total retries} - 1)) +# https://docs.descarteslabs.com/_modules/urllib3/util/retry.html +retries = Retry( + total=10, # Total number of retries to allow. Takes precedence over other counts. + connect=5, # How many connection-related errors to retry on + read=5, # How many times to retry on read errors + redirect=20, # How many redirects to perform. (to avoid infinite redirect loops) + status=3, # How many times to retry on bad status codes + other=0, # How many times to retry on other errors + allowed_methods=None, # Set of HTTP verbs that we should retry on (False is all) + status_forcelist=[ + 413, + 429, + 500, + 502, + 503, + 504, + ], # Set of integer HTTP status we should force a retry on + backoff_factor=30, # backoff factor to apply between attempts after the second try, + backoff_max=1800.0, # allow up-to 30mn backoff (default 2mn) + raise_on_redirect=False, # raise MaxRetryError instead of 3xx response + raise_on_status=False, # raise on Bad Status or response + respect_retry_after_header=True, # respect Retry-After header (status_forcelist) +) +session.mount("http", requests.adapters.HTTPAdapter(max_retries=retries)) + + +def get_online_rsc_size(url: str) -> int: + """size (Content-Length) from url if specified, -1 otherwise (-2 on errors)""" + try: + resp = session.head(url, allow_redirects=True, timeout=60) + # some servers dont offer HEAD + if resp.status_code != HTTPStatus.OK: + resp = requests.get( + url, + allow_redirects=True, + timeout=60, + stream=True, + headers={"Accept-Encoding": "identity"}, + ) + resp.raise_for_status() + return int(resp.headers.get("Content-Length") or -1) + except Exception: + return -2 + + +def url_is_working(url: str) -> bool: + """whether URL currently returns HTTP 200. Use to rule out 404 quickly""" + resp = session.get(url, allow_redirects=True, timeout=60, stream=True) + return resp.status_code == HTTPStatus.OK + + +def get_payload_from( + url: str, no_more_than: int = Context.max_direct_online_resource_payload_size +) -> bytes: + """Retrieved content from an URL + + Limited in order to prevent download bomb. 
+ + Parameters: + url: URL to retrieve payload from (follows redirects) + no_more_than: number of bytes to consider too much and fail at + + Raises: + OSError: Should declared or retrieved size exceed no_more_than + RequestException: HTTP or other error in requests + ConnectionError: connection issues + Timeout: ReadTimeout or request timeout""" + size = get_online_rsc_size(url) + if no_more_than and size > no_more_than: + raise OSError(f"URL content is larger than {no_more_than!s}") + + resp = session.get(url, stream=True, allow_redirects=True, timeout=60) + resp.raise_for_status() + downloaded = 0 + payload = io.BytesIO() + for data in resp.iter_content(2**30): + downloaded += len(data) + if no_more_than and downloaded > no_more_than: + raise OSError(f"URL content is larger than {no_more_than!s}") + payload.write(data) + payload.seek(0) + return payload.getvalue() + + +def read_mirrorbrain_hash_from(url: str) -> str: + """hashes from mirrorbrain-like (or raw) URL (checksums, btih) + + Format can be the raw digest or digest and filename: + 9e92449ce93115e8d85e29e8e584dece wikipedia_ab_all_maxi_2024-02.zim + + Parameters: + url: URL to read from. eg: download.kiwix.org/x/y/z.zim.sha1 + + Raises: + OSError: Should declared or retrieved size exceed no_more_than + RequestException: HTTP or other error in requests + ConnectionError: connection issues + Timeout: ReadTimeout or request timeout + UnicodeDecodeError: content cannot be decoded into ASCII + UnicodeEncodeError: content cannot be encoded into UTF-8 + IndexError: content is empty or malformed + """ + return ( + get_payload_from(url, no_more_than=2 * 2**10) + .decode("UTF-8") + .strip() + .split(maxsplit=1)[0] + .encode("UTF-8") + .decode("ASCII") + ) + + +def get_btih_from_url(url: str) -> str: + uri = urlparse(url) + if uri.netloc != urlparse(Context.download_url).netloc: + raise ValueError(f"btih from URL is reserved to {Context.download_url}") + if not uri.path.endswith(".torrent"): + raise ValueError( + f"btih from URL is only for {Context.download_url}'s .torrent endpoint" + ) + # btih is 40-len but endpoint sends filename as well + return read_mirrorbrain_hash_from(re.sub(r".torrent$", r".btih", url)) + + +def get_pathname_from_url(url: str) -> Path: + uri = urlparse(url) + if uri.netloc != urlparse(Context.download_url).netloc: + raise ValueError(f"path from URL is reserved to {Context.download_url}") + if not uri.path.endswith(".torrent"): + raise ValueError( + f"path from URL is only for {Context.download_url}'s .torrent endpoint" + ) + return Path(re.sub(r".torrent$", r"", uri.path)) diff --git a/bittorrent-seeder/src/kiwixseeder/entrypoint.py b/bittorrent-seeder/src/kiwixseeder/entrypoint.py new file mode 100644 index 00000000..b0fae345 --- /dev/null +++ b/bittorrent-seeder/src/kiwixseeder/entrypoint.py @@ -0,0 +1,305 @@ +import argparse +import signal +import sys +from types import FrameType + +import humanfriendly +import qbittorrentapi +from rich.markdown import ( # pyright: ignore [reportMissingImports] + Markdown, # pyright: ignore [reportUnknownVariableType] +) +from rich_argparse import ( # pyright: ignore [reportMissingImports] + RichHelpFormatter, # pyright: ignore [reportUnknownVariableType] +) + +from kiwixseeder.__about__ import __version__ +from kiwixseeder.context import ( + CLI_NAME, + DEFAULT_KEEP_DURATION, + DEFAULT_MAX_STORAGE, + DEFAULT_QBT_CONN, + DEFAULT_SLEEP_INTERVAL, + Context, + QbtConnection, + SizeRange, +) +from kiwixseeder.utils import format_duration, format_size + +logger = Context.logger + 
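
# Illustrative CLI usage (a sketch; the values are placeholders, the flags and
# environment variables are the ones defined in this module and in context.py):
#   QBT_HOST=localhost QBT_PORT=8080 QBT_USERNAME=admin QBT_PASSWORD=secret \
#       kiwix-seeder --lang eng --flavour maxi --max-storage 500GiB --loop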
+epilog = """ +--- +## Keeping ZIMs + +`--keep` allows you to keep seeding ZIMs after those drop out of the Catalog. +The Catalog only exposes the latest version of a *Title*. With many *Titles* being \ +redone monthly.\n +The webseeds (HTTP mirrors) only keep at most 2 versions of a *Title*. +This option expects a + +## Glob-patterns + +- filename-patterns are absolute (root is `/`) +- root starts below zim folder (so `/zim` for instance) +- `**` means any directory tree +- `*` Matches any number of non-separator characters, including zero. +- `[seq]` Matches one character in seq. +- `[!seq]` Matches one character not in seq. + +Sample requests: + +- `wikipedia/*`: All in wikipedia folder +- `wikipedia_fr_*` All wikipedia with `fr` lang (See --lang as well) +- `*_maxi_*` All maxi ones +- `wikipedia_fr_all_nopic_*` specific + +See https://docs.python.org/3/library/pathlib.html#pattern-language +""" + + +def prepare_context(raw_args: list[str]) -> None: + parser = argparse.ArgumentParser( + prog=CLI_NAME, + description="Automates a qBitottorrent instance to seed " + "all or part of the Kiwix Catalog", + formatter_class=RichHelpFormatter, # pyright: ignore [reportUnknownArgumentType] + epilog=Markdown( + epilog, style="argparse.text" + ), # pyright: ignore [reportUnknownArgumentType] + ) + + parser.add_argument( + "--loop", + dest="run_forever", + help="Run forever sleeping, " + f"executing every {format_duration(DEFAULT_SLEEP_INTERVAL)}", + action="store_true", + ) + + parser.add_argument( + "--qbt", + dest="qbt_uri", + help="qBittorrent connection string. " + "Format: qbt://{user}:{password}@{host}:{port}. " + "Can be set via QBT_URI or parts via QBT_USER, QBT_PASSWORD, " + f"QBT_HOST and QBT_PORT. Defaults to {DEFAULT_QBT_CONN}", + type=str, + default=DEFAULT_QBT_CONN, + required=False, + ) + + parser.add_argument( + "--version", + help="Display scraper version and exit", + action="version", + version=__version__, + ) + + parser.add_argument( + "--debug", + dest="debug", + help="Enable debug-level logs", + action="store_true", + ) + + parser.add_argument( + "--all-good", + dest="all_good", + action="store_true", + help="Continue even if your filters " + "didnt filter anything out (thousands of torrents)", + required=False, + ) + + parser.add_argument( + "--filename", + dest="filenames", + action="append", + help="Only seed ZIMs matching this folder/filename pattern.\n" + "Can be used multiple times.\n" + "glob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--lang", + dest="languages", + action="append", + help="Only seed ZIMs for this ISO-639-3 language code.\n" + "Can be used multiple times.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--category", + dest="categories", + action="append", + help="Only seed ZIMs in this category.\nCan be used multiple times." + "\nglob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--flavour", + dest="flavours", + action="append", + choices=["nodet", "mini", "nopic", "maxi"], + help="Only seed ZIMs of this flavour.\nCan be used multiple times." + "\nglob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--tags", + dest="tags", + action="append", + help="Only seed ZIMs with this tag.\nCan be used multiple times." 
+ "\nglob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--author", + dest="authors", + action="append", + help="Only seed ZIMs created by this one.\nCan be used multiple times." + "\nglob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--publisher", + dest="publishers", + action="append", + help="Only seed ZIMs published by this one.\nCan be used multiple times." + "\nglob-pattern accepted.", + type=str, + default=[], + required=False, + ) + + parser.add_argument( + "--min-file-size", + dest="min_size", + help="Only seed ZIMs at least this size. Input is parsed for suffix", + type=str, + default=None, + required=False, + ) + + parser.add_argument( + "--max-file-size", + dest="max_size", + help="Only seed ZIMs at most this size. Input is parsed for suffix", + type=str, + default=None, + required=False, + ) + + parser.add_argument( + "--max-storage", + dest="max_storage", + help="Overall seeder storage. " + "Removes older torrents if new ones require additional disk space. " + f"Defaults to {format_size(DEFAULT_MAX_STORAGE)}", + type=str, + default=format_size(DEFAULT_MAX_STORAGE), + required=False, + ) + + parser.add_argument( + "--keep", + dest="keep_for", + help="Duration for which to keep an already-added torrent " + "once it dropped out of the Catalog. Duration is computed from added date. " + "Use duration prefixes (d for days, w for weeks, y for years). " + f"Defaults to {format_duration(DEFAULT_KEEP_DURATION)}", + type=str, + default=format_duration(DEFAULT_KEEP_DURATION), + required=False, + ) + + args = parser.parse_args(raw_args) + + # ignore unset values in order to not override Context defaults + args_dict = {key: value for key, value in args._get_kwargs() if value} + + # de-dup list of strings and cast to set + for key in ("folder_prefixes", "categories", "tags", "scrapers"): + if key in args_dict: + args_dict[key] = set(args_dict[key]) + + # size-range + min_size: int = ( + -1 if args.min_size is None else humanfriendly.parse_size(args.min_size) + ) + max_size: int = ( + -1 if args.max_size is None else humanfriendly.parse_size(args.max_size) + ) + args_dict.update({"filesizes": SizeRange(minimum=min_size, maximum=max_size)}) + for key in ("min_size", "max_size"): + if key in args_dict: + del args_dict[key] + + # storage + args_dict["max_storage"] = humanfriendly.parse_size(args_dict["max_storage"]) + + # keep duration + args_dict["keep_for"] = humanfriendly.parse_timespan(args_dict["keep_for"]) + + # qbittorrent client + conn = QbtConnection.using(args_dict["qbt_uri"]) + args_dict["qbt"] = qbittorrentapi.Client( + host=conn.host, + port=conn.port, + username=conn.username, + password=conn.password, + ) + del args_dict["qbt_uri"] + + Context.setup(**args_dict) + + +def main() -> int: + try: + prepare_context(sys.argv[1:]) + # late import as to have an initialized Context + from kiwixseeder.context import Context + from kiwixseeder.runner import Runner + + runner = Runner() + + def exit_gracefully(signum: int, frame: FrameType | None): # noqa: ARG001 + logger.info(f"Received {signal.Signals(signum).name}/{signum}. 
Exiting") + runner.stop() + + signal.signal(signal.SIGTERM, exit_gracefully) + signal.signal(signal.SIGINT, exit_gracefully) + signal.signal(signal.SIGQUIT, exit_gracefully) + + if Context.get().run_forever: + runner.run_forever() + else: + runner.run() + except Exception as exc: + logger.error(f"General failure: {exc!s}") + logger.exception(exc) + return 1 + + return 0 + + +def entrypoint(): + sys.exit(main()) diff --git a/bittorrent-seeder/src/kiwixseeder/library.py b/bittorrent-seeder/src/kiwixseeder/library.py new file mode 100644 index 00000000..bae3ef7f --- /dev/null +++ b/bittorrent-seeder/src/kiwixseeder/library.py @@ -0,0 +1,273 @@ +import collections +import datetime +import os +import re +import urllib.parse +from collections.abc import Generator +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +import iso639 +import requests +import xmltodict +from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue + +from kiwixseeder.context import Context +from kiwixseeder.download import get_btih_from_url +from kiwixseeder.utils import format_size + +UPDATE_EVERY_SECONDS: int = int(os.getenv("UPDATE_EVERY_SECONDS", "3600")) + +context = Context.get() +logger = context.logger + + +def to_human_id(name: str, publisher: str | None = "", flavour: str | None = "") -> str: + """periodless exchange identifier for ZIM Title""" + publisher = publisher or "openZIM" + flavour = flavour or "" + return f"{publisher}:{name}:{flavour}" + + +@dataclass(kw_only=True) +class Book: + ident: str + name: str + title: str + description: str + author: str + publisher: str + langs_iso639_1: list[str] = field(default_factory=list) + langs_iso639_3: list[str] + tags: list[str] + flavour: str + size: int + url: str + illustration_relpath: str + version: str + last_seen_on: datetime.datetime + _btih: str + + def __post_init__(self): + for lang in list(self.langs_iso639_3): + value: str = lang + try: + value = iso639.Lang(lang).pt1 + # skip language if code is invalid or deprecated + except ( + DeprecatedLanguageValue, + InvalidLanguageValue, + ): + self.langs_iso639_3.remove(lang) + continue + self.langs_iso639_1.append(value) + + # fallback to eng if no valid code was supplied + if not self.langs_iso639_3: + self.langs_iso639_3.append("eng") + if not self.langs_iso639_1: + self.langs_iso639_1.append("en") + + @property + def category(self) -> str: + try: + return next( + tag.split(":", 1)[1] + for tag in self.tags + if tag.startswith("_category:") + ) + except StopIteration: + return "" + + @property + def filepath(self) -> Path: + return Path(urllib.parse.urlparse(self.url).path) + + @property + def filename(self) -> str: + return Path(urllib.parse.urlparse(self.url).path).name + + @property + def torrent_url(self) -> str: + return f"{self.url}.torrent" + + @property + def lang_codes(self) -> list[str]: + return self.langs_iso639_3 + + @property + def lang_code(self) -> str: + return self.langs_iso639_3[0] + + @property + def language(self) -> iso639.Lang: + return iso639.Lang(self.lang_code) + + @property + def btih(self) -> str: + if not self._btih: + self._btih = get_btih_from_url(self.torrent_url) + return self._btih + + @btih.setter + def btih(self, value: str): + self._btih = value + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + def __str__(self) -> str: + return ( + f"{self.ident} @ {self.version} " # noqa: RUF001 + f"({format_size(self.size)})" + ) + + +class Catalog: + def __init__(self): + # list of Book by ident + 
self._books: dict[str, Book] = {} + # list of book-idents by language (ISO-639-1) + self._by_langs: dict[str, list[str]] = {} + self.updated_on: datetime.datetime = datetime.datetime( + 1970, 1, 1, tzinfo=datetime.UTC + ) + + def __contains__(self, ident: str) -> bool: + return ident in self.get_all_ids() + + @property + def all_books(self) -> Generator[Book, None, None]: + self.ensure_fresh() + yield from self._books.values() + + @property + def nb_books(self) -> int: + self.ensure_fresh() + return len(self._books) + + @property + def languages(self) -> collections.OrderedDict[str, str]: + overrides = { + "ina": "Interlingua", + } + return collections.OrderedDict( + sorted( + [ + ( + lang, + overrides.get(lang, iso639.Lang(lang).name), + ) + for lang in self._by_langs.keys() + ], + key=lambda x: x[1], + ) + ) + + def get(self, ident: str) -> Book: + self.ensure_fresh() + return self._books[ident] + + def get_or_none(self, ident: str) -> Book | None: + self.ensure_fresh() + return self._books.get(ident) + + def get_all_ids(self) -> Generator[str, None, None]: + self.ensure_fresh() + yield from self._books.keys() + + def get_for_lang(self, lang_code: str) -> Generator[Book, str, None]: + self.ensure_fresh() + for ident in self._by_langs.get(lang_code, []): + yield self.get(ident) + + def reset(self): + self._books.clear() + self._by_langs.clear() + self.updated_on: datetime.datetime = datetime.datetime( + 1970, 1, 1, tzinfo=datetime.UTC + ) + + def ensure_fresh(self): + # only refresh library if it reachs a configured age + if self.updated_on < datetime.datetime.now(datetime.UTC) - datetime.timedelta( + seconds=UPDATE_EVERY_SECONDS + ): + self.do_refresh() + + def do_refresh(self): + logger.debug(f"refreshing catalog via {context.catalog_url}") + books: dict[str, Book] = {} + langs: dict[str, list[str]] = {} + try: + resp = requests.get( + f"{context.catalog_url}/entries", params={"count": "-1"}, timeout=30 + ) + resp.raise_for_status() + fetched_on = datetime.datetime.now(datetime.UTC) + catalog = xmltodict.parse(resp.content) + if "feed" not in catalog: + raise ValueError("Malformed OPDS response") + if not int(catalog["feed"]["totalResults"]): + raise OSError("Catalog has no entry; probably misbehaving") + for entry in catalog["feed"]["entry"]: + if not entry.get("name"): + logger.warning(f"Skipping entry without name: {entry}") + continue + + links = {link["@type"]: link for link in entry["link"]} + version = datetime.datetime.fromisoformat( + re.sub(r"[A-Z]$", "", entry["updated"]) + ).strftime("%Y-%m-%d") + flavour = entry.get("flavour") or "" + publisher = entry.get("publisher", {}).get("name") or "" + author = entry.get("author", {}).get("name") or "" + ident = to_human_id( + name=entry["name"], + publisher=publisher, + flavour=flavour, + ) + if not links.get("image/png;width=48;height=48;scale=1"): + logger.warning(f"Book has no illustration: {ident}") + books[ident] = Book( + ident=ident, + name=entry["name"], + title=entry["title"], + author=author, + publisher=publisher, + description=entry["summary"], + langs_iso639_3=list(set(entry["language"].split(","))) or ["eng"], + tags=list(set(entry["tags"].split(";"))), + flavour=flavour, + size=int(links["application/x-zim"]["@length"]), + url=re.sub(r".meta4$", "", links["application/x-zim"]["@href"]), + illustration_relpath=links.get( + "image/png;width=48;height=48;scale=1", {} + ).get("@href", ""), + version=version, + last_seen_on=fetched_on, + _btih="", + ) + except Exception as exc: + logger.error(f"Unable to load catalog from 
OPDS: {exc}") + # only fail refresh if we have no previous catalog to use + if not self._books: + raise exc + else: + # re-order alphabetically by language then title + books = collections.OrderedDict( + sorted( + ((ident, book) for ident, book in books.items()), + key=lambda item: (item[1].language.name, item[1].title), + ) + ) + for ident in books.keys(): + for lang in books[ident].lang_codes: + if lang not in langs: + langs[lang] = [] + langs[lang].append(ident) + self._books = books + self._by_langs = langs + self.updated_on = datetime.datetime.now(datetime.UTC) + logger.debug(f"refreshed on {self.updated_on}") diff --git a/bittorrent-seeder/src/kiwixseeder/qbittorrent.py b/bittorrent-seeder/src/kiwixseeder/qbittorrent.py new file mode 100644 index 00000000..62a26240 --- /dev/null +++ b/bittorrent-seeder/src/kiwixseeder/qbittorrent.py @@ -0,0 +1,136 @@ +import datetime +import time +from dataclasses import dataclass +from typing import Self + +import qbittorrentapi + +from kiwixseeder.context import QBT_CAT_NAME, Context +from kiwixseeder.download import get_btih_from_url, url_is_working +from kiwixseeder.library import Book +from kiwixseeder.utils import format_size + +context = Context.get() +client = context.qbt +logger = context.logger + + +@dataclass(kw_only=True) +class TorrentInfo: + """Custom backend-agnostic torrent info""" + + btih: str + filename: str + added_on: datetime.datetime + size: int + + @classmethod + def from_torrentdictionary(cls, tdict: qbittorrentapi.TorrentDictionary) -> Self: + return cls( + btih=tdict.properties.hash, + filename=tdict.properties.name, + added_on=datetime.datetime.fromtimestamp( + tdict.properties.addition_date, tz=datetime.UTC + ), + size=tdict.properties.total_size, + ) + + def __str__(self) -> str: + return ( + f"{self.filename} @ {self.btih} " # noqa: RUF001 + f"({format_size(self.size)})" + ) + + +class TorrentManager: + + def __init__(self) -> None: + self.btihs: dict[str, str] = {} + + def is_connected(self) -> tuple[bool, str | Exception]: + """whether qbittorrent is reachable and either version or exception""" + try: + return True, client.app_version() + except Exception as exc: + return False, exc + + def setup(self): + # ensure we have our category + if QBT_CAT_NAME not in client.torrent_categories.categories: + client.torrent_categories.create_category(name=QBT_CAT_NAME) + + def reload(self): + """read torrents list from qbittorrent""" + self.btihs.clear() + for torrent in client.torrents.info(category=QBT_CAT_NAME): + self.btihs[torrent.properties.hash] = torrent.properties.name + + @property + def nb_torrents(self) -> int: + return len(self.btihs) + + def add(self, book: Book) -> bool: + btih = self.add_url(url=book.torrent_url, btih=book.btih) + if not btih: + return False + if book.btih != btih: + book.btih = btih + return True + + def add_url(self, url: str, btih: str | None) -> str: + # upload_limit + # download_limit + # save_path + # ratio_limit + # seeding_time_limit + # download_path + try: + if not url_is_working(url): + raise OSError(f"Torrent URL is not working: {url}") + btih = btih or get_btih_from_url(url) + if client.torrents.add( + urls=url, category=QBT_CAT_NAME + ) == "Ok." 
and self.get_or_none(btih, with_patience=True): + return btih + raise OSError(f"Failed to add torrent for {url}") + finally: + self.reload() + + def get(self, ident: str) -> TorrentInfo: + """Torrent dict from its hash""" + return TorrentInfo.from_torrentdictionary( + client.torrents.info(torrent_hashes=ident)[0] + ) + + def get_or_none( + self, ident: str, *, with_patience: bool = False + ) -> TorrentInfo | None: + """Torrent dict from its hash or None""" + if with_patience: + attempts = 10 + duration = 0.1 + else: + attempts = 1 + duration = 0 + while attempts: + attempts -= 1 + try: + return self.get(ident) + except IndexError: + time.sleep(duration) + continue + return None + + def remove(self, ident: str) -> bool: + """Remove a single torrent (if present) via its hash""" + try: + client.torrents.delete(torrent_hashes=ident, delete_files=True) + finally: + self.reload() + return ident not in self.btihs + + @property + def total_size(self) -> int: + """total size of our torrents""" + self.reload() + return sum(self.get(btih).size for btih in self.btihs) diff --git a/bittorrent-seeder/src/kiwixseeder/runner.py b/bittorrent-seeder/src/kiwixseeder/runner.py new file mode 100644 index 00000000..70a42dcb --- /dev/null +++ b/bittorrent-seeder/src/kiwixseeder/runner.py @@ -0,0 +1,252 @@ +import datetime +import fnmatch +import time + +from kiwixseeder.context import QBT_CAT_NAME, Context +from kiwixseeder.library import Book, Catalog +from kiwixseeder.qbittorrent import TorrentManager +from kiwixseeder.utils import format_size + +context = Context.get() +logger = context.logger + + +class Runner: + + def __init__(self) -> None: + self.exit_requested: bool = False + self.now = datetime.datetime.now(datetime.UTC) + self.manager: TorrentManager = TorrentManager() + self.catalog: Catalog = Catalog() + self.books: list[Book] = [] + + def stop(self): + self.exit_requested = True + + def matches_filename(self, book: Book) -> bool: + if not context.filenames: + return True + + for pattern in context.filenames: + if book.filepath.match(pattern): + return True + return False + + def matches_lang(self, book: Book) -> bool: + if not context.languages: + return True + for lang_code in context.languages: + if lang_code in book.lang_codes: + return True + return False + + def matches_category(self, book: Book) -> bool: + if not context.categories: + return True + for category_pattern in context.categories: + if fnmatch.fnmatch(book.category, category_pattern): + return True + return False + + def matches_flavour(self, book: Book) -> bool: + if not context.flavours: + return True + for flavour in context.flavours: + if flavour == book.flavour: + return True + return False + + def matches_tag(self, book: Book) -> bool: + if not context.tags: + return True + for tag_pattern in context.tags: + for tag in book.tags: + if fnmatch.fnmatch(tag, tag_pattern): + return True + return False + + def matches_author(self, book: Book) -> bool: + if not context.authors: + return True + for author_pattern in context.authors: + if fnmatch.fnmatch(book.author, author_pattern): + return True + return False + + def matches_publisher(self, book: Book) -> bool: + if not context.publishers: + return True + for publisher_pattern in context.publishers: + if fnmatch.fnmatch(book.publisher, publisher_pattern): + return True + return False + + def matches_size(self, book: Book) -> bool: + return context.filesizes.match(book.size) + + def matches(self, book: Book) -> bool: + for value in ( + "filename", + "lang", + "category", + 
"flavour", + "tag", + "author", + "publisher", + "size", + ): + if not getattr(self, f"matches_{value}")(book): + return False + return True + + def run_forever(self): + while not self.exit_requested: + self.run() + for _ in range(0, int(context.sleep_interval)): + if self.exit_requested: + break + time.sleep(1) + + def run(self): + + self.display_filters() + self.connect_to_backend() + + # read existing torrents from qbt + self.manager.reload() + logger.info(f"There are {self.manager.nb_torrents} torrents in {QBT_CAT_NAME}") + for btih in self.manager.btihs: + logger.debug(f"* {self.manager.get(btih)!s}") + + self.fetch_catalog() + self.reduce_catalog() + self.remove_outdated_torrents() + + self.ensure_storage() + + self.add_books() + + logger.info(f"{QBT_CAT_NAME} has {self.manager.nb_torrents} torrents") + + def display_filters(self): + logger.info("Starting super-seeder with filters:") + logger.info(f"Filenames: {', '.join(context.filenames) or 'all'}") + logger.info(f"Languages: {', '.join(context.languages) or 'all'}") + logger.info(f"Categories: {', '.join(context.categories) or 'all'}") + logger.info(f"Flavours: {', '.join(context.flavours) or 'all'}") + logger.info(f"Tags: {', '.join(context.tags) or 'all'}") + logger.info(f"Authors: {', '.join(context.authors) or 'all'}") + logger.info(f"Publishers: {', '.join(context.publishers) or 'all'}") + logger.info(f"Size: {context.filesizes!s}") + + if not context.filesizes.is_valid(): + raise ValueError("Invalid filters combination: sizes") + + def connect_to_backend(self): + logger.info("Checking qBittorrent connection…") + succeeded, vers_or_exc = self.manager.is_connected() + if not succeeded and isinstance(vers_or_exc, BaseException): + raise OSError( + f"Unable to connect to qBittorrent: {vers_or_exc!s}" + ) from vers_or_exc + logger.info(f"> Connected to qBittorrent {vers_or_exc} ; fetching data…") + + self.manager.setup() + + def fetch_catalog(self): + logger.info("Fetching catalog…") + self.catalog.ensure_fresh() + logger.info(f"Catalog contains {self.catalog.nb_books} ZIMs") + + def reduce_catalog(self): + # build books with our filters + self.books = list(filter(self.matches, self.catalog.all_books)) + + # drop catalog (we dont need any of it anymore) + catalog_size = self.catalog.nb_books + self.catalog.reset() + + # make sure it's not an accidental no-param call + books_size = sum(book.size for book in self.books) + if len(self.books) == catalog_size and not context.all_good: + if ( + input( + f"You are about to seed {catalog_size} torrents " + f"accounting for {format_size(books_size)}. " + "Do you want to continue? 
Y/[N] "
                ).upper()
                != "Y"
            ):
                logger.info("OK, exiting.")
                return

        logger.info(f"Filters match {len(self.books)} ZIMs")
        if len(self.books) <= 15:  # noqa: PLR2004
            for book in self.books:
                logger.debug(f"* {book!s}")

    def remove_outdated_torrents(self):
        if not self.manager.btihs:
            return

        logger.info("Checking for existing torrents removal…")

        # reconciling existing torrents and books
        unselected_books = list(self.manager.btihs.keys())
        for book in self.books:
            btihs = [
                btih
                for btih, fname in self.manager.btihs.items()
                if fname == book.filename
            ]
            if btihs:
                book.btih = btihs[0]
                unselected_books.remove(book.btih)

        # keep those that are within --keep duration
        # (iterate over a copy since we mutate the list as we go)
        keep_until = self.now - datetime.timedelta(seconds=context.keep_for)
        for btih in list(unselected_books):
            if self.manager.get(btih).added_on >= keep_until:
                unselected_books.remove(btih)

        if not len(unselected_books):
            logger.info("> None")
            return

        logger.info(
            f"Removing {len(unselected_books)} outdated torrents "
            "(not in catalog, over --keep)…"
        )
        for btih in unselected_books:
            logger.info(f"- {self.manager.get(btih)!s}")
            if not self.manager.remove(btih):
                logger.error(f"Failed to remove {btih}")

    def ensure_storage(self):
        torrents_size = self.manager.total_size
        books_size = sum(book.size for book in self.books)
        total_size = torrents_size + books_size
        logger.info("Checking overall storage needs:")
        logger.debug(f"- Existing torrents: {format_size(torrents_size)}")
        logger.debug(f"- Requested torrents: {format_size(books_size)}")
        logger.info(
            f"- Total torrents: {format_size(total_size)} "
            f"{'>' if torrents_size > context.max_storage else '<='} "
            f"{format_size(context.max_storage)} (max storage)"
        )

        if total_size > context.max_storage:
            raise OSError("Total size exceeds max-storage")

    def add_books(self):
        to_add = [book for book in self.books if book.btih not in self.manager.btihs]
        if not to_add:
            logger.info("No ZIM to add.")
            return

        logger.info(f"Adding {len(to_add)} torrents…")
        for book in to_add:
            if self.manager.add(book):
                logger.info(f"Added {book!s}")
            else:
                logger.error(f"Failed to add {book!s}")
diff --git a/bittorrent-seeder/src/kiwixseeder/utils.py b/bittorrent-seeder/src/kiwixseeder/utils.py
new file mode 100644
index 00000000..fc0c3d7a
--- /dev/null
+++ b/bittorrent-seeder/src/kiwixseeder/utils.py
@@ -0,0 +1,44 @@
import os
from pathlib import Path

import humanfriendly

from kiwixseeder.context import NAME, Context


def get_config_path() -> Path:
    """Path to save/read config file"""
    fname = "config.toml"
    xdg_config_home = os.getenv("XDG_CONFIG_HOME")
    # favor this env on any platform
    if xdg_config_home:
        return Path(xdg_config_home) / fname
    if Context.is_mac:
        return Path.home() / "Library" / "Preferences" / NAME / fname
    if Context.is_win:
        return Path(os.getenv("APPDATA", "C:")) / NAME / fname
    return Path.home() / ".config" / NAME / fname


def get_db_path() -> Path:
    """Path to save/read database"""
    fname = f"{NAME}.db"
    xdg_cache_home = os.getenv("XDG_CACHE_HOME")
    # favor this env on any platform
    if xdg_cache_home:
        return Path(xdg_cache_home) / fname
    if Context.is_mac:
        return Path.home() / "Library" / "Caches" / NAME / fname
    if Context.is_win:
        return Path(os.getenv("APPDATA", "C:")) / NAME / fname
    return Path.home() / ".config" / NAME / fname


def format_size(value: int) -> str:
    """human friendly size in binary"""
    return
humanfriendly.format_size(value, binary=True) + + +def format_duration(value: float) -> str: + """human friendly duration""" + return humanfriendly.format_timespan(value) diff --git a/bittorrent-seeder/tasks.py b/bittorrent-seeder/tasks.py new file mode 100644 index 00000000..87cd5529 --- /dev/null +++ b/bittorrent-seeder/tasks.py @@ -0,0 +1,110 @@ +# pyright: strict, reportUntypedFunctionDecorator=false +import os + +from invoke.context import Context +from invoke.tasks import task # pyright: ignore [reportUnknownVariableType] + +use_pty = not os.getenv("CI", "") + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test(ctx: Context, args: str = ""): + """run tests (without coverage)""" + ctx.run(f"pytest {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "pytest additional arguments"}) +def test_cov(ctx: Context, args: str = ""): + """run test vith coverage""" + ctx.run(f"coverage run -m pytest {args}", pty=use_pty) + + +@task(optional=["html"], help={"html": "flag to export html report"}) +def report_cov(ctx: Context, *, html: bool = False): + """report coverage""" + ctx.run("coverage combine", warn=True, pty=use_pty) + ctx.run("coverage report --show-missing", pty=use_pty) + ctx.run("coverage xml", pty=use_pty) + if html: + ctx.run("coverage html", pty=use_pty) + + +@task( + optional=["args", "html"], + help={ + "args": "pytest additional arguments", + "html": "flag to export html report", + }, +) +def coverage(ctx: Context, args: str = "", *, html: bool = False): + """run tests and report coverage""" + test_cov(ctx, args=args) + report_cov(ctx, html=html) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def lint_black(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("black --version", pty=use_pty) + ctx.run(f"black --check --diff {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def lint_ruff(ctx: Context, args: str = "."): + args = args or "." # needed for hatch script + ctx.run("ruff --version", pty=use_pty) + ctx.run(f"ruff check {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def lintall(ctx: Context, args: str = "."): + """Check linting""" + args = args or "." # needed for hatch script + lint_black(ctx, args) + lint_ruff(ctx, args) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def check_pyright(ctx: Context, args: str = ""): + """check static types with pyright""" + ctx.run("pyright --version") + ctx.run(f"pyright {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"}) +def checkall(ctx: Context, args: str = ""): + """check static types""" + check_pyright(ctx, args) + + +@task(optional=["args"], help={"args": "black additional arguments"}) +def fix_black(ctx: Context, args: str = "."): + """fix black formatting""" + args = args or "." # needed for hatch script + ctx.run(f"black {args}", pty=use_pty) + + +@task(optional=["args"], help={"args": "ruff additional arguments"}) +def fix_ruff(ctx: Context, args: str = "."): + """fix all ruff rules""" + args = args or "." 
# needed for hatch script + ctx.run(f"ruff check --fix {args}", pty=use_pty) + + +@task( + optional=["args"], + help={ + "args": "linting tools (black, ruff) additional arguments, typically a path", + }, +) +def fixall(ctx: Context, args: str = "."): + """Fix everything automatically""" + args = args or "." # needed for hatch script + fix_black(ctx, args) + fix_ruff(ctx, args) + lintall(ctx, args)
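
For reference, a minimal way to run the published image (illustrative only: the
registry path follows the workflow above; password and storage values are
placeholders):

  docker run -d --name kiwix-seeder \
    -v /srv/kiwix-seeder:/data \
    -p 8080:80 -p 6901:6901 \
    -e QBT_PASSWORD=change-me \
    -e MAX_STORAGE=100GiB \
    ghcr.io/kiwix/bittorrent-seeder \
    kiwix-seeder --lang eng --loop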