Skip to content

Commit

Permalink
Merge pull request #450 from openzim/upgrade_crawler
Browse files Browse the repository at this point in the history
Upgrade to browsertrix crawler 1.4.2, fix integration tests and fix docker label
  • Loading branch information
benoit74 authored Jan 9, 2025
2 parents b5dac3c + 97ea6df commit 8cddcf0
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/Tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
run: docker run -v $PWD/output:/output zimit zimit --help

- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail [email protected] --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail [email protected] --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Upgrade to browsertrix crawler 1.4.2 (#450)

## [2.1.6] - 2024-11-07

Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Post-merge state of PR #450: base image upgraded from 1.4.0-beta.0 to the
# 1.4.2 release, and LABEL switched to the `key=value` form (the bare
# space-separated form is legacy and triggers a build warning).
FROM webrecorder/browsertrix-crawler:1.4.2
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
Expand Down
83 changes: 48 additions & 35 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
import glob
import json
import os
from pathlib import Path

from warcio import ArchiveIterator
from zimscraperlib.zim import Archive


def test_is_file():
    """Ensure the scraper produced the expected ZIM file.

    Post-merge state of PR #450: the crawl targets website.test.openzim.org
    and names the ZIM "tests_en_onepage"; the stale isago.zim assertion was
    pre-merge diff residue and is dropped.
    """
    # pathlib is already imported at the top of integration.py; Path.is_file()
    # behaves identically to os.path.isfile() here.
    assert Path("/output/tests_en_onepage.zim").is_file()


def test_zim_main_page():
    """Main page specified, http://website.test.openzim.org/http-return-codes.html,
    was a redirect to https
    Ensure main page is the redirected page"""

    # Archive comes from zimscraperlib.zim (imported at file top); main_entry
    # is the ZIM's declared entry point.
    main_entry = Archive("/output/tests_en_onepage.zim").main_entry
    assert main_entry.is_redirect
    assert (
        main_entry.get_redirect_entry().path
        == "website.test.openzim.org/http-return-codes.html"
    )


def test_zim_scraper():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
"""Check content of scraper metadata"""

zim_fh = Archive("/output/isago.zim")
zim_fh = Archive("/output/tests_en_onepage.zim")
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
Expand All @@ -33,18 +37,28 @@ def test_zim_scraper():

def test_files_list():
    """Check that expected files are present in the ZIM at proper path.

    Post-merge entry list for the website.test.openzim.org crawl; the stale
    isago.rskg.org entries were pre-merge diff residue and are dropped.
    """
    zim_fh = Archive("/output/tests_en_onepage.zim")
    for expected_entry in [
        "_zim_static/__wb_module_decl.js",
        "_zim_static/wombat.js",
        "_zim_static/wombatSetup.js",
        "website.test.openzim.org/200-response",
        "website.test.openzim.org/201-response",
        "website.test.openzim.org/202-response",
        "website.test.openzim.org/301-external-redirect-ok",
        "website.test.openzim.org/301-internal-redirect-ok",
        "website.test.openzim.org/302-external-redirect-ok",
        "website.test.openzim.org/302-internal-redirect-ok",
        "website.test.openzim.org/307-external-redirect-ok",
        "website.test.openzim.org/307-internal-redirect-ok",
        "website.test.openzim.org/308-external-redirect-ok",
        "website.test.openzim.org/308-internal-redirect-ok",
        # appeared twice in the diff rendering; one occurrence suffices
        "website.test.openzim.org/http-return-codes.html",
        "website.test.openzim.org/icons/favicon.ico",
        "website.test.openzim.org/icons/site.webmanifest",
        "website.test.openzim.org/internal_redirect_target.html",
        "www.example.com/",
    ]:
        # get_content raises / returns falsy if the entry is absent
        assert zim_fh.get_content(expected_entry)

Expand Down Expand Up @@ -72,23 +86,22 @@ def test_user_agent():


def test_stats_output():
    """Check the crawler / warc2zim / zimit stats files written to /output.

    Post-merge state of PR #450: counts updated for the new test website
    (35 crawled pages, 18 expected failures from the error-code URLs, 8 ZIM
    entries written) and file reads switched from open() to pathlib.
    """
    # json.loads accepts bytes directly, so read_bytes() avoids an explicit
    # encoding choice.
    assert json.loads(Path("/output/crawl.json").read_bytes()) == {
        "crawled": 35,
        "pending": 0,
        "pendingPages": [],
        "total": 35,
        "failed": 18,
        "limit": {"max": 0, "hit": False},
    }

    assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
        "written": 8,
        "total": 8,
    }

    assert json.loads(Path("/output/stats.json").read_bytes()) == {
        "done": 8,
        "total": 8,
        "limit": {"max": 0, "hit": False},
    }

0 comments on commit 8cddcf0

Please sign in to comment.