Skip to content

Commit

Permalink
Merge pull request #450 from openzim/upgrade_crawler
Browse files Browse the repository at this point in the history
Upgrade to browsertrix crawler 1.4.2, fix integration tests and fix docker label
  • Loading branch information
benoit74 authored Jan 9, 2025
2 parents b5dac3c + 97ea6df commit 8cddcf0
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 39 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/Tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jobs:
run: docker run -v $PWD/output:/output zimit zimit --help

- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail [email protected] --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail [email protected] --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Upgrade to browsertrix crawler 1.4.2 (#450)

## [2.1.6] - 2024-11-07

Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Post-merge state of PR #450: base image upgraded from 1.4.0-beta.0 to the
# 1.4.2 release, and LABEL switched to the `key=value` form (the bare
# space-separated form is legacy and triggers a build warning).
FROM webrecorder/browsertrix-crawler:1.4.2
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
Expand Down
83 changes: 48 additions & 35 deletions tests-integration/integration.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
import glob
import json
import os
from pathlib import Path

from warcio import ArchiveIterator
from zimscraperlib.zim import Archive


def test_is_file():
    """Ensure the scraper produced the expected ZIM file.

    Post-merge state of PR #450: the crawl targets website.test.openzim.org
    and names the ZIM "tests_en_onepage"; the stale isago.zim assertion was
    pre-merge diff residue and is dropped.
    """
    # pathlib is already imported at the top of integration.py; Path.is_file()
    # behaves identically to os.path.isfile() here.
    assert Path("/output/tests_en_onepage.zim").is_file()


def test_zim_main_page():
    """Main page specified, http://website.test.openzim.org/http-return-codes.html,
    was a redirect to https
    Ensure main page is the redirected page"""

    # Archive comes from zimscraperlib.zim (imported at file top); main_entry
    # is the ZIM's declared entry point.
    main_entry = Archive("/output/tests_en_onepage.zim").main_entry
    assert main_entry.is_redirect
    assert (
        main_entry.get_redirect_entry().path
        == "website.test.openzim.org/http-return-codes.html"
    )


def test_zim_scraper():
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
"""Check content of scraper metadata"""

zim_fh = Archive("/output/isago.zim")
zim_fh = Archive("/output/tests_en_onepage.zim")
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
Expand All @@ -33,18 +37,28 @@ def test_zim_scraper():

def test_files_list():
    """Check that expected files are present in the ZIM at proper path.

    Post-merge entry list for the website.test.openzim.org crawl; the stale
    isago.rskg.org entries were pre-merge diff residue and are dropped.
    """
    zim_fh = Archive("/output/tests_en_onepage.zim")
    for expected_entry in [
        "_zim_static/__wb_module_decl.js",
        "_zim_static/wombat.js",
        "_zim_static/wombatSetup.js",
        "website.test.openzim.org/200-response",
        "website.test.openzim.org/201-response",
        "website.test.openzim.org/202-response",
        "website.test.openzim.org/301-external-redirect-ok",
        "website.test.openzim.org/301-internal-redirect-ok",
        "website.test.openzim.org/302-external-redirect-ok",
        "website.test.openzim.org/302-internal-redirect-ok",
        "website.test.openzim.org/307-external-redirect-ok",
        "website.test.openzim.org/307-internal-redirect-ok",
        "website.test.openzim.org/308-external-redirect-ok",
        "website.test.openzim.org/308-internal-redirect-ok",
        # appeared twice in the diff rendering; one occurrence suffices
        "website.test.openzim.org/http-return-codes.html",
        "website.test.openzim.org/icons/favicon.ico",
        "website.test.openzim.org/icons/site.webmanifest",
        "website.test.openzim.org/internal_redirect_target.html",
        "www.example.com/",
    ]:
        # get_content raises / returns falsy if the entry is absent
        assert zim_fh.get_content(expected_entry)

Expand Down Expand Up @@ -72,23 +86,22 @@ def test_user_agent():


def test_stats_output():
    """Check the crawler / warc2zim / zimit stats files written to /output.

    Post-merge state of PR #450: counts updated for the new test website
    (35 crawled pages, 18 expected failures from the error-code URLs, 8 ZIM
    entries written) and file reads switched from open() to pathlib.
    """
    # json.loads accepts bytes directly, so read_bytes() avoids an explicit
    # encoding choice.
    assert json.loads(Path("/output/crawl.json").read_bytes()) == {
        "crawled": 35,
        "pending": 0,
        "pendingPages": [],
        "total": 35,
        "failed": 18,
        "limit": {"max": 0, "hit": False},
    }

    assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
        "written": 8,
        "total": 8,
    }

    assert json.loads(Path("/output/stats.json").read_bytes()) == {
        "done": 8,
        "total": 8,
        "limit": {"max": 0, "hit": False},
    }

0 comments on commit 8cddcf0

Please sign in to comment.