
first experimentations with INA's DL Web
boogheta committed Jul 21, 2023
1 parent 21b6ffb commit 8eb457f
Showing 6 changed files with 73 additions and 30 deletions.
1 change: 1 addition & 0 deletions config-backend.env.example
@@ -22,6 +22,7 @@ HYPHE_FOLLOW_REDIRECTS=["fb.me", "l.facebook.com", "facebook.com/l.php", "www.fa
HYPHE_WEBARCHIVES_OPTIONS=["web.archive.org"]
HYPHE_WEBARCHIVES_DATE=2012-07-01
HYPHE_WEBARCHIVES_DAYSRANGE=28
HYPHE_WEBARCHIVES_PASSWORD=

#HYPHE_ADMIN_PASSWORD=
#HYPHE_OPEN_CORS_API=false
1 change: 1 addition & 0 deletions hyphe_backend/crawler/deploy.py
@@ -77,6 +77,7 @@ def strToBool(string):
config['mongo-scrapy']['log_level'] = 'DEBUG' if config['DEBUG'] > 1 else 'INFO'
config["mongo-scrapy"]["host"] = os.environ.get('HYPHE_MONGODB_HOST', config["mongo-scrapy"]["host"])
config["mongo-scrapy"]["obey_robots"] = strToBool(os.environ.get('HYPHE_OBEY_ROBOTS', config["mongo-scrapy"].get("obey_robots", False)))
config["mongo-scrapy"]["webarchives_password"] = os.environ.get('HYPHE_WEBARCHIVES_PASSWORD', config["webarchives"].get("password", ""))
for _to in ["", "idle_", "ajax_"]:
config['mongo-scrapy']['phantom_%stimeout' % _to] = config['phantom']['%stimeout' % _to]
with nested(open("hcicrawler/settings-template.py", "r"), open("hcicrawler/settings.py", "w")) as (template, generated):
2 changes: 2 additions & 0 deletions hyphe_backend/crawler/hcicrawler/settings-template.py
@@ -40,6 +40,8 @@
MONGO_QUEUE_COL = 'queue'
MONGO_PAGESTORE_COL = 'pages'

WEBARCHIVES_PASSWORD = '{{webarchives_password}}'

PHANTOM = {
"PATH": os.path.join('{{hyphePath}}', 'bin', 'hyphe-phantomjs-2.0.0'),
"JS_PATH": os.path.join('{{hyphePath}}', 'hyphe_backend', 'crawler', BOT_NAME, 'spiders', 'js'),
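The {{webarchives_password}} placeholder above is filled in at deploy time: deploy.py (previous file) reads HYPHE_WEBARCHIVES_PASSWORD from the environment and writes the resulting value into the generated hcicrawler/settings.py. A minimal sketch of that flow, assuming a simple string substitution rather than deploy.py's actual templating code:

```python
import os

# Simplified stand-in for deploy.py's config handling (illustrative only):
# the password comes from the environment, with an empty default.
config = {"webarchives_password": os.environ.get("HYPHE_WEBARCHIVES_PASSWORD", "")}

with open("hcicrawler/settings-template.py") as template, \
     open("hcicrawler/settings.py", "w") as generated:
    rendered = template.read()
    for key, value in config.items():
        # turns WEBARCHIVES_PASSWORD = '{{webarchives_password}}' into the real value
        rendered = rendered.replace("{{%s}}" % key, str(value))
    generated.write(rendered)
```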
92 changes: 62 additions & 30 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
@@ -2,6 +2,7 @@

import os, time, signal, re
import json
from hashlib import sha256
import logging
from datetime import datetime, timedelta

@@ -30,7 +31,7 @@
from hcicrawler.urllru import url_to_lru_clean, lru_get_host_url, lru_get_path_url, has_prefix, lru_to_url
from hcicrawler.tlds_tree import TLDS_TREE
from hcicrawler.items import Page
from hcicrawler.settings import HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL
from hcicrawler.settings import HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL, WEBARCHIVES_PASSWORD
from hcicrawler.errors import error_name

def timeout_alarm(*args):
@@ -91,20 +92,25 @@ def __init__(self, **kwargs):
if "option" not in self.webarchives or self.webarchives["option"] not in ARCHIVES_OPTIONS or not self.webarchives["option"]:
self.webarchives = {}
if self.webarchives:
self.webarchives["url_prefix"] = ARCHIVES_OPTIONS[self.webarchives["option"]].get("url_prefix", None)
self.webarchives["url_prefix"] = ARCHIVES_OPTIONS[self.webarchives["option"]].get("url_prefix", "") or ""
archivedate = re.sub(r"\D", "", str(self.webarchives["date"]))
self.archivedate = str(archivedate) + "120000"
archivedt = datetime.strptime(self.archivedate, "%Y%m%d%H%M%S")
self.archivemindate = datetime.strftime(archivedt - timedelta(days=self.webarchives["days_range"]/2., seconds=43200), "%Y%m%d%H%M%S")
self.archivemaxdate = datetime.strftime(archivedt + timedelta(days=self.webarchives["days_range"]/2., seconds=43199), "%Y%m%d%H%M%S")

archiveprefix = self.webarchives["url_prefix"].rstrip('/')
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14}).?/" % archiveprefix, re.I)
self.archivehost = "/".join(archiveprefix.split('/')[:3])
self.archivedomain_lru = url_to_lru_clean("http://%s" % get_domain_name(archiveprefix), TLDS_TREE)
archivedomain_regexp = "(?:%s|%s)" % (archiveprefix, archiveprefix.replace(self.archivehost, ""))
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % archivedomain_regexp, re.I|re.S)
archiveprefix = (self.webarchives["url_prefix"] or "").rstrip('/')
if archiveprefix:
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14}).?/" % archiveprefix, re.I)
self.archivehost = "/".join(archiveprefix.split('/')[:3])
self.archivedomain_lru = url_to_lru_clean("http://%s" % get_domain_name(archiveprefix), TLDS_TREE)
archivedomain_regexp = "(?:%s|%s)" % (archiveprefix, archiveprefix.replace(self.archivehost, ""))
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % archivedomain_regexp, re.I|re.S)
else:
self.archiveprefix = ""
self.archivetimestamp = str(int((archivedt - datetime(1970, 1, 1)).total_seconds()))


if "proxy" in ARCHIVES_OPTIONS[self.webarchives["option"]]:
self.proxy = ARCHIVES_OPTIONS[self.webarchives["option"]]["proxy"]
@@ -124,7 +130,7 @@ def start_requests(self):
self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], logging.INFO)
self.log("ARGUMENTS : "+str(self.args), logging.INFO)
if self.webarchives:
self.log("Crawling on Web Archive using for prefix %s between %s and %s" % (self.archiveprefix, self.archivemindate, self.archivemaxdate))
self.log("Crawling on Web Archive %s using for prefix %s between %s and %s" % (self.webarchives["option"], self.archiveprefix, self.archivemindate, self.archivemaxdate))
if self.proxy:
self.log("Using proxy %s" % self.proxy, logging.INFO)

@@ -180,6 +186,7 @@ def spider_closed(self, spider, reason=""):
os.remove(fi)

def handle_response(self, response):
# self.log("RESPONSE %s: %s" % (response.url, dict(response.headers)), logging.DEBUG)

if self.phantom:
self.phantom.get(response.url)
@@ -237,7 +244,7 @@ def handle_response(self, response):
("archivesinternet.bnf.fr" in self.webarchives["url_prefix"] and 300 <= response.status < 400 and \
(not response.body or "<head><title>301 Moved Permanently</title></head>" in response.body)):
redir_url = response.headers['Location']
if redir_url.startswith("/"):
if redir_url.startswith("/") and self.archiveprefix:
redir_url = "%s%s" % (self.archivehost, redir_url)
if "archivesinternet.bnf.fr" in self.webarchives["url_prefix"]:
if "depth" in response.meta:
@@ -248,20 +255,21 @@
if not response.body and redir > 10:
return self.parse_html(response)
return self._request(redir_url, redirection=redir, dont_filter=(not response.body))
real_url = self.archiveregexp.sub("", redir_url)
orig_url = self.archiveregexp.sub("", response.url)
match = self.archiveregexp.search(redir_url)
if match:
# Check date obtained fits into a user defined timerange and return 404 otherwise
if not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_url, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
return self._make_raw_page(response, archive_fail_url=redir_url)
if normalize(real_url) == normalize(orig_url):
if "depth" in response.meta:
response.meta['depth'] -= 1
else:
response.meta['depth'] = -1
return self._request(redir_url)
if self.archiveprefix:
real_url = self.archiveregexp.sub("", redir_url)
orig_url = self.archiveregexp.sub("", response.url)
match = self.archiveregexp.search(redir_url)
if match:
# Check date obtained fits into a user defined timerange and return 404 otherwise
if not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_url, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
return self._make_raw_page(response, archive_fail_url=redir_url)
if normalize(real_url) == normalize(orig_url):
if "depth" in response.meta:
response.meta['depth'] -= 1
else:
response.meta['depth'] = -1
return self._request(redir_url)
if response.status >= 400:
return self._make_raw_page(response)

@@ -289,7 +297,7 @@ def parse_html(self, response):
archive_timestamp = None
clean_body = None
orig_url = response.url
if self.webarchives:
if self.webarchives and self.archiveprefix:
orig_url = self.archiveregexp.sub("", orig_url)
lru = url_to_lru_clean(orig_url, TLDS_TREE)
lrulinks = []
@@ -299,7 +307,8 @@

skip_page = False
if self.webarchives:
redir_url = self.archiveredirect.search(response.body)
# TODO detect INA redirection cases
redir_url = self.archiveredirect.search(response.body) if self.archiveprefix else None
if "web.archive.org" in self.webarchives["url_prefix"]:
# Remove WEB ARCHIVES banner
clean_body = RE_WEB_ARCHIVES_BANNER.sub("", response.body)
@@ -322,6 +331,18 @@
# Remove BNF banner
clean_body = RE_BNF_ARCHIVES_BANNER.sub("", response.body)

elif self.webarchives["option"] == "dlweb.ina.fr":
archive_url = response.headers["X-Response-URL"]
# Check date obtained fits into a user defined timerange and return 404 otherwise
archive_timestamp = response.headers["X-Response-Time"]
if not archive_timestamp:
self.log("Skipping archive page (%s) for which archive date could not be found within response's headers (%s)." % (response.url, dict(response.headers)), logging.ERROR)
return
archive_timestamp = datetime.strftime((datetime(1970, 1, 1) + timedelta(seconds=int(archive_timestamp))), "%Y%m%d%H%M%S")
if not (self.archivemindate <= archive_timestamp <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (response.url, archive_timestamp, self.archivemindate, self.archivemaxdate), logging.DEBUG)
skip_page = archive_url

# Specific case of redirections from website returned by archives as JS redirections with code 200
elif redir_url:
response.status = int(redir_url.group(2))
@@ -339,7 +360,7 @@
if not skip_page and 300 <= response.status < 400:
redir_url = response.headers['Location']

if self.webarchives and self.archiveregexp.match(redir_url):
if self.webarchives and self.archiveprefix and self.archiveregexp.match(redir_url):
redir_url = self.archiveregexp.sub("", redir_url)

if redir_url.startswith('/'):
@@ -369,7 +390,7 @@
except AttributeError:
url = link['url']

if self.webarchives:
if self.webarchives and self.archiveprefix:
# Rewrite archives urls and filter internal archives links
url = self.archiveregexp.sub("", url)
if url.startswith(self.archivehost) or \
@@ -414,7 +435,7 @@ def _make_raw_page(self, response, modified_body=None, archive_fail_url=None):
p = Page()
p['url'] = response.url
if self.webarchives:
p['url'] = self.archiveregexp.sub("", response.url)
p['url'] = self.archiveregexp.sub("", response.url) if self.archiveprefix else response.headers.get("x-response-url", response.url)
p['archive_url'] = archive_fail_url or response.url
p['archive_date_requested'] = self.archivedate
if 'archive_timestamp' in response.meta and not archive_fail_url:
@@ -447,6 +468,17 @@ def _request(self, url, noproxy=False, redirection=0, **kw):
if self.phantom:
kw['method'] = 'HEAD'
if self.webarchives:
if self.webarchives["option"] == "dlweb.ina.fr":
# TODO: fix urls such as http://domain.com without trailing slash not working with INA archive
kw["headers"] = {
"X-DLWeb-Token": sha256("%s\n%s\n" % (WEBARCHIVES_PASSWORD, url)).hexdigest(),
"X-Request-Time": self.archivetimestamp,
"X-User": "HYPHE_%s_%s" % (HYPHE_PROJECT, self.crawler.settings['JOBID'])
}
kw["meta"]["archive_timestamp"] = self.archivedate
# self.log("REQUEST %s: %s" % (url, kw), logging.DEBUG)

return Request(url, **kw)
if "archivesinternet.bnf.fr" in self.webarchives["url_prefix"]:
kw['headers'] = {
"BnF-OSWM-User-Name": "WS-HYPHE_%s_%s" % (HYPHE_PROJECT, self.crawler.settings['JOBID'])
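For the new dlweb.ina.fr option, the spider authenticates each request against the INA proxy with a per-URL sha256 token and passes the requested date as seconds since the Unix epoch, while parse_html() converts the proxy's X-Response-Time header back into the 14-digit %Y%m%d%H%M%S format used for the date-range check. A standalone sketch of the request headers, with made-up password, URL, corpus and job values:

```python
from datetime import datetime
from hashlib import sha256

password = "example-secret"          # WEBARCHIVES_PASSWORD (illustrative value)
url = "http://www.example.org/"      # URL fetched through the INA proxy
archivedt = datetime.strptime("20120701120000", "%Y%m%d%H%M%S")

headers = {
    # sha256 over "<password>\n<url>\n"; .encode() is needed on Python 3,
    # the Python 2 spider above hashes the str directly
    "X-DLWeb-Token": sha256(("%s\n%s\n" % (password, url)).encode()).hexdigest(),
    # requested archive date, expressed as seconds since the Unix epoch
    "X-Request-Time": str(int((archivedt - datetime(1970, 1, 1)).total_seconds())),
    # identifies the crawl as HYPHE_<project>_<jobid>
    "X-User": "HYPHE_example-corpus_example-jobid",
}
```

The proxy's response is then expected to carry X-Response-URL and X-Response-Time (epoch seconds) headers, which the parse_html() hunk above converts back with datetime(1970, 1, 1) + timedelta(seconds=...) before comparing against archivemindate and archivemaxdate.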
1 change: 1 addition & 0 deletions hyphe_backend/lib/config_hci.py
@@ -132,6 +132,7 @@ def validateStartpagesMode(modes):
}, "webarchives": {
"type": dict,
"int_fields": ["days_range"],
"str_fields": ["password"],
"extra_fields": {
"options": webarchives.validateOptions,
"date": webarchives.validateArchiveDate
6 changes: 6 additions & 0 deletions hyphe_backend/lib/webarchives.py
@@ -19,6 +19,12 @@
"permalinks_prefix": "http://rntse.bnf.fr/jsp/lancerDL.jsp?appli=WAYBACK_URL&titre=la%20navigation%20dans%20les%20archives%20de%20l%27internet&url=http://archivesinternet.bnf.fr/DATETIME/SOURCEURL",
"proxy": "archivesinternet.bnf.fr:8090",
"min_date": "1996-01-01"
},
"dlweb.ina.fr": {
"label": "DLWeb.INA.fr",
"description": "crawl France's official web medias archives maintained by INA",
"proxy": "dlweb.ina.fr:82",
"min_date": "1996-01-01"
}
}

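Unlike the other entries, the dlweb.ina.fr option defines only a proxy and no url_prefix, so the spider's __init__() leaves archiveprefix empty and falls back to the epoch-seconds archivetimestamp path added above. This diff does not show how self.proxy is ultimately attached to outgoing requests; the standard Scrapy way to route a request through such an HTTP proxy, given here only as an assumed illustration, is via the request meta:

```python
from scrapy import Request

# Value taken from ARCHIVES_OPTIONS["dlweb.ina.fr"]["proxy"]
proxy = "dlweb.ina.fr:82"

# Generic HttpProxyMiddleware usage (a sketch, not necessarily Hyphe's wiring):
request = Request("http://www.example.org/", meta={"proxy": "http://%s" % proxy})
```

On the configuration side, a corpus would then select the option by listing "dlweb.ina.fr" in HYPHE_WEBARCHIVES_OPTIONS and setting HYPHE_WEBARCHIVES_PASSWORD in config-backend.env, matching the variables added in the first files of this commit.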
