
first experimentations with INA's DL Web
boogheta committed Jul 21, 2023
1 parent 21b6ffb commit 8eb457f
Showing 6 changed files with 73 additions and 30 deletions.
1 change: 1 addition & 0 deletions config-backend.env.example
@@ -22,6 +22,7 @@ HYPHE_FOLLOW_REDIRECTS=["fb.me", "l.facebook.com", "facebook.com/l.php", "www.fa
HYPHE_WEBARCHIVES_OPTIONS=["web.archive.org"]
HYPHE_WEBARCHIVES_DATE=2012-07-01
HYPHE_WEBARCHIVES_DAYSRANGE=28
HYPHE_WEBARCHIVES_PASSWORD=

#HYPHE_ADMIN_PASSWORD=
#HYPHE_OPEN_CORS_API=false
1 change: 1 addition & 0 deletions hyphe_backend/crawler/deploy.py
@@ -77,6 +77,7 @@ def strToBool(string):
config['mongo-scrapy']['log_level'] = 'DEBUG' if config['DEBUG'] > 1 else 'INFO'
config["mongo-scrapy"]["host"] = os.environ.get('HYPHE_MONGODB_HOST', config["mongo-scrapy"]["host"])
config["mongo-scrapy"]["obey_robots"] = strToBool(os.environ.get('HYPHE_OBEY_ROBOTS', config["mongo-scrapy"].get("obey_robots", False)))
config["mongo-scrapy"]["webarchives_password"] = os.environ.get('HYPHE_WEBARCHIVES_PASSWORD', config["webarchives"].get("password", ""))
for _to in ["", "idle_", "ajax_"]:
config['mongo-scrapy']['phantom_%stimeout' % _to] = config['phantom']['%stimeout' % _to]
with nested(open("hcicrawler/settings-template.py", "r"), open("hcicrawler/settings.py", "w")) as (template, generated):
2 changes: 2 additions & 0 deletions hyphe_backend/crawler/hcicrawler/settings-template.py
@@ -40,6 +40,8 @@
MONGO_QUEUE_COL = 'queue'
MONGO_PAGESTORE_COL = 'pages'

WEBARCHIVES_PASSWORD = '{{webarchives_password}}'

PHANTOM = {
"PATH": os.path.join('{{hyphePath}}', 'bin', 'hyphe-phantomjs-2.0.0'),
"JS_PATH": os.path.join('{{hyphePath}}', 'hyphe_backend', 'crawler', BOT_NAME, 'spiders', 'js'),
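The {{webarchives_password}} placeholder above is filled in at deploy time: deploy.py (previous file) reads HYPHE_WEBARCHIVES_PASSWORD from the environment and writes the resulting value into the generated hcicrawler/settings.py. A minimal sketch of that flow, assuming a simple string substitution rather than deploy.py's actual templating code:

```python
import os

# Simplified stand-in for deploy.py's config handling (illustrative only):
# the password comes from the environment, with an empty default.
config = {"webarchives_password": os.environ.get("HYPHE_WEBARCHIVES_PASSWORD", "")}

with open("hcicrawler/settings-template.py") as template, \
     open("hcicrawler/settings.py", "w") as generated:
    rendered = template.read()
    for key, value in config.items():
        # turns WEBARCHIVES_PASSWORD = '{{webarchives_password}}' into the real value
        rendered = rendered.replace("{{%s}}" % key, str(value))
    generated.write(rendered)
```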
92 changes: 62 additions & 30 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
@@ -2,6 +2,7 @@

import os, time, signal, re
import json
from hashlib import sha256
import logging
from datetime import datetime, timedelta

@@ -30,7 +31,7 @@
from hcicrawler.urllru import url_to_lru_clean, lru_get_host_url, lru_get_path_url, has_prefix, lru_to_url
from hcicrawler.tlds_tree import TLDS_TREE
from hcicrawler.items import Page
from hcicrawler.settings import HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL
from hcicrawler.settings import HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL, WEBARCHIVES_PASSWORD
from hcicrawler.errors import error_name

def timeout_alarm(*args):
@@ -91,20 +92,25 @@ def __init__(self, **kwargs):
if "option" not in self.webarchives or self.webarchives["option"] not in ARCHIVES_OPTIONS or not self.webarchives["option"]:
self.webarchives = {}
if self.webarchives:
self.webarchives["url_prefix"] = ARCHIVES_OPTIONS[self.webarchives["option"]].get("url_prefix", None)
self.webarchives["url_prefix"] = ARCHIVES_OPTIONS[self.webarchives["option"]].get("url_prefix", "") or ""
archivedate = re.sub(r"\D", "", str(self.webarchives["date"]))
self.archivedate = str(archivedate) + "120000"
archivedt = datetime.strptime(self.archivedate, "%Y%m%d%H%M%S")
self.archivemindate = datetime.strftime(archivedt - timedelta(days=self.webarchives["days_range"]/2., seconds=43200), "%Y%m%d%H%M%S")
self.archivemaxdate = datetime.strftime(archivedt + timedelta(days=self.webarchives["days_range"]/2., seconds=43199), "%Y%m%d%H%M%S")

archiveprefix = self.webarchives["url_prefix"].rstrip('/')
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14}).?/" % archiveprefix, re.I)
self.archivehost = "/".join(archiveprefix.split('/')[:3])
self.archivedomain_lru = url_to_lru_clean("http://%s" % get_domain_name(archiveprefix), TLDS_TREE)
archivedomain_regexp = "(?:%s|%s)" % (archiveprefix, archiveprefix.replace(self.archivehost, ""))
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % archivedomain_regexp, re.I|re.S)
archiveprefix = (self.webarchives["url_prefix"] or "").rstrip('/')
if archiveprefix:
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14}).?/" % archiveprefix, re.I)
self.archivehost = "/".join(archiveprefix.split('/')[:3])
self.archivedomain_lru = url_to_lru_clean("http://%s" % get_domain_name(archiveprefix), TLDS_TREE)
archivedomain_regexp = "(?:%s|%s)" % (archiveprefix, archiveprefix.replace(self.archivehost, ""))
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % archivedomain_regexp, re.I|re.S)
else:
self.archiveprefix = ""
self.archivetimestamp = str(int((archivedt - datetime(1970, 1, 1)).total_seconds()))


if "proxy" in ARCHIVES_OPTIONS[self.webarchives["option"]]:
self.proxy = ARCHIVES_OPTIONS[self.webarchives["option"]]["proxy"]
@@ -124,7 +130,7 @@ def start_requests(self):
self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], logging.INFO)
self.log("ARGUMENTS : "+str(self.args), logging.INFO)
if self.webarchives:
self.log("Crawling on Web Archive using for prefix %s between %s and %s" % (self.archiveprefix, self.archivemindate, self.archivemaxdate))
self.log("Crawling on Web Archive %s using for prefix %s between %s and %s" % (self.webarchives["option"], self.archiveprefix, self.archivemindate, self.archivemaxdate))
if self.proxy:
self.log("Using proxy %s" % self.proxy, logging.INFO)

@@ -180,6 +186,7 @@ def spider_closed(self, spider, reason=""):
os.remove(fi)

def handle_response(self, response):
# self.log("RESPONSE %s: %s" % (response.url, dict(response.headers)), logging.DEBUG)

if self.phantom:
self.phantom.get(response.url)
@@ -237,7 +244,7 @@ def handle_response(self, response):
("archivesinternet.bnf.fr" in self.webarchives["url_prefix"] and 300 <= response.status < 400 and \
(not response.body or "<head><title>301 Moved Permanently</title></head>" in response.body)):
redir_url = response.headers['Location']
if redir_url.startswith("/"):
if redir_url.startswith("/") and self.archiveprefix:
redir_url = "%s%s" % (self.archivehost, redir_url)
if "archivesinternet.bnf.fr" in self.webarchives["url_prefix"]:
if "depth" in response.meta:
@@ -248,20 +255,21 @@
if not response.body and redir > 10:
return self.parse_html(response)
return self._request(redir_url, redirection=redir, dont_filter=(not response.body))
real_url = self.archiveregexp.sub("", redir_url)
orig_url = self.archiveregexp.sub("", response.url)
match = self.archiveregexp.search(redir_url)
if match:
# Check date obtained fits into a user defined timerange and return 404 otherwise
if not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_url, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
return self._make_raw_page(response, archive_fail_url=redir_url)
if normalize(real_url) == normalize(orig_url):
if "depth" in response.meta:
response.meta['depth'] -= 1
else:
response.meta['depth'] = -1
return self._request(redir_url)
if self.archiveprefix:
real_url = self.archiveregexp.sub("", redir_url)
orig_url = self.archiveregexp.sub("", response.url)
match = self.archiveregexp.search(redir_url)
if match:
# Check date obtained fits into a user defined timerange and return 404 otherwise
if not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_url, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
return self._make_raw_page(response, archive_fail_url=redir_url)
if normalize(real_url) == normalize(orig_url):
if "depth" in response.meta:
response.meta['depth'] -= 1
else:
response.meta['depth'] = -1
return self._request(redir_url)
if response.status >= 400:
return self._make_raw_page(response)

@@ -289,7 +297,7 @@ def parse_html(self, response):
archive_timestamp = None
clean_body = None
orig_url = response.url
if self.webarchives:
if self.webarchives and self.archiveprefix:
orig_url = self.archiveregexp.sub("", orig_url)
lru = url_to_lru_clean(orig_url, TLDS_TREE)
lrulinks = []
@@ -299,7 +307,8 @@

skip_page = False
if self.webarchives:
redir_url = self.archiveredirect.search(response.body)
# TODO detect INA redirection cases
redir_url = self.archiveredirect.search(response.body) if self.archiveprefix else None
if "web.archive.org" in self.webarchives["url_prefix"]:
# Remove WEB ARCHIVES banner
clean_body = RE_WEB_ARCHIVES_BANNER.sub("", response.body)
@@ -322,6 +331,18 @@
# Remove BNF banner
clean_body = RE_BNF_ARCHIVES_BANNER.sub("", response.body)

elif self.webarchives["option"] == "dlweb.ina.fr":
archive_url = response.headers["X-Response-URL"]
# Check date obtained fits into a user defined timerange and return 404 otherwise
archive_timestamp = response.headers["X-Response-Time"]
if not archive_timestamp:
self.log("Skipping archive page (%s) for which archive date could not be found within response's headers (%s)." % (response.url, dict(response.headers)), logging.ERROR)
return
archive_timestamp = datetime.strftime((datetime(1970, 1, 1) + timedelta(seconds=int(archive_timestamp))), "%Y%m%d%H%M%S")
if not (self.archivemindate <= archive_timestamp <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (response.url, archive_timestamp, self.archivemindate, self.archivemaxdate), logging.DEBUG)
skip_page = archive_url

# Specific case of redirections from website returned by archives as JS redirections with code 200
elif redir_url:
response.status = int(redir_url.group(2))
@@ -339,7 +360,7 @@
if not skip_page and 300 <= response.status < 400:
redir_url = response.headers['Location']

if self.webarchives and self.archiveregexp.match(redir_url):
if self.webarchives and self.archiveprefix and self.archiveregexp.match(redir_url):
redir_url = self.archiveregexp.sub("", redir_url)

if redir_url.startswith('/'):
@@ -369,7 +390,7 @@
except AttributeError:
url = link['url']

if self.webarchives:
if self.webarchives and self.archiveprefix:
# Rewrite archives urls and filter internal archives links
url = self.archiveregexp.sub("", url)
if url.startswith(self.archivehost) or \
@@ -414,7 +435,7 @@ def _make_raw_page(self, response, modified_body=None, archive_fail_url=None):
p = Page()
p['url'] = response.url
if self.webarchives:
p['url'] = self.archiveregexp.sub("", response.url)
p['url'] = self.archiveregexp.sub("", response.url) if self.archiveprefix else response.headers.get("x-response-url", response.url)
p['archive_url'] = archive_fail_url or response.url
p['archive_date_requested'] = self.archivedate
if 'archive_timestamp' in response.meta and not archive_fail_url:
@@ -447,6 +468,17 @@ def _request(self, url, noproxy=False, redirection=0, **kw):
if self.phantom:
kw['method'] = 'HEAD'
if self.webarchives:
if self.webarchives["option"] == "dlweb.ina.fr":
# TODO: fix urls such as http://domain.com without trailing slash not working with INA archive
kw["headers"] = {
"X-DLWeb-Token": sha256("%s\n%s\n" % (WEBARCHIVES_PASSWORD, url)).hexdigest(),
"X-Request-Time": self.archivetimestamp,
"X-User": "HYPHE_%s_%s" % (HYPHE_PROJECT, self.crawler.settings['JOBID'])
}
kw["meta"]["archive_timestamp"] = self.archivedate
# self.log("REQUEST %s: %s" % (url, kw), logging.DEBUG)

return Request(url, **kw)
if "archivesinternet.bnf.fr" in self.webarchives["url_prefix"]:
kw['headers'] = {
"BnF-OSWM-User-Name": "WS-HYPHE_%s_%s" % (HYPHE_PROJECT, self.crawler.settings['JOBID'])
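For the new dlweb.ina.fr option, the spider authenticates each request against the INA proxy with a per-URL sha256 token and passes the requested date as seconds since the Unix epoch, while parse_html() converts the proxy's X-Response-Time header back into the 14-digit %Y%m%d%H%M%S format used for the date-range check. A standalone sketch of the request headers, with made-up password, URL, corpus and job values:

```python
from datetime import datetime
from hashlib import sha256

password = "example-secret"          # WEBARCHIVES_PASSWORD (illustrative value)
url = "http://www.example.org/"      # URL fetched through the INA proxy
archivedt = datetime.strptime("20120701120000", "%Y%m%d%H%M%S")

headers = {
    # sha256 over "<password>\n<url>\n"; .encode() is needed on Python 3,
    # the Python 2 spider above hashes the str directly
    "X-DLWeb-Token": sha256(("%s\n%s\n" % (password, url)).encode()).hexdigest(),
    # requested archive date, expressed as seconds since the Unix epoch
    "X-Request-Time": str(int((archivedt - datetime(1970, 1, 1)).total_seconds())),
    # identifies the crawl as HYPHE_<project>_<jobid>
    "X-User": "HYPHE_example-corpus_example-jobid",
}
```

The proxy's response is then expected to carry X-Response-URL and X-Response-Time (epoch seconds) headers, which the parse_html() hunk above converts back with datetime(1970, 1, 1) + timedelta(seconds=...) before comparing against archivemindate and archivemaxdate.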
1 change: 1 addition & 0 deletions hyphe_backend/lib/config_hci.py
@@ -132,6 +132,7 @@ def validateStartpagesMode(modes):
}, "webarchives": {
"type": dict,
"int_fields": ["days_range"],
"str_fields": ["password"],
"extra_fields": {
"options": webarchives.validateOptions,
"date": webarchives.validateArchiveDate
6 changes: 6 additions & 0 deletions hyphe_backend/lib/webarchives.py
@@ -19,6 +19,12 @@
"permalinks_prefix": "http://rntse.bnf.fr/jsp/lancerDL.jsp?appli=WAYBACK_URL&titre=la%20navigation%20dans%20les%20archives%20de%20l%27internet&url=http://archivesinternet.bnf.fr/DATETIME/SOURCEURL",
"proxy": "archivesinternet.bnf.fr:8090",
"min_date": "1996-01-01"
},
"dlweb.ina.fr": {
"label": "DLWeb.INA.fr",
"description": "crawl France's official web medias archives maintained by INA",
"proxy": "dlweb.ina.fr:82",
"min_date": "1996-01-01"
}
}

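Unlike the other entries, the dlweb.ina.fr option defines only a proxy and no url_prefix, so the spider's __init__() leaves archiveprefix empty and falls back to the epoch-seconds archivetimestamp path added above. This diff does not show how self.proxy is ultimately attached to outgoing requests; the standard Scrapy way to route a request through such an HTTP proxy, given here only as an assumed illustration, is via the request meta:

```python
from scrapy import Request

# Value taken from ARCHIVES_OPTIONS["dlweb.ina.fr"]["proxy"]
proxy = "dlweb.ina.fr:82"

# Generic HttpProxyMiddleware usage (a sketch, not necessarily Hyphe's wiring):
request = Request("http://www.example.org/", meta={"proxy": "http://%s" % proxy})
```

On the configuration side, a corpus would then select the option by listing "dlweb.ina.fr" in HYPHE_WEBARCHIVES_OPTIONS and setting HYPHE_WEBARCHIVES_PASSWORD in config-backend.env, matching the variables added in the first files of this commit.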
