-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate-log-warc.py
executable file
·62 lines (51 loc) · 2.36 KB
/
create-log-warc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python
from argparse import ArgumentParser
from pathlib import Path
from datetime import datetime, timezone
import importlib.metadata
import sys
from warcio import WARCWriter
from edgi_versionista_warc.versionista_warc import format_datetime_iso
from edgi_versionista_warc.warctools import WARC_VERSION
def cli() -> None:
    """Package an ``edgi-versionista-warc`` log file into a WARC file.

    Command-line arguments:
        path: A log file, or a directory in which case ``log.txt`` inside
            it is used.
        --uncompressed: Write a plain ``.warc`` instead of a gzipped
            ``.warc.gz``.

    The WARC is written next to the log file and is named
    ``<log stem>--<timestamp>.warc[.gz]``, where the timestamp is the log
    file's ``st_ctime`` expressed in UTC. It contains a ``warcinfo`` record
    followed by one ``resource`` record whose payload is the raw bytes of
    the log file.

    Prints a message and exits with status 1 when the log file does not
    exist.
    """
    parser = ArgumentParser(description="Create a WARC containing log file from the `edgi-versionista-warc` script")
    parser.add_argument('--uncompressed', action='store_true', help='Create uncompressed `.warc` files instead of gzipped `.warc.gz` files')
    parser.add_argument('path', help='Path to directory or log file to build WARC for')
    configuration = parser.parse_args()
    gzip = not configuration.uncompressed

    log_path = Path(configuration.path)
    if log_path.is_dir():
        log_path = log_path / 'log.txt'

    if not log_path.exists():
        print(f'No log file found at {log_path}')
        sys.exit(1)

    # NOTE(review): `st_ctime` is the metadata-change time on Unix and the
    # creation time on Windows -- confirm this is preferred over `st_mtime`.
    # Build the aware UTC datetime directly rather than going through naive
    # local time first.
    log_time = datetime.fromtimestamp(log_path.stat().st_ctime, tz=timezone.utc)

    # Format the timestamp outside the f-string: reusing the same quote
    # character inside an f-string expression only parses on Python 3.12+.
    timestamp = log_time.strftime('%Y-%m-%dT%H%M%S')
    warc_suffix = f'--{timestamp}.warc'
    if gzip:
        warc_suffix += '.gz'
    warc_name = f'{log_path.stem}{warc_suffix}'
    warc_path = log_path.parent / warc_name

    with warc_path.open('wb') as warcfile:
        warc = WARCWriter(warcfile, gzip=gzip, warc_version=WARC_VERSION)
        warc.write_record(warc.create_warcinfo_record(warc_name, {
            'software': f'warcio/{importlib.metadata.version("warcio")}',
            'format': f'WARC file version {WARC_VERSION}',
            # NOTE(review): the address below looks redacted in this copy of
            # the file -- confirm the real operator address before release.
            'operator': '"Environmental Data & Governance Initiative" <[email protected]>',
            'description': (
                "Log file listing notices and warnings when generating WARCs of web content captured by EDGI's "
                'Web Monitoring project using Versionista (https://versionista.com).'
            )
        }))

        # Opened in binary mode so the log is stored as a raw byte payload.
        with log_path.open('rb') as logfile:
            warc.write_record(warc.create_warc_record(
                None,  # no target URI is supplied for the log record
                'resource',
                warc_headers_dict={
                    'WARC-Date': format_datetime_iso(log_time),
                    'Content-Type': 'text/plain'
                },
                payload=logfile
            ))

    print(f'Wrote WARC file to {warc_path}')
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()