Skip to content

Commit

Permalink
Support the new header after 2020-34
Browse files (browse the repository at this point in the history)
  • Loading branch information
akeyhero committed Jul 19, 2023
1 parent bda555b commit bbcd656
Showing 1 changed file with 13 additions and 5 deletions.
18 changes: 13 additions & 5 deletions cc_net/process_wet_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,21 @@ def parse_doc(headers: List[str], doc: List[str]) -> Optional[dict]:
         return None
 
     try:
-        warc_type = headers[1].split()[1]
+        headers_map = {}
+
+        for header in headers[1:]:
+            if not header:
+                continue
+            key, value = header.split(": ", 1)
+            headers_map[key] = value
+
+        warc_type = headers_map["WARC-Type"]
         if warc_type != "conversion":
             return None
-        url = headers[2].split()[1]
-        date = headers[3].split()[1]
-        digest = headers[6].split()[1]
-        length = int(headers[8].split()[1])
+        url = headers_map["WARC-Target-URI"]
+        date = headers_map["WARC-Date"]
+        digest = headers_map["WARC-Block-Digest"]
+        length = int(headers_map["Content-Length"])
     except Exception as e:
         logger.warning("Can't parse header:", e, headers, doc)
         return None
Expand Down

0 comments on commit bbcd656

Please sign in to comment.