"""
This program reads a gzip-compressed LATimes file (latimes.gz), extracts metadata (DOCNO, HEADLINE), and text from each document, and:
- Tokenizes text from the TEXT, HEADLINE, and GRAPHIC tags (without removing stopwords or stemming)
- Calculates and stores document lengths
- Converts tokens to integer IDs using a lexicon
- Builds an in-memory inverted index, mapping term IDs to document IDs and term frequencies
- Stores each document as a separate file in a directory structure based on the document's date (YY/MM/DD), using the DOCNO as the filename
Usage:
python index_engine.py <path_to_gz_file> <output_directory>
Arguments:
<path_to_gz_file>: path to the latimes.gz file containing the documents
<output_directory>: directory where the documents and metadata will be stored
Example:
python IndexEngine.py /home/smucker/latimes.gz /home/smucker/latimes-index
"""
from datetime import datetime
import json
import os
import gzip
import sys
import re
from collections import defaultdict
# global, in-memory index structures
docnos = []                   # internal id -> DOCNO
docno_to_id = {}              # DOCNO -> internal id
lexicon = {}                  # token -> term id
postings = defaultdict(list)  # term id -> list of (internal doc id, term frequency)
curr_tid = 0                  # next term id to assign
doc_lengths = []              # internal id -> document length in tokens
def main():
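    """Parse command-line arguments, stream documents out of the gzipped
    LATimes file, and pass each <DOC>...</DOC> block to process()."""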
    if len(sys.argv) != 3:
        print("Usage: python IndexEngine.py <path_to_gz_file> <output_dir>")
        sys.exit(1)
    input_gz = sys.argv[1]
    output_dir = sys.argv[2]
    if not os.path.exists(input_gz):
        print(f"Error: File '{input_gz}' does not exist.")
        sys.exit(1)
    if os.path.exists(output_dir):
        print(f"Error: Output directory '{output_dir}' already exists.")
        sys.exit(1)
    os.makedirs(output_dir, exist_ok=True)
    docno_list_file = os.path.join(output_dir, "docno_list.txt")
    docno_id_map_file = os.path.join(output_dir, "docno_id_map.json")
    with open(docno_list_file, "w") as map_out:
        with gzip.open(input_gz, "rt") as f:
            doc = ""
            within = False
            for line in f:
                if "<DOC>" in line:
                    within = True
                    doc = line
                elif "</DOC>" in line:
                    doc += line
                    process(doc, output_dir, map_out, len(docnos))
                    docnos.append(doc.split("</DOCNO>")[0].split("<DOCNO>")[1].strip())
                    doc = ""
                    within = False
                elif within:
                    doc += line
    with open(docno_id_map_file, "w") as f:
        json.dump(docno_to_id, f, indent=4)
    save(output_dir)
def process(doc, output_dir, map_out, iid):
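    """Index one document: extract its DOCNO, HEADLINE, and text, update the
    lexicon, postings, and document-length structures, and write the raw
    document plus a metadata file under the date-based directory."""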
    global curr_tid
    docno = extract(doc, "DOCNO")
    headline = extract(doc, "HEADLINE")
    year, month, day = parse_docno_to_date(docno)
    text_content = (
        extract(doc, "TEXT")
        + " "
        + extract(doc, "HEADLINE")
        + " "
        + extract(doc, "GRAPHIC")
    )
    tokens = []
    Tokenize(text_content, tokens)
    length = len(tokens)
    doc_lengths.append(length)
    tf = defaultdict(int)
    for token in tokens:
        if token not in lexicon:
            lexicon[token] = curr_tid
            curr_tid += 1
        tid = lexicon[token]
        tf[tid] += 1
    for tid, freq in tf.items():
        postings[tid].append((iid, freq))
    docno_to_id[docno] = iid
    map_out.write(docno + "\n")
    # normalize the headline to a single line
    headline = " ".join(headline.split())
    date_path = os.path.join(output_dir, year, month, day)
    os.makedirs(date_path, exist_ok=True)
    raw_doc_filename = os.path.join(date_path, f"{docno}.txt")
    with open(raw_doc_filename, "w") as raw_doc_file:
        raw_doc_file.write(f"{doc}\n")
    metadata_filename = os.path.join(date_path, f"{docno}.metadata.txt")
    with open(metadata_filename, "w") as metadata_file:
        metadata_file.write(f"docno: {docno}\n")
        metadata_file.write(f"internal id: {iid}\n")
        metadata_file.write(f"date: {format_date(year, month, day)}\n")
        metadata_file.write(f"headline: {headline}\n")
        metadata_file.write(f"document length: {length}\n")
# Based on SimpleTokenizer by Trevor Strohman,
# http://www.galagosearch.org/
def Tokenize(text, tokens):
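    """Lower-case the text and split it on every non-alphanumeric character,
    appending the resulting tokens to the tokens list in place."""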
    text = text.lower()
    start = 0
    i = 0
    for currChar in text:
        if not currChar.isdigit() and not currChar.isalpha():
            if start != i:
                token = text[start:i]
                tokens.append(token)
            start = i + 1
        i = i + 1
    if start != i:
        tokens.append(text[start:i])
def save(output_dir):
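    """Write the in-memory inverted index, lexicon, and document lengths
    to files in the output directory."""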
    inverted_index_file = os.path.join(output_dir, "inverted-index.json")
    lexicon_file = os.path.join(output_dir, "lexicon.json")
    doc_lengths_file = os.path.join(output_dir, "doc-lengths.txt")
    with open(inverted_index_file, "w") as f:
        json.dump(postings, f)
    with open(lexicon_file, "w") as f:
        json.dump(lexicon, f)
    with open(doc_lengths_file, "w") as f:
        for length in doc_lengths:
            f.write(f"{length}\n")
def extract(doc, tag):
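    """Return the text between <tag> and </tag> in doc, with any nested
    markup stripped; return an empty string if the tag is not present."""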
    start_tag = f"<{tag}>"
    end_tag = f"</{tag}>"
    start = doc.find(start_tag)
    end = doc.find(end_tag)
    if start == -1 or end == -1:
        return ""
    content = doc[start + len(start_tag) : end].strip()
    content = re.sub(r"<[^>]+>", "", content)
    return content
def parse_docno_to_date(docno):
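    """Derive (year, month, day) strings from a DOCNO, whose characters
    2-7 encode the date as MMDDYY; the year is prefixed with "19"."""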
    month = docno[2:4]
    day = docno[4:6]
    year = "19" + docno[6:8]
    return year, month, day
def format_date(year, month, day):
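    """Format the date for the metadata file, e.g. "September 30, 1989"."""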
    date_obj = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d")
    formatted_date = date_obj.strftime("%B %d, %Y")
    return formatted_date
if __name__ == "__main__":
    main()