-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtag.py
executable file
·70 lines (57 loc) · 2.26 KB
/
tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
from hashlib import md5
from pathlib import Path
import spacy
# Ask spaCy to use the GPU if one is available; issued before the pipelines
# below are loaded.
spacy.prefer_gpu()

# Language code -> loaded spaCy pipeline. The "ner" component is disabled for
# every pipeline.
NLP = {
    code: spacy.load(package, disable=["ner"])
    for code, package in (
        ("lat", "la_core_web_trf"),
        ("grc", "grc_proiel_trf"),
    )
}

# Root of the corpus tree, relative to the current working directory.
DATA_DIR = Path("data")
def get_files(directory):
    """Yield ``(lang, path)`` for each untagged Latin or Greek TSV file.

    The tree is walked exactly three levels deep:
    ``directory/<group>/<work>/<file>``. A ``.git`` entry directly under
    *directory* is skipped, and so is any entry at the group or work level
    that is not a directory.

    Args:
        directory: ``pathlib.Path`` of the directory to scan.

    Yields:
        Tuples ``(lang, file)`` where *lang* is ``"lat"`` or ``"grc"`` and
        *file* is the ``Path`` of a ``.tsv`` file whose stem does not contain
        ``.tagged``. Files whose stem contains neither language code are
        ignored.
    """
    for group in directory.iterdir():
        if not group.is_dir() or group.name == ".git":
            continue
        for work in group.iterdir():
            # A stray regular file next to the work directories cannot be
            # iterated; skip it instead of raising NotADirectoryError.
            if not work.is_dir():
                continue
            for file in work.iterdir():
                if file.suffix != ".tsv" or ".tagged" in file.stem:
                    continue
                # Plain substring match on the stem; "lat" is tested first,
                # so a stem containing both codes is reported as Latin.
                if "lat" in file.stem:
                    yield "lat", file
                elif "grc" in file.stem:
                    yield "grc", file
def _is_up_to_date(md5_path, digest):
    """Return True when *md5_path* exists and already records *digest*."""
    if not md5_path.exists():
        return False
    # Ignore surrounding whitespace so a trailing newline in the checksum
    # file does not force a needless re-tag.
    return md5_path.read_text().strip() == digest


def _tag_file(lang, file):
    """Tag every line of *file* and write the tokens to ``*.tagged.tsv``.

    Each input line must be ``ref<TAB>text``. One output row is written per
    token: ref, token index, text, POS, tag, morphology, lemma, dependency
    label and head index, tab-separated.

    The output is written to a temporary sibling file and renamed into place
    only after every line succeeded, so a failure part-way through never
    leaves a truncated ``.tagged.tsv`` behind or clobbers an earlier one.

    Args:
        lang: Key into ``NLP`` selecting the pipeline.
        file: ``pathlib.Path`` of the input TSV.

    Returns:
        True if the file was tagged completely, False if the pipeline raised
        ``RuntimeError`` on some line (the error is printed).
    """
    nlp = NLP[lang]
    # Loop-invariant: raise the pipeline's text-length limit once per file.
    nlp.max_length = 2_000_000

    out_path = file.with_suffix(".tagged.tsv")
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        with open(file, encoding="utf-8") as infile:
            with open(tmp_path, "w", encoding="utf-8") as outfile:
                for line in infile:
                    # One progress dot per input line.
                    print(".", end="", flush=True)
                    ref, text = line.rstrip("\n").split("\t")
                    try:
                        doc = nlp(text)
                    except RuntimeError as e:
                        print(f"Error in {file}: {ref} : {e}")
                        return False
                    for token in doc:
                        print(
                            ref,
                            token.i,
                            token.text,
                            token.pos_,
                            token.tag_,
                            token.morph,
                            token.lemma_,
                            token.dep_,
                            token.head.i,
                            sep="\t",
                            file=outfile,
                        )
        # Both files are closed here; publish the finished output.
        tmp_path.replace(out_path)
        return True
    finally:
        # Present only if tagging failed or was interrupted before the rename.
        if tmp_path.exists():
            tmp_path.unlink()


# Walk data/<shard>/... and (re)tag every input file whose content hash
# differs from the one recorded in its "<stem>.tagged.md5" sidecar.
unchanged = 0
changed = 0
for shard in DATA_DIR.iterdir():
    if not shard.is_dir():
        continue
    for lang, file in get_files(shard):
        # Hash the decoded text (re-encoded as UTF-8), matching how the
        # sidecar digests are produced below.
        digest = md5(file.read_text(encoding="utf-8").encode("utf-8")).hexdigest()
        md5_path = file.parent.joinpath(f"{file.stem}.tagged.md5")
        if _is_up_to_date(md5_path, digest):
            print(f"{file} unchanged")
            unchanged += 1
            continue
        print(f"{file} changed", end="...")
        if not _tag_file(lang, file):
            # Leave the sidecar untouched so the file is retried next run.
            continue
        print("tagged.")
        md5_path.write_text(digest)
        changed += 1
print()
print(f"{changed} changed")
print(f"{unchanged} unchanged")