-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_anafora_timenorm.py
131 lines (107 loc) · 5.38 KB
/
extract_anafora_timenorm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import argparse
import anafora
# Annotation types that convert_xml treats as operators when it decides which
# of several annotations sharing one span should be kept.
operator_types = [
    "This", "Last", "Next",
    "Before", "After", "Between",
    "Sum", "Difference", "Union", "Intersection",
    "Every-Nth", "NthFromStart", "NthFromEnd",
    "Frequency",
]
def remove_item(element, item_name):
    """Remove the first direct child named ``item_name`` from ``element.xml``.

    Args:
        element: Object exposing an ``xml`` attribute that supports
            ``find`` and ``remove`` (e.g. an ElementTree element).
        item_name: Tag (or path) handed to ``element.xml.find``.

    A missing child is silently ignored: ``find`` returns ``None`` in that
    case, and passing ``None`` on to ``remove`` would raise ``TypeError``.
    """
    item = element.xml.find(item_name)
    if item is not None:
        element.xml.remove(item)
def convert_xml(xml_path, output_path, raw_path=None):
    """Strip one Anafora XML file down to typed entity spans and save it.

    Loads ``xml_path``, removes every ``Event``, ``PreAnnotation`` and
    ``NotNormalizable`` annotation, drops the ``parentsType`` and
    ``properties`` children of the remaining annotations, removes
    annotations that repeat a type already seen on one of their spans, and
    then tries to reduce every span that still carries several annotations
    to a single one. If any span cannot be reduced, a diagnostic is printed
    and nothing is written.

    Args:
        xml_path: Path of the Anafora XML file to read.
        output_path: Path the converted XML is written to. Missing parent
            directories are created.
        raw_path: Optional path of the raw text; it is only used to print
            the text surrounding a span that could not be reduced.
    """
    data = anafora.AnaforaData.from_file(xml_path)
    wrong_patterns = False

    raw = None
    if raw_path is not None:
        with open(raw_path) as raw_file:
            raw = raw_file.read()

    # Remove Events, PreAnnotations and NotNormalizable annotations.
    # Materialize each selection first so removal does not disturb iteration.
    for unwanted_type in ("Event", "PreAnnotation", "NotNormalizable"):
        for annotation in list(data.annotations.select_type(unwanted_type)):
            data.annotations.remove(annotation)

    # Remove everything but Type, and group the annotations by span.
    annotations_by_span = {}
    duplicate_annotations = set()
    for annotation in data.annotations:
        remove_item(annotation, "parentsType")
        remove_item(annotation, "properties")
        for span in annotation.spans:
            annotations_in_span = annotations_by_span.setdefault(span, set())
            if any(other.type == annotation.type for other in annotations_in_span):
                # The span already holds an annotation of this type.
                duplicate_annotations.add(annotation)
            else:
                annotations_in_span.add(annotation)

    # Remove duplicate annotations.
    # NOTE(review): a multi-span annotation flagged as duplicate on one span
    # may still be registered in annotations_by_span under an earlier span;
    # confirm whether it should be excluded from the analysis below.
    for annotation in duplicate_annotations:
        data.annotations.remove(annotation)

    # Remove implicit operators and unwanted entities.
    for span, annotations in annotations_by_span.items():
        if len(annotations) <= 1:
            continue

        operators = [annotation for annotation in annotations
                     if annotation.type in operator_types]
        num_operators = len(operators)
        num_non_operators = len(annotations) - num_operators

        to_remove = set()
        if num_non_operators > 1:
            # A Number sharing its span with a Period is dropped.
            if any(annotation.type == "Period" for annotation in annotations):
                to_remove.update(annotation for annotation in annotations
                                 if annotation.type == "Number")
        if num_operators > 1:
            to_remove.update(annotation for annotation in annotations
                             if annotation.type == "Intersection")
        if num_operators > 0 and num_non_operators > 0:
            to_remove.update(operators)

        if len(annotations) - len(to_remove) == 1:
            for annotation in to_remove:
                data.annotations.remove(annotation)
        else:
            wrong_patterns = True
            print("Wrong annotation pattern: %s" % annotations)
            if raw is not None:
                start, end = span
                # Clamp the left edge: a negative slice start would count
                # from the end of the text and show the wrong context.
                print("TEXT: %s" % raw[max(0, start - 10):end + 10])

    if wrong_patterns:
        print("Data with wrong patterns. File %s will not be saved.\n" % output_path)
    else:
        output_document = os.path.split(output_path)[0]
        # An empty directory part means "current directory": nothing to create.
        if output_document:
            os.makedirs(output_document, exist_ok=True)
        data.to_file(output_path)
def convert_dir(input_dir, output_dir, raw_dir=None):
    """Convert every TimeNorm gold annotation file found under ``input_dir``.

    Walks ``input_dir`` with ``anafora.walk`` and calls ``convert_xml`` on
    each XML file whose name ends with ``.TimeNorm.gold.completed.xml``.
    Each result is written to ``output_dir/<document name>/<xml name>``.

    Args:
        input_dir: Root of the Anafora directory structure to read.
        output_dir: Root directory for the converted files; created
            (including missing parents) when it does not exist.
        raw_dir: Optional root of the raw texts. When given, the raw file
            is looked up at ``raw_dir/<document dir>/<document name>`` and
            passed on to ``convert_xml`` for its diagnostics.
    """
    # makedirs(exist_ok=True) also handles nested roots and avoids the
    # check-then-create race of exists() followed by mkdir().
    os.makedirs(output_dir, exist_ok=True)

    for document_dir, document_name, xml_names in anafora.walk(input_dir):
        for xml_name in xml_names:
            if not xml_name.endswith(".TimeNorm.gold.completed.xml"):
                continue
            xml_path = os.path.join(input_dir, document_dir, xml_name)
            output_path = os.path.join(output_dir, document_name, xml_name)
            raw_path = None
            if raw_dir is not None:
                raw_path = os.path.join(raw_dir, document_dir, document_name)
            convert_xml(xml_path, output_path, raw_path)
if __name__ == "__main__":
    # Command-line entry point: parse the directory options and run the
    # conversion over the whole input tree.
    parser = argparse.ArgumentParser(description="""%(prog)s converts one directory of Anafora XML annotations
into the SemEval-2021 shared task format.""")
    parser.add_argument(
        "-i", "--input", metavar="DIR", dest="input_dir", required=True,
        help="The root of a set of Anafora XML directories representing reference annotations.",
    )
    parser.add_argument(
        "-o", "--output", metavar="DIR", dest="output_dir", required=True,
        help="The root of the directory structure where the converted Anafora XML will be stored.",
    )
    parser.add_argument(
        "-r", "--raw", metavar="DIR", dest="raw_dir", required=False,
        help="The root of directories containing the raw texts for debugging.",
    )
    args = parser.parse_args()
    convert_dir(args.input_dir, args.output_dir, raw_dir=args.raw_dir)