-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchallenge.py
76 lines (60 loc) · 2.07 KB
/
challenge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Minimal normalization of json file containing organization affiliation data from https://arxiv.org/"""
from distutils.util import change_root
import json
import re
import csv
import sys
import logging
# Lower-case abbreviation -> full word used as its replacement.
# Insertion order is the order in which the substitutions are applied.
# 'tech' could be 'Technology' or 'Technological' so it is not included.
ABBREVIATIONS = {
    "u": "University",
    "uni": "University",
    "univ": "University",
    "dept": "Department",
    "inst": "Institute",
    "nat": "National",
    "chem": "Chemistry",
    "div": "Division",
    "obs": "Observatory",
    "phys": "Physics",
}
def fix_abbreviations(items: list) -> tuple:
    """Normalize the affiliation string of every item.

    Each distinct affiliation is normalized once; repeated affiliations reuse
    the cached result instead of being run through the regexes again.

    Args:
        items: Sequence of mappings, each with an "author_affiliation" key
            holding the raw affiliation string.

    Returns:
        A ``(final_affiliations, change_count)`` tuple.
        ``final_affiliations`` is a list of ``(original, normalized)`` pairs
        in input order; ``change_count`` is the number of items whose
        normalized name differs from the original.

    Raises:
        KeyError: If an item has no "author_affiliation" key.
    """
    normalized = {}  # cache: raw affiliation -> normalized affiliation
    final_affiliations = []
    change_count = 0
    for item in items:
        org = item["author_affiliation"]
        normal_org = normalized.get(org)
        if normal_org is None:
            # First time this affiliation is seen: compute and remember it.
            # Compare with None (not truthiness) so an empty-string result is
            # cached like any other value.
            normal_org = normalize_name(org)
            normalized[org] = normal_org
        # Count only real changes, whether the value came from the cache or
        # was just computed.
        if normal_org != org:
            change_count += 1
        final_affiliations.append((org, normal_org))
    return final_affiliations, change_count
def normalize_name(org):
    """Return ``org`` with every known abbreviation replaced by its full word.

    The entries of ``ABBREVIATIONS`` are applied one after another, in the
    dictionary's iteration order, each on the result of the previous
    substitution. Matching is case-insensitive. An abbreviation matches only
    when it starts at a word boundary and is followed by the end of the
    string, a period (which is consumed by the replacement), or another word
    boundary.
    """
    result = org
    for abbreviation, expansion in ABBREVIATIONS.items():
        matcher = re.compile(rf"(?i)\b{abbreviation}($|\.|\b)")
        result = matcher.sub(expansion, result)
    return result
# TODO: normalize missing values: '', 'na', 'n/a', etc.
# TODO: normalize orgs listed as one character
# TODO: remove repeated phrases like "for the"
if __name__ == "__main__":
    # Command-line entry point:
    #   <script> <input_file_name> <output_filename>
    # Reads a JSON list of records, normalizes each record's affiliation and
    # writes (original, normalized) pairs to a CSV file.
    # TODO handle invalid arguments
    if len(sys.argv) != 3:
        raise IndexError(
            "Please try running again with arguments <input_file_name> <output_filename>")
    input_file = sys.argv[1]
    output_file = sys.argv[2]

    # Open the input exactly once and parse from that handle, so it is closed
    # when the block exits. JSON text is UTF-8, so do not rely on the
    # platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as source:
        data = json.load(source)

    normalized, change_count = fix_abbreviations(data)
    print("change count: " + str(change_count))

    # newline="" lets the csv writer control line endings itself; without it
    # an extra blank row appears after every record on Windows. The explicit
    # encoding keeps non-ASCII affiliation names from failing on platforms
    # whose default encoding cannot represent them.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f, dialect="excel")
        w.writerow(["original_affiliation", "normalized_affiliation"])
        w.writerows(normalized)
    print(f"File saved as {output_file}")