-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtbx_merge.py
236 lines (202 loc) · 7.86 KB
/
tbx_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
from copy import deepcopy
import os
import argparse
import requests
from pathlib import Path
from xml.etree import ElementTree as et
# Prefix-to-URI map handed to ElementTree `find()` calls so that XPath
# expressions in this module can use the `xml:` prefix (e.g. `@xml:lang`).
nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
class hashabledict(dict):
    """A `dict` subclass that can be hashed.

    The hash is computed from the items in sorted order, so two instances
    holding the same key/value pairs hash identically regardless of
    insertion order. Keys and values must be sortable and hashable.
    """

    def __hash__(self):
        ordered_items = sorted(self.items())
        return hash(tuple(ordered_items))
class XMLCombiner(object):
    """Merge several XML files into a single element tree.

    The root of the first file is used as the base; the roots of all other
    files are folded into it by `combine`.
    """

    def __init__(self, filenames):
        """Parse every file in `filenames` and store the root elements.

        Args:
            filenames: Sequence of paths (or file objects) accepted by
                `ElementTree.parse`.

        Raises:
            FileNotFoundError: If `filenames` is empty.
            SyntaxError: If a file cannot be parsed as XML
                (`ElementTree.ParseError` is a `SyntaxError` subclass).
                A hint is printed before the error is re-raised.
        """
        if not filenames:
            raise FileNotFoundError("Invalid path, or path contains no valid files.")
        try:
            self.roots = [et.parse(f).getroot() for f in filenames]
        except SyntaxError:
            print(
                "Invalid tbx file, possibly due to incorrect locale code. Check your locales file to ensure all locale codes are valid."
            )
            # Re-raise: without `self.roots` the instance is unusable and
            # `combine()` would otherwise fail later with an unrelated
            # AttributeError.
            raise

    def combine(self):
        """Fold every parsed root into the first one and return it as an ElementTree."""
        base = self.roots[0]
        for r in self.roots[1:]:
            self.combine_element(base, r)
        return et.ElementTree(base)

    def combine_element(self, one, other):
        """Recursively merge the children of `other` into `one` (in place).

        Children are matched on the pair (tag, attributes):
          * no match in `one`      -> the child of `other` is appended to `one`;
          * match, child is a leaf -> the matching element's text is overwritten;
          * match, child is nested -> the two elements are merged recursively.
        """
        mapping = {(el.tag, hashabledict(el.attrib)): el for el in one}
        for el in other:
            key = (el.tag, hashabledict(el.attrib))
            match = mapping.get(key)
            # Compare against None explicitly: an Element without children
            # is falsy, so a plain truth test would misfire.
            if match is None:
                # Element not found in the mapping
                mapping[key] = el
                one.append(el)
            elif len(el) == 0:
                # Not nested
                match.text = el.text
            else:
                # Nested: recursively process the element in the same way
                self.combine_element(match, el)
def extract_smartling_id_term(f):
    """Build a lookup from (term, definition) to the Smartling entry ID.

    Args:
        f: Path or file object of the Smartling glossary TBX export,
            as accepted by `ElementTree.parse`.

    Returns:
        dict mapping `(en-US term text, definition text or None)` to the
        `id` attribute of the `termEntry` it came from. If several entries
        share the same key, the last one wins.

    Entries that have no en-US term or no `id` attribute cannot be used for
    matching and are skipped.
    """
    root = et.parse(f).getroot()
    smartling_map = {}
    for term_entry in root.iter("termEntry"):
        term_el = term_entry.find(
            "./langSet[@xml:lang='en-US']/tig/term",
            nsmap,
        )
        entry_id = term_entry.attrib.get("id")
        if term_el is None or entry_id is None:
            continue
        # The definition is optional; a missing one is recorded as None.
        definition_el = term_entry.find(
            "./descrip[@type='definition']",
            nsmap,
        )
        definition = definition_el.text if definition_el is not None else None
        smartling_map[(term_el.text, definition)] = entry_id
    return smartling_map
def replace_pontoon_ids(etree, smartling_map):
    """Swap Pontoon entry IDs for Smartling IDs, in place.

    For every `termEntry` in `etree`, the en-US term and its definition
    (None when absent) are looked up in `smartling_map`. On an exact match
    the entry's `id` attribute is set to the Smartling ID; otherwise the
    `id` attribute is removed.

    Args:
        etree: ElementTree holding the merged Pontoon glossary.
        smartling_map: dict of `(term, definition) -> Smartling ID`.

    Every entry is handled individually, so entries sharing the same term
    and definition all receive the same Smartling ID, and entries without
    an en-US term simply lose their ID.
    """
    root = etree.getroot()
    for term_entry in root.iter("termEntry"):
        smartling_id = None
        term_el = term_entry.find(
            "./langSet[@xml:lang='en-US']/ntig/termGrp/term",
            nsmap,
        )
        if term_el is not None:
            definition_el = term_entry.find(
                "./langSet[@xml:lang='en-US']/descripGrp/descrip[@type='definition']",
                nsmap,
            )
            definition = definition_el.text if definition_el is not None else None
            smartling_id = smartling_map.get((term_el.text, definition))
        if smartling_id is not None:
            term_entry.attrib["id"] = smartling_id
        else:
            term_entry.attrib.pop("id", None)
def remove_all_ids(etree):
    """Strip the `id` attribute from every `termEntry` in `etree`, in place.

    Smartling only accepts IDs it generated itself, so dropping them makes
    every term register as a new entry.
    """
    for entry in etree.getroot().iter("termEntry"):
        if "id" in entry.attrib:
            del entry.attrib["id"]
def export_tbx(locale_list):
    """Download the Pontoon terminology TBX file for each locale.

    Files are saved as `<locale>_pontoon.tbx` in a `pontoon_exports`
    directory located two levels above this script's directory; the
    directory is created if needed.

    Args:
        locale_list: Iterable of Pontoon locale codes.

    Returns:
        Sorted list of `Path` objects for every `*.tbx` file present in the
        export directory (including files left there by earlier runs).

    A failed download (network error, timeout, HTTP error status) or a
    failed write is printed and skipped; the remaining locales are still
    processed.
    """
    root_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
    )
    locale_path = os.path.join(root_path, "pontoon_exports")
    os.makedirs(locale_path, exist_ok=True)
    for locale in locale_list:
        try:
            response = requests.get(
                f"https://pontoon.mozilla.org/terminology/{locale}.v2.tbx",
                timeout=30,  # requests waits indefinitely without a timeout
            )
            # Do not save an HTTP error page to disk as if it were a TBX file.
            response.raise_for_status()
            with open(os.path.join(locale_path, f"{locale}_pontoon.tbx"), "wb") as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print(e)
    # Sorted so the order of the returned files does not depend on the
    # filesystem's directory listing order.
    return sorted(Path(locale_path).glob("*.tbx"))
def convert_locale_codes(etree):
    """Rewrite `xml:lang` on every `langSet` from Mozilla to Smartling codes, in place.

    Each Mozilla locale maps to one or more Smartling locales. The first
    Smartling code replaces the `xml:lang` value of the existing `langSet`;
    for every additional code a deep copy of that `langSet` is appended to
    the enclosing `termEntry` with the additional code. `langSet` elements
    whose language is missing or not in the map are left untouched.

    Args:
        etree: ElementTree holding the glossary to convert.
    """
    smartling_locale_map = {
        "bn": ["bn-BD"],
        "bg": ["bg-BG"],
        "cs": ["cs-CZ"],
        "da": ["da-DK"],
        "de": ["de-DE"],
        "el": ["el-GR"],
        "et": ["et-EE"],
        "fa": ["fa-IR"],
        "fi": ["fi-FI"],
        "fr": ["fr-CA", "fr-FR"],
        "hr": ["hr-HR"],
        "hu": ["hu-HU"],
        "id": ["id-ID"],
        "is": ["is-IS"],
        "it": ["it-IT"],
        "ja": ["ja-JP"],
        "ko": ["ko-KR"],
        "lt": ["lt-LT"],
        "lv": ["lv-LV"],  # was "lv-lv"; region subtag upper-cased like all others
        "ms": ["ms-MY"],
        "nl": ["nl-NL"],
        "pl": ["pl-PL"],
        "ro": ["ro-RO"],
        "ru": ["ru-RU"],
        "sk": ["sk-SK"],
        "sl": ["sl-SI"],
        "tr": ["tr-TR"],
        "uk": ["uk-UA"],
        "vi": ["vi-VN"],
    }
    # ElementTree stores namespaced attributes under "{uri}local" keys.
    lang_attr = f"{{{nsmap['xml']}}}lang"
    root = etree.getroot()
    for term_entry in root.iter("termEntry"):
        # Snapshot the langSets first: copies are appended to `term_entry`
        # below and must not be fed back into this loop.
        for lang_set in list(term_entry.iter("langSet")):
            smartling_locales = smartling_locale_map.get(lang_set.attrib.get(lang_attr))
            if not smartling_locales:
                continue
            primary, *extras = smartling_locales
            lang_set.attrib[lang_attr] = primary
            for locale in extras:
                lang_set_copy = deepcopy(lang_set)
                lang_set_copy.attrib[lang_attr] = locale
                term_entry.append(lang_set_copy)
def main():
    """Command-line entry point.

    Reads the locale codes listed in the `--locales` file, downloads the
    matching TBX files from Pontoon, merges them into one tree and writes
    the result to a file whose name and ID handling depend on `--id-format`:

      * pontoon   -> pontoon_glossary_multilingual.tbx (IDs untouched)
      * smartling -> smartling_merge_glossary.tbx (IDs taken from the
                     Smartling export given by `--smartling`, locale codes
                     converted)
      * new       -> smartling_new_glossary.tbx (all IDs removed)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--locales",
        required=True,
        dest="locale_list",
        help="Path to .txt file with each required locale code entered on a new line. The appropriate .tbx file will be exported from Pontoon.",
    )
    parser.add_argument(
        "--id-format",
        required=True,
        dest="ids",
        choices=["smartling", "new", "pontoon"],
        help="Define how to set IDs for termEntry. Select smartling for importing into an existing Smartling glossary, new for creating a new Smartling glossary, or pontoon to preserve Pontoon IDs.",
    )
    parser.add_argument(
        "--smartling",
        dest="smartling_export",
        help="Path to glossary tbx file exported from Smartling. Required when using --id-format smartling",
    )
    args = parser.parse_args()
    if args.ids == "smartling" and not args.smartling_export:
        parser.error(
            "Path to Smartling glossary tbx file not defined (--smartling argument required)."
        )

    with open(args.locale_list, encoding="utf-8") as f:
        # Ignore blank lines so they are not treated as (empty) locale codes.
        locale_list = [line.strip() for line in f if line.strip()]

    merge_files = export_tbx(locale_list)
    merged_tree = XMLCombiner(merge_files).combine()

    # `choices` guarantees exactly one of these three modes.
    if args.ids == "pontoon":
        merged_tree.write(
            "pontoon_glossary_multilingual.tbx", encoding="UTF-8", xml_declaration=True
        )
    elif args.ids == "smartling":
        smartling_map = extract_smartling_id_term(args.smartling_export)
        replace_pontoon_ids(merged_tree, smartling_map)
        convert_locale_codes(merged_tree)
        merged_tree.write(
            "smartling_merge_glossary.tbx", encoding="UTF-8", xml_declaration=True
        )
    elif args.ids == "new":
        remove_all_ids(merged_tree)
        merged_tree.write(
            "smartling_new_glossary.tbx", encoding="UTF-8", xml_declaration=True
        )
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()