Add more cmd args, fix upper/lower char mismatch

argosopentech · Sep 17, 2024 · c56a843 · c56a843
1 parent 8b96e5e
commit c56a843
Showing 1 changed file with 27 additions and 10 deletions.
diff --git a/bin/generate-wiktionary-data b/bin/generate-wiktionary-data
@@ -8,13 +8,17 @@ import argparse
 from pathlib import Path
 
 # Configure
-sl = 'en'
-tl = 'es'
 
 parser = argparse.ArgumentParser()
 parser.add_argument("wikidata", help="path to Wiktionary JSON file")
-parser.add_argument("wikidata2", help="path to second Wiktionary JSON file (optional)")
+parser.add_argument("wikidata2", nargs='?', help="path to second Wiktionary JSON file (optional)")
+parser.add_argument("--source", type=str, default='en', help="source language code. Default: %(default)s")
+parser.add_argument("--target", type=str, default='es', help="target language code. Default: %(default)s")
+parser.add_argument("--force", help="overwrite existing files if they already exists. Default: no", action="store_true")
+
 args = parser.parse_args()
+sl = args.source
+tl = args.target
 
 # Read JSON
 wikidata = []
@@ -29,11 +33,22 @@ print("Read JSON into memory")
 
 source_data = []
 target_data = []
+
+def add_data(source, target):
+    # Match the case of target's first character to source's (capitalization is inconsistent across Western-language entries); indexes [0], so both strings must be non-empty
+    if source[0].isupper() and target[0].islower():
+        target = target[0].upper() + target[1:]
+    elif source[0].islower() and target[0].isupper():
+        target = target[0].lower() + target[1:]
+
+    source_data.append(source)
+    target_data.append(target)
+
 for data in wikidata:
     word = data.get("word")
     translations = data.get("translations")
     lang_code = data.get("lang_code")
-    if not word or not translations:
+    if not word or not translations or word.startswith("-"):
         continue
     if lang_code == sl:
         target_translations = list(filter(lambda x: x.get("code") == tl, translations))
@@ -42,17 +57,17 @@ for data in wikidata:
         target_translation = target_translations[0].get("word")
         if not target_translation:
             continue
-        source_data.append(word)
-        target_data.append(target_translation)
+
+        add_data(word, target_translation)
     elif lang_code == tl:
         target_translations = list(filter(lambda x: x.get("code") == sl, translations))
         if len(target_translations) < 1:
             continue
         target_translation = target_translations[0].get("word")
         if not target_translation:
             continue
-        target_data.append(word)
-        source_data.append(target_translation)
+        # Here word is the tl entry and target_translation is its sl rendering, so pass the sl text as source
+        add_data(target_translation, word)
 
 # Extract single word translation data
 """
@@ -124,8 +139,10 @@ for filename, data in [
     ("wiktionary." + tl, target_data),
 ]:
     filename = Path(filename)
-    assert not filename.exists()
+    if not args.force:
+        assert not filename.exists()
+
     data_file = open(filename, "w")
     data_file.write("\n".join(data))
     data_file.close()
-print("Wrote to wiktionary.*")
+    print("Wrote %s" % filename)