Skip to content

Commit

Permalink
Add more cmd args, fix upper/lower char mismatch
Browse files Browse the repository at this point in the history
  • Loading branch information
pierotofy authored and argosopentech committed Sep 17, 2024
1 parent 8b96e5e commit c56a843
Showing 1 changed file with 27 additions and 10 deletions.
37 changes: 27 additions & 10 deletions bin/generate-wiktionary-data
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@ import argparse
from pathlib import Path

# Configure: input paths and language codes all come from the command line.
parser = argparse.ArgumentParser()
parser.add_argument("wikidata", help="path to Wiktionary JSON file")
# nargs='?' is what makes the second dump optional; registering the same
# positional a second time without it would make the argument mandatory.
parser.add_argument("wikidata2", nargs='?', help="path to second Wiktionary JSON file (optional)")
parser.add_argument("--source", type=str, default='en', help="source language code. Default: %(default)s")
parser.add_argument("--target", type=str, default='es', help="target language code. Default: %(default)s")
parser.add_argument("--force", help="overwrite existing files if they already exists. Default: no", action="store_true")

args = parser.parse_args()
# Short aliases for the source / target language codes used further down.
sl = args.source
tl = args.target

# Read JSON
wikidata = []
Expand All @@ -29,11 +33,22 @@ print("Read JSON into memory")

# Parallel lists: source_data[i] and target_data[i] form one translation pair.
source_data = []
target_data = []


def add_data(source, target):
    """Append one translation pair to ``source_data`` / ``target_data``.

    Before appending, the case of the first character of ``target`` is
    aligned with the case of the first character of ``source``; the rest of
    ``target`` is left untouched. Characters that are neither upper- nor
    lower-case (digits, punctuation, uncased scripts) trigger no change.

    Args:
        source: Word or phrase for the source list.
        target: Its translation, for the target list.
    """
    # Only inspect the first characters when both strings are non-empty;
    # indexing [0] on an empty string would raise IndexError.
    if source and target:
        # Fix inconsistencies of capitalization in Western languages
        if source[0].isupper() and target[0].islower():
            target = target[0].upper() + target[1:]
        elif source[0].islower() and target[0].isupper():
            target = target[0].lower() + target[1:]

    source_data.append(source)
    target_data.append(target)

for data in wikidata:
word = data.get("word")
translations = data.get("translations")
lang_code = data.get("lang_code")
if not word or not translations:
if not word or not translations or word.startswith("-"):
continue
if lang_code == sl:
target_translations = list(filter(lambda x: x.get("code") == tl, translations))
Expand All @@ -42,17 +57,17 @@ for data in wikidata:
target_translation = target_translations[0].get("word")
if not target_translation:
continue
source_data.append(word)
target_data.append(target_translation)

add_data(word, target_translation)
elif lang_code == tl:
target_translations = list(filter(lambda x: x.get("code") == sl, translations))
if len(target_translations) < 1:
continue
target_translation = target_translations[0].get("word")
if not target_translation:
continue
target_data.append(word)
source_data.append(target_translation)

add_data(word, target_translation)

# Extract single word translation data
"""
Expand Down Expand Up @@ -124,8 +139,10 @@ for filename, data in [
("wiktionary." + tl, target_data),
]:
filename = Path(filename)
assert not filename.exists()
if not args.force:
assert not filename.exists()

data_file = open(filename, "w")
data_file.write("\n".join(data))
data_file.close()
print("Wrote to wiktionary.*")
print("Wrote %s" % filename)

0 comments on commit c56a843

Please sign in to comment.