Lots of updates to specific languages.

I lost track! 😣
hltdi · Jun 27, 2020 · 4d1f11d · tofeahmed3 · Sep 1, 2022 · megasser
1 parent 7efd12e
commit 4d1f11d
Show file tree

Hide file tree

Showing 116 changed files with 2,001,627 additions and 1,553,989 deletions.
diff --git a/es_corpus.py b/es_corpus.py
@@ -3,7 +3,9 @@
 CORP_PATH = "../../Projects/LingData/Es/POS/cess_tags_sents.txt"
 
 CORPUS = []
+
 with open(CORP_PATH, encoding='utf8') as f:
+    """Read in the CESS corpus, replacing empty lines with ('EOS', 'X')."""
     for line in f:
         if not line.strip():
             CORPUS.append(('EOS', 'X'))
@@ -13,33 +15,82 @@
 esanal = lambda word: morfo.anal('spa', word, raw=True)
 
 def anal_v(word, tag):
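+    '''If word is tagged as a verb, analyze it with morfo and return a more
+    specific tag based on the first verb analysis: "v.prc", "v.inf", "v.ger",
+    or "v.fin". Return None for non-verbs and for verbs that can't be analyzed.'''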
     if tag == 'v':
         analyses = [a[1] for a in esanal(word.lower())]
         v_anals = [a for a in analyses if a.get('pos') == 'v']
         if not v_anals:
-            print("Something wrong; can't analyze verb {}".format(word))
+#            print("Something wrong; can't analyze verb {}".format(word))
             return None
         else:
             v_anal = v_anals[0]
             v_tm = v_anal.get('tm')
             if v_tm in ['prc', 'inf', 'ger']:
-                return "v.nonfin"
+                return "v." + v_tm
             else:
                 return "v.fin"
     else:
         return None
 
+def anal_v_feats(word, tag):
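+    '''If word is tagged as a verb, analyze it with morfo and return a more
+    specific tag only when the verb analyses agree: "v.fin" if no verb analysis
+    has tm 'ipv', 'prc', 'inf', or 'ger'; "v.ipv" if every verb analysis has
+    tm 'ipv'. Return None for non-verbs, verbs that can't be analyzed, and
+    ambiguous cases.'''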
+    if tag == 'v':
+        analyses = [a[1] for a in esanal(word.lower())]
+        v_anals = [a for a in analyses if a.get('pos') == 'v']
+        if not v_anals:
+#            print("Something wrong; can't analyze verb {}".format(word))
+            return None
+        else:
+            impv = []
+            fin = []
+            nonfin = []
+            for v_anal in v_anals:
+                tm = v_anal.get('tm')
+                if tm == 'ipv':
+                    impv.append(v_anal)
+                elif tm in ['prc', 'inf', 'ger']:
+                    nonfin.append(v_anal)
+                else:
+                    fin.append(v_anal)
+            if fin:
+                if not impv and not nonfin:
+                    return "v.fin"
+            elif impv and not nonfin:
+                return "v.ipv"
+    else:
+        return None
+
 def proc_v():
+    '''Do anal_v on whole corpus.'''
     for word_tag in CORPUS:
         if len(word_tag) < 2:
             print("Something wrong with {}".format(word_tag))
         tag = anal_v(word_tag[0], word_tag[1])
         if tag:
             word_tag[1] = tag
+
+
+def proc_v_feats():
+    '''Do anal_v_feats on whole corpus, replacing tags in place, and return
+    the number of words retagged "v.ipv".'''
+    n_ipv = 0
+    for index, word_tag in enumerate(CORPUS):
+        if (index + 1) % 1000 == 0:
+            print("Processed {} words".format(index))
+        if len(word_tag) < 2:
+            print("Something wrong with {}".format(word_tag))
+        tag = anal_v_feats(word_tag[0], word_tag[1])
+        if tag:
+            if tag == 'v.ipv':
+                n_ipv += 1
+            word_tag[1] = tag
+    return n_ipv
 
-def write_corp(path="../../Projects/LingData/Es/POS/cess_tags_sents2.txt"):
+def write_corp(path="../../Projects/LingData/Es/POS/cess_tags_sents3.txt"):
+    '''Write the modified corpus.'''
     with open(path, 'w', encoding='utf8') as f:
-        for word, targ in CORPUS:
+        for word, tag in CORPUS:
             if word == 'EOS':
                 print(file=f)
             else:

diff --git a/morfo.py b/morfo.py
@@ -5,7 +5,7 @@
 
     <http://homes.soic.indiana.edu/gasser/plogs.html>
 
-    Copyleft 2018, 2019.
+    Copyleft 2018, 2019, 2020.
     PLoGS and Michael Gasser <[email protected]>.
 
     morfo is free software: you can redistribute it and/or modify
@@ -52,7 +52,7 @@ def get_pos(abbrev, pos, phon=False, segment=False, load_morph=False,
     morfo.load_lang(abbrev, segment=segment, phon=phon, load_morph=load_morph,
                  guess=guess, verbose=verbose)
     lang = morfo.get_language(abbrev, phon=phon, segment=segment, load=load_morph,
-                                  verbose=verbose)
+                              load_morph=load_morph, verbose=verbose)
     if lang:
         return lang.morphology[pos]
 
@@ -98,6 +98,13 @@ def make_casc(name):
 
 ### Debugging functions
 
+def get_feats(fs, feats):
+    """Return a comma-separated string of feat=value pairs for features feats
+    within feature structure fs."""
+    values = []
+    for feat in feats:
+        values.append("{}={}".format(feat, fs.get(feat)))
+    return ",".join(values)
+
 def casc_anal(casc, string, start_i, end_i=0, trace=0):
     seg_units = casc.seg_units
     s = string
@@ -190,6 +197,24 @@ def proc_grn_feats(pos=None, feat=None):
 #    result.sort(key=lambda x: x[1], reverse=True)
     return result
 
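+# Splitting off Grn (Guarani) derived nouns: entries without spaces that end
+# in an accented vowel (one of GNACC) followed by 'va' are collected separately
+# from the rest.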
+
+GNACC = "áéíóúýãẽĩõũỹ"
+
+def split_grn_nouns():
+    old = []
+    new = []
+    with open("morfo/L/grn/lex/n_raizG.lex", encoding='utf8') as file:
+        for line in file:
+            line = line.strip()
+            if ' ' in line:
+                old.append(line)
+            elif len(line) > 2 and line[-3] in GNACC and line.endswith('va'):
+                new.append(line)
+            else:
+                old.append(line)
+    return old, new
+
 def proc_grn_roots(pos=None, features=None):
     """Count different feature-value combinations for each Guarani root that appears
     in word token list."""
@@ -270,94 +295,6 @@ def proc_grn_roots(pos=None, features=None):
 ##                    print("{} :: {}".format(item, counts), file=file)
     return result
 
-##def proc_grn_root_fvs(pos, fvs):
-##    result = {}
-##    forms = []
-##    with open("../LingData/Gn/words5.txt.anl", encoding='utf8') as inf:
-##        for line in inf:
-##            item, count = line.split()
-##            if ';' in item:
-##                # Otherwise no real analysis, so ignore the line
-##                count = int(count)
-##                word, anals = item.split(';')
-##                anals = anals.split('|')
-##                nanals = len(anals)
-##                count1 = count / nanals
-##                count2 = 0
-##                for anal in anals:
-##                    root_feats = anal.split(':')
-##                    if len(root_feats) == 1:
-##                        # No features
-##                        continue
-##                    root, feats = anal.split(':')
-##                    if feats == '[]':
-##                        continue
-##                    if '_' + pos not in root:
-##                        continue
-##                    if '*' in root:
-##                        root = root.replace('*', word)
-##                    feats = morfo.fs.FeatStruct(feats)
-##                    found = True
-##                    for f, v in fvs:
-##                        if f not in feats or feats[f] != v:
-##                            found = False
-##                            break
-##                    if found:
-##                        count2 += count1
-##                if count2:
-##                    if root not in result:
-##                        result[root] = []
-##                    result[root].append((word, count2))
-##                    forms.append((word, count2))
-##    return result, forms
-
-
-# Testing Amharic deverbal nouns
-
-AN_BASIC = ["melqem", "meleqaqem", "leqami", "leqaqami", "'aleqaqem", "melaqem", "telaqami",
-            "meCeres", "meCerares", "Cerax", "Cerarax", "'aCerares",
-            "mebaken", "mebekaken", "bakaN", "'abekaken",
-            "megenTel", "megeneTaTel", "genTay", "geneTaTay", "'ageneTaTel", "megenaTel", "tegenaTay",
-            "mewexenger", "mewexenegager", "wexengari", "wexenegagari", "'awexenegager",
-            "meCberber", "meCberebaber", "teCberbari", "teCberebabari", "'aCberebaber", "meCberaber"]
-
-AN_XaX = ["megelameT", "magelameT", "gelamaC", "'agelemameT",
-          "mensafef", "tensafafi"]
-
-AN_L1 = ["mamen", "masamen", "metamen", "mastemamen", "metemamen",
-         "'amaN", "tamaN", "temamaN", "'astemamaN", "'astemamen",
-         "mades", "masades", "metades", "mastedades", "metedades",
-         "'adax", "tadax", "tedadax", "'astedades",
-         "manTes", "masneTes", "meneTes", "'aneTaTes"]
-
-AN_L2 = ["mecal", "mecacal", "cay", "cacay", "'acacal"]
-
-AN_L3 = ["megbat", "gebi", "megebat", "masgebat", "megbabat", "megebabat", "gebabi",
-         "'agebab", "'agbab", "tegbabi", "'agebabi", "'agbabi",
-         "meqret", "qeri", "meseTet", "masqeret", "meqeraret", "qerari", "'aqerar", "'aqeraret",
-         "mamat", "masamat", "metamat", "'ami", "tami",
-         "'astemam", "metemamat", "mastemamat", "temami", "'astemami",
-         "mayet", "metayet", "masayet", "'asteyayet"]
-
-AN_L4 = ["melalat", "malalat", "'alela", "lay",
-         "mezergat", "zergi", "mezeregagat", "'azeregag",
-         "meselcet", "selci", "maselcet", "'aselecac",
-         "mebelaxet", "'abelexax", "'abelexaxet", "'abelax",
-         "mengagat", "tengagi",
-         "manqelafat", "'anqelafi", "'anqelefaf",
-         "menkeratet", "tenkeratac", "'ankeretatet"]
-
-AN_wy2 = ["meSom", "meSWaSWam", "SWami", "SWaSWami", "'aSWaSWam",
-          "mecer", "mecacar", "cari", "cacari", "'acacar",
-          "mefEz", "mefafEz", "fiyaZ", "fafiyaZ", "'afafEz"]
-
-amV1 = ["ይመሳስላሉ", "ይመሳሰላሉ", "ያመሳስላል"]
-amV2 = ["ተባበሩ", "ተሳሳሙ", "ተግባቡ", "ተጭበረባበሩ"]
-amV3 = ["ይጠብቃል", "ባከነ", "ቀባጠረ"]
-amV4 = ["ተጋጠሙ", "አጋጠመ"]
-amN1 = ["መቀጠል", "መቀጠያ", "ቀጣይ", "አቀጣጠል"]
-amN2 = ["ያለምክንያት", "አለምክንያት", "አለመንሳፈፍ", "ኢፍትሃዊ"]
-
 def segment(fst, form, printout=True):
     seg = fst.anal(form, segment=True)
     if seg: