Skip to content

Commit

Permalink
Lots of updates to specific languages.
Browse files Browse the repository at this point in the history
I lost track!  😣
  • Loading branch information
megasser committed Jun 27, 2020
1 parent 7efd12e commit 4d1f11d
Show file tree
Hide file tree
Showing 116 changed files with 2,001,627 additions and 1,553,989 deletions.
59 changes: 55 additions & 4 deletions es_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
CORP_PATH = "../../Projects/LingData/Es/POS/cess_tags_sents.txt"

CORPUS = []

with open(CORP_PATH, encoding='utf8') as f:
"""Read in the CESS corpus, replacing empty lines with ('EOS', 'X')."""
for line in f:
if not line.strip():
CORPUS.append(('EOS', 'X'))
Expand All @@ -13,33 +15,82 @@
def esanal(word):
    """Analyze `word` with morfo for language 'spa', requesting raw output.

    Thin wrapper around morfo.anal('spa', word, raw=True); whatever that
    call returns is passed back unchanged.
    """
    return morfo.anal('spa', word, raw=True)

def anal_v(word, tag):
    '''Return a more specific tag for a verb token, or None.

    word: the token's surface form; it is lower-cased before being passed
        to esanal.
    tag: the token's current tag; only the tag 'v' is processed.

    Returns "v.prc", "v.inf" or "v.ger" when the first verbal analysis has
    that 'tm' value, "v.fin" for any other 'tm' value, and None when tag is
    not 'v' or when no analysis has 'pos' equal to 'v'.
    '''
    if tag != 'v':
        return None
    # Each item returned by esanal is indexed at [1] to get the object
    # that is queried with .get() below.
    analyses = [a[1] for a in esanal(word.lower())]
    v_anals = [a for a in analyses if a.get('pos') == 'v']
    if not v_anals:
        # Verbs that cannot be analyzed are skipped silently.
        return None
    # Only the first verbal analysis decides the tag.
    v_tm = v_anals[0].get('tm')
    if v_tm in ('prc', 'inf', 'ger'):
        return "v." + v_tm
    return "v.fin"

def anal_v_feats(word, tag):
'''Classify a verb token using the 'tm' values of all its verbal analyses.

Only tokens tagged 'v' are analyzed, by calling esanal on the lower-cased
word. Returns "v.fin" when there are finite analyses and no imperative or
non-finite ones, "v.ipv" in the imperative case discussed in the review
note further down, and None otherwise (including when tag is not 'v' or
no analysis has 'pos' equal to 'v').'''
if tag == 'v':
# Keep element [1] of each item returned by esanal.
analyses = [a[1] for a in esanal(word.lower())]
v_anals = [a for a in analyses if a.get('pos') == 'v']
if not v_anals:
# print("Something wrong; can't analyze verb {}".format(word))
return None
else:
# Sort the verbal analyses into three buckets by their 'tm' value:
# 'ipv' -> impv; 'prc', 'inf', 'ger' -> nonfin; anything else -> fin.
impv = []
fin = []
nonfin = []
for v_anal in v_anals:
tm = v_anal.get('tm')
if tm == 'ipv':
impv.append(v_anal)
elif tm in ['prc', 'inf', 'ger']:
nonfin.append(v_anal)
else:
fin.append(v_anal)
# NOTE(review): indentation was lost in this copy, so it is unclear whether
# the `elif impv and not nonfin` below pairs with `if fin:` (imperative
# analyses only) or with the inner `if` (finite and imperative analyses
# together). Confirm against the repository before relying on "v.ipv".
if fin:
if not impv and not nonfin:
return "v.fin"
elif impv and not nonfin:
return "v.ipv"
else:
return None

def proc_v():
    '''Retag the verbs in CORPUS in place using anal_v.

    Every entry of CORPUS is expected to hold at least a word and a tag.
    Shorter entries are reported on stdout and skipped. When anal_v returns
    a tag, it replaces the entry's second element; entries for which it
    returns None are left unchanged.
    '''
    for word_tag in CORPUS:
        if len(word_tag) < 2:
            print("Something wrong with {}".format(word_tag))
            # There is no tag to inspect; indexing word_tag[1] below would
            # raise IndexError, so move on to the next entry.
            continue
        tag = anal_v(word_tag[0], word_tag[1])
        if tag:
            word_tag[1] = tag


def proc_v_feats():
    '''Retag the verbs in CORPUS in place using anal_v_feats.

    Prints a progress message after every 1000 entries. Entries with fewer
    than two elements are reported on stdout and skipped. When
    anal_v_feats returns a tag, it replaces the entry's second element.

    Returns the number of entries that were retagged 'v.ipv'.
    '''
    n_ipv = 0
    # Count from 1 so the progress message reports how many entries have
    # actually been seen (1000, 2000, ...) rather than the last index.
    for count, word_tag in enumerate(CORPUS, start=1):
        if count % 1000 == 0:
            print("Processed {} words".format(count))
        if len(word_tag) < 2:
            print("Something wrong with {}".format(word_tag))
            # There is no tag to inspect; indexing word_tag[1] below would
            # raise IndexError, so move on to the next entry.
            continue
        tag = anal_v_feats(word_tag[0], word_tag[1])
        if tag:
            if tag == 'v.ipv':
                n_ipv += 1
            word_tag[1] = tag
    return n_ipv

def write_corp(path="../../Projects/LingData/Es/POS/cess_tags_sents2.txt"):
def write_corp(path="../../Projects/LingData/Es/POS/cess_tags_sents3.txt"):
'''Write the modified corpus.'''
with open(path, 'w', encoding='utf8') as f:
for word, targ in CORPUS:
for word, tag in CORPUS:
if word == 'EOS':
print(file=f)
else:
Expand Down
117 changes: 27 additions & 90 deletions morfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<http://homes.soic.indiana.edu/gasser/plogs.html>
Copyleft 2018, 2019.
Copyleft 2018, 2019, 2020.
PLoGS and Michael Gasser <[email protected]>.
morfo is free software: you can redistribute it and/or modify
Expand Down Expand Up @@ -52,7 +52,7 @@ def get_pos(abbrev, pos, phon=False, segment=False, load_morph=False,
morfo.load_lang(abbrev, segment=segment, phon=phon, load_morph=load_morph,
guess=guess, verbose=verbose)
lang = morfo.get_language(abbrev, phon=phon, segment=segment, load=load_morph,
verbose=verbose)
load_morph=load_morph, verbose=verbose)
if lang:
return lang.morphology[pos]

Expand Down Expand Up @@ -98,6 +98,13 @@ def make_casc(name):

### Debugging functions

def get_feats(fs, feats):
    """Return a comma-separated "feat=value" string for the given features.

    fs: feature structure to query; only its get() method is used.
    feats: iterable of feature names, rendered in the order given.

    Nothing is printed; the caller decides what to do with the string.
    A feature that fs.get() cannot find is rendered with whatever get()
    returns for it (None for a plain dict).
    """
    return ",".join("{}={}".format(feat, fs.get(feat)) for feat in feats)

def casc_anal(casc, string, start_i, end_i=0, trace=0):
seg_units = casc.seg_units
s = string
Expand Down Expand Up @@ -190,6 +197,24 @@ def proc_grn_feats(pos=None, feat=None):
# result.sort(key=lambda x: x[1], reverse=True)
return result

This comment has been minimized.

Copy link
@tofeahmed3

tofeahmed3 Sep 1, 2022

No Afaan Oromo

This comment has been minimized.

Copy link
@megasser

megasser via email Sep 1, 2022

Author Member

This comment has been minimized.

Copy link
@tofeahmed3

tofeahmed3 via email Sep 1, 2022


# Splitting off Grn derived nouns

GNACC = "áéíóúýãẽĩõũỹ"


def split_grn_nouns(path="morfo/L/grn/lex/n_raizG.lex"):
    """Partition the lines of a lexicon file into two lists.

    path: file to read (UTF-8). Defaults to the Guarani noun-root lexicon
        this function was originally hard-wired to.

    Each line is stripped of surrounding whitespace. A line goes into the
    second list when it contains no space, is longer than two characters,
    ends in 'va', and has a character from GNACC immediately before that
    ending. Every other line, including empty ones, goes into the first
    list.

    Returns the tuple (old, new), each list in file order.
    """
    old = []
    new = []
    with open(path, encoding='utf8') as file:
        for line in file:
            line = line.strip()
            # The length check comes before line[-3] so short lines cannot
            # raise IndexError.
            split_off = (' ' not in line
                         and len(line) > 2
                         and line[-3] in GNACC
                         and line.endswith('va'))
            (new if split_off else old).append(line)
    return old, new

def proc_grn_roots(pos=None, features=None):
"""Count different feature-value combinations for each Guarani root that appears
in word token list."""
Expand Down Expand Up @@ -270,94 +295,6 @@ def proc_grn_roots(pos=None, features=None):
## print("{} :: {}".format(item, counts), file=file)
return result

##def proc_grn_root_fvs(pos, fvs):
## result = {}
## forms = []
## with open("../LingData/Gn/words5.txt.anl", encoding='utf8') as inf:
## for line in inf:
## item, count = line.split()
## if ';' in item:
## # Otherwise no real analysis, so ignore the line
## count = int(count)
## word, anals = item.split(';')
## anals = anals.split('|')
## nanals = len(anals)
## count1 = count / nanals
## count2 = 0
## for anal in anals:
## root_feats = anal.split(':')
## if len(root_feats) == 1:
## # No features
## continue
## root, feats = anal.split(':')
## if feats == '[]':
## continue
## if '_' + pos not in root:
## continue
## if '*' in root:
## root = root.replace('*', word)
## feats = morfo.fs.FeatStruct(feats)
## found = True
## for f, v in fvs:
## if f not in feats or feats[f] != v:
## found = False
## break
## if found:
## count2 += count1
## if count2:
## if root not in result:
## result[root] = []
## result[root].append((word, count2))
## forms.append((word, count2))
## return result, forms


# Testing Amharic deverbal nouns

AN_BASIC = ["melqem", "meleqaqem", "leqami", "leqaqami", "'aleqaqem", "melaqem", "telaqami",
"meCeres", "meCerares", "Cerax", "Cerarax", "'aCerares",
"mebaken", "mebekaken", "bakaN", "'abekaken",
"megenTel", "megeneTaTel", "genTay", "geneTaTay", "'ageneTaTel", "megenaTel", "tegenaTay",
"mewexenger", "mewexenegager", "wexengari", "wexenegagari", "'awexenegager",
"meCberber", "meCberebaber", "teCberbari", "teCberebabari", "'aCberebaber", "meCberaber"]

AN_XaX = ["megelameT", "magelameT", "gelamaC", "'agelemameT",
"mensafef", "tensafafi"]

AN_L1 = ["mamen", "masamen", "metamen", "mastemamen", "metemamen",
"'amaN", "tamaN", "temamaN", "'astemamaN", "'astemamen",
"mades", "masades", "metades", "mastedades", "metedades",
"'adax", "tadax", "tedadax", "'astedades",
"manTes", "masneTes", "meneTes", "'aneTaTes"]

AN_L2 = ["mecal", "mecacal", "cay", "cacay", "'acacal"]

AN_L3 = ["megbat", "gebi", "megebat", "masgebat", "megbabat", "megebabat", "gebabi",
"'agebab", "'agbab", "tegbabi", "'agebabi", "'agbabi",
"meqret", "qeri", "meseTet", "masqeret", "meqeraret", "qerari", "'aqerar", "'aqeraret",
"mamat", "masamat", "metamat", "'ami", "tami",
"'astemam", "metemamat", "mastemamat", "temami", "'astemami",
"mayet", "metayet", "masayet", "'asteyayet"]

AN_L4 = ["melalat", "malalat", "'alela", "lay",
"mezergat", "zergi", "mezeregagat", "'azeregag",
"meselcet", "selci", "maselcet", "'aselecac",
"mebelaxet", "'abelexax", "'abelexaxet", "'abelax",
"mengagat", "tengagi",
"manqelafat", "'anqelafi", "'anqelefaf",
"menkeratet", "tenkeratac", "'ankeretatet"]

AN_wy2 = ["meSom", "meSWaSWam", "SWami", "SWaSWami", "'aSWaSWam",
"mecer", "mecacar", "cari", "cacari", "'acacar",
"mefEz", "mefafEz", "fiyaZ", "fafiyaZ", "'afafEz"]

amV1 = ["ይመሳስላሉ", "ይመሳሰላሉ", "ያመሳስላል"]
amV2 = ["ተባበሩ", "ተሳሳሙ", "ተግባቡ", "ተጭበረባበሩ"]
amV3 = ["ይጠብቃል", "ባከነ", "ቀባጠረ"]
amV4 = ["ተጋጠሙ", "አጋጠመ"]
amN1 = ["መቀጠል", "መቀጠያ", "ቀጣይ", "አቀጣጠል"]
amN2 = ["ያለምክንያት", "አለምክንያት", "አለመንሳፈፍ", "ኢፍትሃዊ"]

def segment(fst, form, printout=True):
seg = fst.anal(form, segment=True)
if seg:
Expand Down
Loading

0 comments on commit 4d1f11d

Please sign in to comment.