From 04f1cb33a1ae1c0546899dade22b4e49e932c004 Mon Sep 17 00:00:00 2001 From: alexharri Date: Thu, 12 Oct 2023 10:52:19 +0000 Subject: [PATCH 1/4] fix(word-categories): consider word categories, preferring some over others --- lib/beygla.spec.ts | 13 ++-- lib/compress/types.ts | 23 +++++++ lib/preprocess/format/case.ts | 19 +++--- lib/preprocess/format/name.ts | 27 +++++++- scripts/download-words.ts | 6 +- scripts/filter-names.ts | 12 ++-- scripts/group-names.ts | 122 +++++++++++++++++++++++++++++++--- 7 files changed, 190 insertions(+), 32 deletions(-) diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts index bbe9f36..126f620 100644 --- a/lib/beygla.spec.ts +++ b/lib/beygla.spec.ts @@ -76,7 +76,7 @@ describe("applyCase", () => { const out = applyCase("þgf", sourceName); - expect(out).toEqual("Gunnari Sigurbergi Brjánssyni"); + expect(out).toEqual("Gunnari Sigurberg Brjánssyni"); }); it("strips whitespace in full names", () => { @@ -150,12 +150,13 @@ describe("applyCase", () => { ["Sófús", "0;,,i,ar"], ["Kristólín", "0;,,,ar"], ["Jasper", "0;,,,s"], - ["Rúnel", "0;,,i,s"], - ["Agok", "0;,,i,s"], + ["Agok", "0;,,,s"], ]; for (const [name, declension] of tests) { - expect(getDeclensionForName(name)).toEqual(declension); + expect(`${name}: ${getDeclensionForName(name)}`).toEqual( + `${name}: ${declension}` + ); } }); @@ -170,7 +171,9 @@ describe("applyCase", () => { ]; for (const name of tests) { - expect(getDeclensionForName(name)).toEqual(null); + expect(`${name}: ${getDeclensionForName(name)}`).toEqual( + `${name}: ${null}` + ); } }); }); diff --git a/lib/compress/types.ts b/lib/compress/types.ts index fd8300e..0da98a9 100644 --- a/lib/compress/types.ts +++ b/lib/compress/types.ts @@ -5,14 +5,37 @@ export enum Case { Dative = 4, } +// See https://bin.arnastofnun.is/gogn/k-snid +export enum WordCategory { + PersonNames = "ism", + NonIcelandicPersonNames = "erm", + MythicalName = "hetja", + PlaceNames = "örn", + Nicknames = "gæl", + FamilyNames = "ætt", + 
GeographicalNames = "bær", + NonIcelandicGeographicalNames = "erl", + CountryNames = "lönd", + CategoriesOfPeoples = "ffl", + StreetNames = "göt", + CompanyOrOrganizationName = "fyr", + OtherNames = "heö", + AstrologicalNames = "stja", + General = "alm", +} + export interface DeclinedName { base: string; name: string; case: Case; + gender: string; + category: WordCategory; } export interface UnprocessedName { base: string; name: string; case: string; + gender: string; + category: WordCategory; } diff --git a/lib/preprocess/format/case.ts b/lib/preprocess/format/case.ts index 4857883..4fe991c 100644 --- a/lib/preprocess/format/case.ts +++ b/lib/preprocess/format/case.ts @@ -1,22 +1,25 @@ import { Case } from "../../compress/types"; +export function isFirstVariationOfCase(caseString: string): boolean { + switch (caseString) { + case "NFET": + case "ÞFET": + case "ÞGFET": + case "EFET": + return true; + } + return false; +} + export function getCase(caseString: string): Case { switch (caseString) { case "NFET": return Case.Nominative; case "ÞFET": - case "ÞFET2": - case "ÞFET3": - /** @todo only use one of these, not both */ return Case.Accusative; case "ÞGFET": - case "ÞGFET2": - case "ÞGFET3": - /** @todo only use one of these, not both */ return Case.Dative; case "EFET": - case "EFET2": - /** @todo only use one of these, not both */ return Case.Genitive; default: throw new Error(`Unexpected case '${caseString}'`); diff --git a/lib/preprocess/format/name.ts b/lib/preprocess/format/name.ts index 4d518a1..a47b9fe 100644 --- a/lib/preprocess/format/name.ts +++ b/lib/preprocess/format/name.ts @@ -1,13 +1,34 @@ -import { DeclinedName, UnprocessedName } from "../../compress/types"; +import { + DeclinedName, + UnprocessedName, + WordCategory, +} from "../../compress/types"; import { getCase } from "./case"; export function getRawName(line: string): UnprocessedName { - const [base, _id, _gender, name, caseString] = line.split(";"); + // Properties prefixed with '_bin' have 
not been translated. + // + // See https://bin.arnastofnun.is/gogn/k-snid + const [ + base, + _id, + gender, + category, + _bin_einkunn, + _bin_malsnid_ords, + _bin_malfraedi, + _bin_millivisun, + _bin_birting, + name, + caseString, + ] = line.split(";"); return { base, case: caseString, name, + category: category as WordCategory, + gender, }; } @@ -18,5 +39,7 @@ export function formatName(name: UnprocessedName): DeclinedName { base: name.base, name: name.name, case: nameCase, + category: name.category, + gender: name.gender, }; } diff --git a/scripts/download-words.ts b/scripts/download-words.ts index 6b38f59..542232a 100644 --- a/scripts/download-words.ts +++ b/scripts/download-words.ts @@ -9,7 +9,7 @@ const csvFilePath = path.resolve(__dirname, "../data/word-cases.csv"); console.log(`Downloading file\n`); execSync( - `curl -o ${zipFilePath} https://bin.arnastofnun.is/django/api/nidurhal/?file=Storasnid_beygm.csv.zip`, + `curl -o ${zipFilePath} https://bin.arnastofnun.is/django/api/nidurhal/?file=KRISTINsnid.csv.zip`, { stdio: "inherit" } ); @@ -25,8 +25,8 @@ const unzip = async () => { return; } zipfile.on("entry", (entry) => { - if (entry.fileName === "Storasnid_beygm.csv.sha256sum") return; - if (entry.fileName === "Storasnid_beygm.csv") { + if (entry.fileName === "KRISTINsnid.csv.sha256sum") return; + if (entry.fileName === "KRISTINsnid.csv") { zipfile.openReadStream(entry, (err, readStream) => { if (err) { reject(err); diff --git a/scripts/filter-names.ts b/scripts/filter-names.ts index 87e8ae3..e3f44d6 100644 --- a/scripts/filter-names.ts +++ b/scripts/filter-names.ts @@ -7,6 +7,8 @@ import fsSync from "fs"; import path from "path"; import { getNames } from "../lib/preprocess/data/getNames"; import { logWriteAndSize } from "../lib/preprocess/utils/gzip"; +import { getRawName } from "../lib/preprocess/format/name"; +import { isFirstVariationOfCase } from "../lib/preprocess/format/case"; const nameCasesCsvFilePath = path.resolve(__dirname, 
"../out/name-cases.csv"); const wordCasesCsvFilePath = path.resolve(__dirname, "../data/word-cases.csv"); @@ -25,9 +27,11 @@ async function main() { for await (const line of inputFile.readLines()) { nInputLines++; - const name = line.split(";")[0]; - - if (!nameSet.has(name)) { + const name = getRawName(line); + if (!nameSet.has(name.base)) { + continue; + } + if (!isFirstVariationOfCase(name.case)) { continue; } @@ -37,7 +41,7 @@ async function main() { console.log(`Filtered ${nInputLines} entries into ${nOutputLines} entries.`); - await logWriteAndSize(nameCasesCsvFilePath); + logWriteAndSize(nameCasesCsvFilePath); } main(); diff --git a/scripts/group-names.ts b/scripts/group-names.ts index 598909c..bf4f83b 100644 --- a/scripts/group-names.ts +++ b/scripts/group-names.ts @@ -4,11 +4,48 @@ import { getNames } from "../lib/preprocess/data/getNames"; import { isDefiniteArticle } from "../lib/preprocess/format/article"; import { isCasePlural } from "../lib/preprocess/format/case"; import { formatName, getRawName } from "../lib/preprocess/format/name"; -import { Case, DeclinedName } from "../lib/compress/types"; +import { Case, DeclinedName, WordCategory } from "../lib/compress/types"; import { writeAndLogSize } from "../lib/preprocess/utils/gzip"; +import assert from "assert"; const nameCasesFilePath = path.resolve(__dirname, "../out/name-cases.csv"); +// Some names (e.g. Eldey) can both be a personal name (eigin nafn) and the name +// of a company/organization (stofnunar- eða fyrirtækisheiti). +// +// These are not always declined in the same manner. See the following explainer +// from BÍN for an example: +// +// https://bin.arnastofnun.is/korn/7 +// +// There's LOTS of word categories, with various degrees of overlap. All of the +// following categories contain at least one legal Icelandic name. +// +// The following list specifies the "category preference order". If a name +// exists in multiple categories, the first category in the list will be picked. 
+// +// PS: Aside from the first few elements, this ordering is mostly arbitrary. If +// reasons for preferring one category over another are discovered, then this +// list can be amended. +// +const categoriesInOrderOfPreference = [ + WordCategory.PersonNames, + WordCategory.NonIcelandicPersonNames, + WordCategory.Nicknames, + WordCategory.MythicalName, + WordCategory.FamilyNames, + WordCategory.GeographicalNames, + WordCategory.PlaceNames, + WordCategory.AstrologicalNames, + WordCategory.OtherNames, + WordCategory.NonIcelandicGeographicalNames, + WordCategory.CountryNames, + WordCategory.StreetNames, + WordCategory.CategoriesOfPeoples, + WordCategory.CompanyOrOrganizationName, + WordCategory.General, +] as string[]; + async function main() { const fileContent = await fs.readFile(nameCasesFilePath, "utf-8"); const lines = fileContent.split("\n"); @@ -38,33 +75,91 @@ async function main() { const out: string[][] = []; + const namesWithMultipleGenders = new Set(); + const namesWithMultipleDeclensions = new Set(); + for (const names of Object.values(groups)) { - let nf: DeclinedName | undefined; - let þf: DeclinedName | undefined; - let þgf: DeclinedName | undefined; - let ef: DeclinedName | undefined; + const byCategory: { + [category: string]: { + [gender: string]: { + [_case: string]: DeclinedName; + }; + }; + } = {}; for (const name of names) { + let _case; switch (name.case) { case Case.Nominative: - nf = name; + _case = "nf"; break; case Case.Accusative: - þf = name; + _case = "þf"; break; case Case.Dative: - þgf = name; + _case = "þgf"; break; case Case.Genitive: - ef = name; + _case = "ef"; break; default: throw new Error(`Unexpected case '${name.case}'`); } + byCategory[name.category] ||= {}; + byCategory[name.category][name.gender] ||= {}; + if (byCategory[name.category][name.gender][_case]) { + namesWithMultipleDeclensions.add(name.base); + } + byCategory[name.category][name.gender][_case] ||= name; + } + + let category: string | undefined; + const 
categories = Object.keys(byCategory); + + assert(categories.length > 0, "should have at least 1 category"); + + if ( + categories.length === 1 && + ["gæl,ism", "dýr,hetja"].includes(categories[0]) + ) { + // This seems like a data entry error, for which BÍN should be contacted. + // These occur for 1 word each. + // + // Anyway, ignore these while they are sorted out in the source. + continue; + } + + for (const preferredCategory of categoriesInOrderOfPreference) { + if (categories.includes(preferredCategory)) { + category = preferredCategory; + break; + } } + if (!category) { + throw new Error( + `No preferred category matched in list [${categories.join( + ", " + )}] for name '${names[0].base}'` + ); + } + + const byGender = byCategory[category]; + + let gender: string; + const genders = Object.keys(byGender); + assert(genders.length > 0, "should have at least 1 genders"); + if (genders.length > 1) { + namesWithMultipleGenders.add(names[0].base); + continue; + } else { + gender = genders[0]; + } + + const byCase = byGender[gender]; + const { nf, þf, þgf, ef } = byCase; if (!nf || !þf || !þgf || !ef) { - throw new Error(`Missing case for name '${names![0].base}'`); + throw new Error(`Missing case for name '${names[0].base}'`); } out.push([nf.name, þf.name, þgf.name, ef.name]); @@ -79,6 +174,13 @@ async function main() { `${excludedNames.length} of ${names.length} names (${percentage}) in 'name-cases.csv' are not present in 'words.csv' and are not included.\n` ); + console.log( + `Found ${namesWithMultipleDeclensions.size} names with multiple declensions. The last declension is used.` + ); + console.log( + `Found ${namesWithMultipleGenders.size} names with multiple genders. 
They are omitted from Beygla.\n` + ); + const groupedNamesfilePath = path.resolve( __dirname, "../out/grouped-names.json" From 936c046351b745338cddd30ad94f9a9a874d7010 Mon Sep 17 00:00:00 2001 From: alexharri Date: Thu, 12 Oct 2023 11:57:21 +0000 Subject: [PATCH 2/4] fix: do not log names with multiple declensions --- scripts/group-names.ts | 7 ------- 1 file changed, 7 deletions(-) diff --git a/scripts/group-names.ts b/scripts/group-names.ts index bf4f83b..cf65e5f 100644 --- a/scripts/group-names.ts +++ b/scripts/group-names.ts @@ -76,7 +76,6 @@ async function main() { const out: string[][] = []; const namesWithMultipleGenders = new Set(); - const namesWithMultipleDeclensions = new Set(); for (const names of Object.values(groups)) { const byCategory: { @@ -107,9 +106,6 @@ async function main() { } byCategory[name.category] ||= {}; byCategory[name.category][name.gender] ||= {}; - if (byCategory[name.category][name.gender][_case]) { - namesWithMultipleDeclensions.add(name.base); - } byCategory[name.category][name.gender][_case] ||= name; } @@ -174,9 +170,6 @@ async function main() { `${excludedNames.length} of ${names.length} names (${percentage}) in 'name-cases.csv' are not present in 'words.csv' and are not included.\n` ); - console.log( - `Found ${namesWithMultipleDeclensions.size} names with multiple declensions. The last declension is used.` - ); console.log( `Found ${namesWithMultipleGenders.size} names with multiple genders. 
They are omitted from Beygla.\n` ); From ff6e21d0879722b68d2de705297c20b3b4ee2635 Mon Sep 17 00:00:00 2001 From: alexharri Date: Thu, 12 Oct 2023 12:28:04 +0000 Subject: [PATCH 3/4] fix(typo): replace 1 genders with 1 gender --- scripts/group-names.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/group-names.ts b/scripts/group-names.ts index cf65e5f..183b4cc 100644 --- a/scripts/group-names.ts +++ b/scripts/group-names.ts @@ -143,7 +143,7 @@ async function main() { let gender: string; const genders = Object.keys(byGender); - assert(genders.length > 0, "should have at least 1 genders"); + assert(genders.length > 0, "should have at least 1 gender"); if (genders.length > 1) { namesWithMultipleGenders.add(names[0].base); continue; From a6fd4bea4c0b6f12d5da1d43b486b0d702d6dd0b Mon Sep 17 00:00:00 2001 From: alexharri Date: Thu, 12 Oct 2023 12:34:34 +0000 Subject: [PATCH 4/4] test(applyCase): add test for #11 --- lib/beygla.spec.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts index 126f620..86efa13 100644 --- a/lib/beygla.spec.ts +++ b/lib/beygla.spec.ts @@ -176,4 +176,17 @@ describe("applyCase", () => { ); } }); + + test("it uses the declensions for the person, not the company/organization", () => { + const tests = [ + ["nf", "Eldey"], + ["þf", "Eldeyju"], + ["þgf", "Eldeyju"], + ["ef", "Eldeyjar"], + ] as const; + + for (const [_case, name] of tests) { + expect(applyCase(_case, "Eldey")).toEqual(name); + } + }); });