From 04f1cb33a1ae1c0546899dade22b4e49e932c004 Mon Sep 17 00:00:00 2001
From: alexharri <alexharri2919@gmail.com>
Date: Thu, 12 Oct 2023 10:52:19 +0000
Subject: [PATCH 1/4] fix(word-categories): consider word categories,
 preferring some over others

---
 lib/beygla.spec.ts            |  13 ++--
 lib/compress/types.ts         |  23 +++++++
 lib/preprocess/format/case.ts |  19 +++---
 lib/preprocess/format/name.ts |  27 +++++++-
 scripts/download-words.ts     |   6 +-
 scripts/filter-names.ts       |  12 ++--
 scripts/group-names.ts        | 122 +++++++++++++++++++++++++++++++---
 7 files changed, 190 insertions(+), 32 deletions(-)

diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts
index bbe9f36..126f620 100644
--- a/lib/beygla.spec.ts
+++ b/lib/beygla.spec.ts
@@ -76,7 +76,7 @@ describe("applyCase", () => {
 
     const out = applyCase("þgf", sourceName);
 
-    expect(out).toEqual("Gunnari Sigurbergi Brjánssyni");
+    expect(out).toEqual("Gunnari Sigurberg Brjánssyni");
   });
 
   it("strips whitespace in full names", () => {
@@ -150,12 +150,13 @@ describe("applyCase", () => {
       ["Sófús", "0;,,i,ar"],
       ["Kristólín", "0;,,,ar"],
       ["Jasper", "0;,,,s"],
-      ["Rúnel", "0;,,i,s"],
-      ["Agok", "0;,,i,s"],
+      ["Agok", "0;,,,s"],
     ];
 
     for (const [name, declension] of tests) {
-      expect(getDeclensionForName(name)).toEqual(declension);
+      expect(`${name}: ${getDeclensionForName(name)}`).toEqual(
+        `${name}: ${declension}`
+      );
     }
   });
 
@@ -170,7 +171,9 @@ describe("applyCase", () => {
     ];
 
     for (const name of tests) {
-      expect(getDeclensionForName(name)).toEqual(null);
+      expect(`${name}: ${getDeclensionForName(name)}`).toEqual(
+        `${name}: ${null}`
+      );
     }
   });
 });
diff --git a/lib/compress/types.ts b/lib/compress/types.ts
index fd8300e..0da98a9 100644
--- a/lib/compress/types.ts
+++ b/lib/compress/types.ts
@@ -5,14 +5,37 @@ export enum Case {
   Dative = 4,
 }
 
+// See https://bin.arnastofnun.is/gogn/k-snid
+export enum WordCategory {
+  PersonNames = "ism",
+  NonIcelandicPersonNames = "erm",
+  MythicalName = "hetja",
+  PlaceNames = "örn",
+  Nicknames = "gæl",
+  FamilyNames = "ætt",
+  GeographicalNames = "bær",
+  NonIcelandicGeographicalNames = "erl",
+  CountryNames = "lönd",
+  CategoriesOfPeoples = "ffl",
+  StreetNames = "göt",
+  CompanyOrOrganizationName = "fyr",
+  OtherNames = "heö",
+  AstrologicalNames = "stja",
+  General = "alm",
+}
+
 export interface DeclinedName {
   base: string;
   name: string;
   case: Case;
+  gender: string;
+  category: WordCategory;
 }
 
 export interface UnprocessedName {
   base: string;
   name: string;
   case: string;
+  gender: string;
+  category: WordCategory;
 }
diff --git a/lib/preprocess/format/case.ts b/lib/preprocess/format/case.ts
index 4857883..4fe991c 100644
--- a/lib/preprocess/format/case.ts
+++ b/lib/preprocess/format/case.ts
@@ -1,22 +1,25 @@
 import { Case } from "../../compress/types";
 
+export function isFirstVariationOfCase(caseString: string): boolean {
+  switch (caseString) {
+    case "NFET":
+    case "ÞFET":
+    case "ÞGFET":
+    case "EFET":
+      return true;
+  }
+  return false;
+}
+
 export function getCase(caseString: string): Case {
   switch (caseString) {
     case "NFET":
       return Case.Nominative;
     case "ÞFET":
-    case "ÞFET2":
-    case "ÞFET3":
-      /** @todo only use one of these, not both */
       return Case.Accusative;
     case "ÞGFET":
-    case "ÞGFET2":
-    case "ÞGFET3":
-      /** @todo only use one of these, not both */
       return Case.Dative;
     case "EFET":
-    case "EFET2":
-      /** @todo only use one of these, not both */
       return Case.Genitive;
     default:
       throw new Error(`Unexpected case '${caseString}'`);
diff --git a/lib/preprocess/format/name.ts b/lib/preprocess/format/name.ts
index 4d518a1..a47b9fe 100644
--- a/lib/preprocess/format/name.ts
+++ b/lib/preprocess/format/name.ts
@@ -1,13 +1,34 @@
-import { DeclinedName, UnprocessedName } from "../../compress/types";
+import {
+  DeclinedName,
+  UnprocessedName,
+  WordCategory,
+} from "../../compress/types";
 import { getCase } from "./case";
 
 export function getRawName(line: string): UnprocessedName {
-  const [base, _id, _gender, name, caseString] = line.split(";");
+  // Properties prefixed with '_bin' have not been translated.
+  //
+  // See https://bin.arnastofnun.is/gogn/k-snid
+  const [
+    base,
+    _id,
+    gender,
+    category,
+    _bin_einkunn,
+    _bin_malsnid_ords,
+    _bin_malfraedi,
+    _bin_millivisun,
+    _bin_birting,
+    name,
+    caseString,
+  ] = line.split(";");
 
   return {
     base,
     case: caseString,
     name,
+    category: category as WordCategory,
+    gender,
   };
 }
 
@@ -18,5 +39,7 @@ export function formatName(name: UnprocessedName): DeclinedName {
     base: name.base,
     name: name.name,
     case: nameCase,
+    category: name.category,
+    gender: name.gender,
   };
 }
diff --git a/scripts/download-words.ts b/scripts/download-words.ts
index 6b38f59..542232a 100644
--- a/scripts/download-words.ts
+++ b/scripts/download-words.ts
@@ -9,7 +9,7 @@ const csvFilePath = path.resolve(__dirname, "../data/word-cases.csv");
 console.log(`Downloading file\n`);
 
 execSync(
-  `curl -o ${zipFilePath} https://bin.arnastofnun.is/django/api/nidurhal/?file=Storasnid_beygm.csv.zip`,
+  `curl -o ${zipFilePath} https://bin.arnastofnun.is/django/api/nidurhal/?file=KRISTINsnid.csv.zip`,
   { stdio: "inherit" }
 );
 
@@ -25,8 +25,8 @@ const unzip = async () => {
         return;
       }
       zipfile.on("entry", (entry) => {
-        if (entry.fileName === "Storasnid_beygm.csv.sha256sum") return;
-        if (entry.fileName === "Storasnid_beygm.csv") {
+        if (entry.fileName === "KRISTINsnid.csv.sha256sum") return;
+        if (entry.fileName === "KRISTINsnid.csv") {
           zipfile.openReadStream(entry, (err, readStream) => {
             if (err) {
               reject(err);
diff --git a/scripts/filter-names.ts b/scripts/filter-names.ts
index 87e8ae3..e3f44d6 100644
--- a/scripts/filter-names.ts
+++ b/scripts/filter-names.ts
@@ -7,6 +7,8 @@ import fsSync from "fs";
 import path from "path";
 import { getNames } from "../lib/preprocess/data/getNames";
 import { logWriteAndSize } from "../lib/preprocess/utils/gzip";
+import { getRawName } from "../lib/preprocess/format/name";
+import { isFirstVariationOfCase } from "../lib/preprocess/format/case";
 
 const nameCasesCsvFilePath = path.resolve(__dirname, "../out/name-cases.csv");
 const wordCasesCsvFilePath = path.resolve(__dirname, "../data/word-cases.csv");
@@ -25,9 +27,11 @@ async function main() {
   for await (const line of inputFile.readLines()) {
     nInputLines++;
 
-    const name = line.split(";")[0];
-
-    if (!nameSet.has(name)) {
+    const name = getRawName(line);
+    if (!nameSet.has(name.base)) {
+      continue;
+    }
+    if (!isFirstVariationOfCase(name.case)) {
       continue;
     }
 
@@ -37,7 +41,7 @@ async function main() {
 
   console.log(`Filtered ${nInputLines} entries into ${nOutputLines} entries.`);
 
-  await logWriteAndSize(nameCasesCsvFilePath);
+  logWriteAndSize(nameCasesCsvFilePath);
 }
 
 main();
diff --git a/scripts/group-names.ts b/scripts/group-names.ts
index 598909c..bf4f83b 100644
--- a/scripts/group-names.ts
+++ b/scripts/group-names.ts
@@ -4,11 +4,48 @@ import { getNames } from "../lib/preprocess/data/getNames";
 import { isDefiniteArticle } from "../lib/preprocess/format/article";
 import { isCasePlural } from "../lib/preprocess/format/case";
 import { formatName, getRawName } from "../lib/preprocess/format/name";
-import { Case, DeclinedName } from "../lib/compress/types";
+import { Case, DeclinedName, WordCategory } from "../lib/compress/types";
 import { writeAndLogSize } from "../lib/preprocess/utils/gzip";
+import assert from "assert";
 
 const nameCasesFilePath = path.resolve(__dirname, "../out/name-cases.csv");
 
+// Some names (e.g. Eldey) can be both a personal name (eigin nafn) and the name
+// of a company/organization (stofnunar- eða fyrirtækisheiti).
+//
+// These are not always declined in the same manner. See the following explainer
+// from BÍN for an example:
+//
+//    https://bin.arnastofnun.is/korn/7
+//
+// There are LOTS of word categories, with varying degrees of overlap. All of
+// the following categories contain at least one legal Icelandic name.
+//
+// The following list specifies the "category preference order". If a name
+// exists in multiple categories, the first category in the list will be picked.
+//
+// PS: Aside from the first few elements, this ordering is mostly arbitrary. If
+//     reasons for preferring one category over another are discovered, then
+//     this list can be amended.
+//
+const categoriesInOrderOfPreference = [
+  WordCategory.PersonNames,
+  WordCategory.NonIcelandicPersonNames,
+  WordCategory.Nicknames,
+  WordCategory.MythicalName,
+  WordCategory.FamilyNames,
+  WordCategory.GeographicalNames,
+  WordCategory.PlaceNames,
+  WordCategory.AstrologicalNames,
+  WordCategory.OtherNames,
+  WordCategory.NonIcelandicGeographicalNames,
+  WordCategory.CountryNames,
+  WordCategory.StreetNames,
+  WordCategory.CategoriesOfPeoples,
+  WordCategory.CompanyOrOrganizationName,
+  WordCategory.General,
+] as string[];
+
 async function main() {
   const fileContent = await fs.readFile(nameCasesFilePath, "utf-8");
   const lines = fileContent.split("\n");
@@ -38,33 +75,91 @@ async function main() {
 
   const out: string[][] = [];
 
+  const namesWithMultipleGenders = new Set();
+  const namesWithMultipleDeclensions = new Set();
+
   for (const names of Object.values(groups)) {
-    let nf: DeclinedName | undefined;
-    let þf: DeclinedName | undefined;
-    let þgf: DeclinedName | undefined;
-    let ef: DeclinedName | undefined;
+    const byCategory: {
+      [category: string]: {
+        [gender: string]: {
+          [_case: string]: DeclinedName;
+        };
+      };
+    } = {};
 
     for (const name of names) {
+      let _case;
       switch (name.case) {
         case Case.Nominative:
-          nf = name;
+          _case = "nf";
           break;
         case Case.Accusative:
-          þf = name;
+          _case = "þf";
           break;
         case Case.Dative:
-          þgf = name;
+          _case = "þgf";
           break;
         case Case.Genitive:
-          ef = name;
+          _case = "ef";
           break;
         default:
           throw new Error(`Unexpected case '${name.case}'`);
       }
+      byCategory[name.category] ||= {};
+      byCategory[name.category][name.gender] ||= {};
+      if (byCategory[name.category][name.gender][_case]) {
+        namesWithMultipleDeclensions.add(name.base);
+      }
+      byCategory[name.category][name.gender][_case] ||= name;
+    }
+
+    let category: string | undefined;
+    const categories = Object.keys(byCategory);
+
+    assert(categories.length > 0, "should have at least 1 category");
+
+    if (
+      categories.length === 1 &&
+      ["gæl,ism", "dýr,hetja"].includes(categories[0])
+    ) {
+      // This seems like a data entry error, for which BÍN should be contacted.
+      // These occur for 1 word each.
+      //
+      // Anyway, ignore these while they are sorted out in the source.
+      continue;
+    }
+
+    for (const preferredCategory of categoriesInOrderOfPreference) {
+      if (categories.includes(preferredCategory)) {
+        category = preferredCategory;
+        break;
+      }
     }
+    if (!category) {
+      throw new Error(
+        `No preferred category matched in list [${categories.join(
+          ", "
+        )}] for name '${names[0].base}'`
+      );
+    }
+
+    const byGender = byCategory[category];
+
+    let gender: string;
+    const genders = Object.keys(byGender);
+    assert(genders.length > 0, "should have at least 1 genders");
+    if (genders.length > 1) {
+      namesWithMultipleGenders.add(names[0].base);
+      continue;
+    } else {
+      gender = genders[0];
+    }
+
+    const byCase = byGender[gender];
+    const { nf, þf, þgf, ef } = byCase;
 
     if (!nf || !þf || !þgf || !ef) {
-      throw new Error(`Missing case for name '${names![0].base}'`);
+      throw new Error(`Missing case for name '${names[0].base}'`);
     }
 
     out.push([nf.name, þf.name, þgf.name, ef.name]);
@@ -79,6 +174,13 @@ async function main() {
     `${excludedNames.length} of ${names.length} names (${percentage}) in 'name-cases.csv' are not present in 'words.csv' and are not included.\n`
   );
 
+  console.log(
+    `Found ${namesWithMultipleDeclensions.size} names with multiple declensions. The last declension is used.`
+  );
+  console.log(
+    `Found ${namesWithMultipleGenders.size} names with multiple genders. They are omitted from Beygla.\n`
+  );
+
   const groupedNamesfilePath = path.resolve(
     __dirname,
     "../out/grouped-names.json"

From 936c046351b745338cddd30ad94f9a9a874d7010 Mon Sep 17 00:00:00 2001
From: alexharri <alexharri2919@gmail.com>
Date: Thu, 12 Oct 2023 11:57:21 +0000
Subject: [PATCH 2/4] fix: do not log names with multiple declensions

---
 scripts/group-names.ts | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/scripts/group-names.ts b/scripts/group-names.ts
index bf4f83b..cf65e5f 100644
--- a/scripts/group-names.ts
+++ b/scripts/group-names.ts
@@ -76,7 +76,6 @@ async function main() {
   const out: string[][] = [];
 
   const namesWithMultipleGenders = new Set();
-  const namesWithMultipleDeclensions = new Set();
 
   for (const names of Object.values(groups)) {
     const byCategory: {
@@ -107,9 +106,6 @@ async function main() {
       }
       byCategory[name.category] ||= {};
       byCategory[name.category][name.gender] ||= {};
-      if (byCategory[name.category][name.gender][_case]) {
-        namesWithMultipleDeclensions.add(name.base);
-      }
       byCategory[name.category][name.gender][_case] ||= name;
     }
 
@@ -174,9 +170,6 @@ async function main() {
     `${excludedNames.length} of ${names.length} names (${percentage}) in 'name-cases.csv' are not present in 'words.csv' and are not included.\n`
   );
 
-  console.log(
-    `Found ${namesWithMultipleDeclensions.size} names with multiple declensions. The last declension is used.`
-  );
   console.log(
     `Found ${namesWithMultipleGenders.size} names with multiple genders. They are omitted from Beygla.\n`
   );

From ff6e21d0879722b68d2de705297c20b3b4ee2635 Mon Sep 17 00:00:00 2001
From: alexharri <alexharri2919@gmail.com>
Date: Thu, 12 Oct 2023 12:28:04 +0000
Subject: [PATCH 3/4] fix(typo): replace 1 genders with 1 gender

---
 scripts/group-names.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/group-names.ts b/scripts/group-names.ts
index cf65e5f..183b4cc 100644
--- a/scripts/group-names.ts
+++ b/scripts/group-names.ts
@@ -143,7 +143,7 @@ async function main() {
 
     let gender: string;
     const genders = Object.keys(byGender);
-    assert(genders.length > 0, "should have at least 1 genders");
+    assert(genders.length > 0, "should have at least 1 gender");
     if (genders.length > 1) {
       namesWithMultipleGenders.add(names[0].base);
       continue;

From a6fd4bea4c0b6f12d5da1d43b486b0d702d6dd0b Mon Sep 17 00:00:00 2001
From: alexharri <alexharri2919@gmail.com>
Date: Thu, 12 Oct 2023 12:34:34 +0000
Subject: [PATCH 4/4] test(applyCase): add test for #11

---
 lib/beygla.spec.ts | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts
index 126f620..86efa13 100644
--- a/lib/beygla.spec.ts
+++ b/lib/beygla.spec.ts
@@ -176,4 +176,17 @@ describe("applyCase", () => {
       );
     }
   });
+
+  test("it uses the declensions for the person, not the company/organization", () => {
+    const tests = [
+      ["nf", "Eldey"],
+      ["þf", "Eldeyju"],
+      ["þgf", "Eldeyju"],
+      ["ef", "Eldeyjar"],
+    ] as const;
+
+    for (const [_case, name] of tests) {
+      expect(applyCase(_case, "Eldey")).toEqual(name);
+    }
+  });
 });