Skip to content

Commit

Permalink
Merge pull request #12 from alexharri/word-categories
Browse files Browse the repository at this point in the history
Consider word categories
  • Loading branch information
alexharri authored Oct 12, 2023
2 parents 8ae415a + a6fd4be commit 4a06584
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 32 deletions.
26 changes: 21 additions & 5 deletions lib/beygla.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ describe("applyCase", () => {

const out = applyCase("þgf", sourceName);

expect(out).toEqual("Gunnari Sigurbergi Brjánssyni");
expect(out).toEqual("Gunnari Sigurberg Brjánssyni");
});

it("strips whitespace in full names", () => {
Expand Down Expand Up @@ -150,12 +150,13 @@ describe("applyCase", () => {
["Sófús", "0;,,i,ar"],
["Kristólín", "0;,,,ar"],
["Jasper", "0;,,,s"],
["Rúnel", "0;,,i,s"],
["Agok", "0;,,i,s"],
["Agok", "0;,,,s"],
];

for (const [name, declension] of tests) {
expect(getDeclensionForName(name)).toEqual(declension);
expect(`${name}: ${getDeclensionForName(name)}`).toEqual(
`${name}: ${declension}`
);
}
});

Expand All @@ -170,7 +171,22 @@ describe("applyCase", () => {
];

for (const name of tests) {
expect(getDeclensionForName(name)).toEqual(null);
expect(`${name}: ${getDeclensionForName(name)}`).toEqual(
`${name}: ${null}`
);
}
});

// "Eldey" can be both a personal name and the name of a company/organization;
// the forms expected here are the ones applyCase should return for each case.
test("it uses the declensions for the person, not the company/organization", () => {
  const expectedFormByCase = [
    ["nf", "Eldey"],
    ["þf", "Eldeyju"],
    ["þgf", "Eldeyju"],
    ["ef", "Eldeyjar"],
  ] as const;

  expectedFormByCase.forEach(([grammaticalCase, expectedForm]) => {
    expect(applyCase(grammaticalCase, "Eldey")).toEqual(expectedForm);
  });
});
});
23 changes: 23 additions & 0 deletions lib/compress/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,37 @@ export enum Case {
Dative = 4,
}

/**
 * Word categories attached to entries in the BÍN data set.
 *
 * Each member's value is the short category code as it is used in the source
 * data; the member names are English labels for those codes.
 *
 * See https://bin.arnastofnun.is/gogn/k-snid
 */
export enum WordCategory {
PersonNames = "ism",
NonIcelandicPersonNames = "erm",
MythicalName = "hetja",
PlaceNames = "örn",
Nicknames = "gæl",
FamilyNames = "ætt",
GeographicalNames = "bær",
NonIcelandicGeographicalNames = "erl",
CountryNames = "lönd",
CategoriesOfPeoples = "ffl",
StreetNames = "göt",
CompanyOrOrganizationName = "fyr",
OtherNames = "heö",
// NOTE(review): "stja" may stand for astronomy rather than astrology in BÍN —
// confirm against the format description linked above.
AstrologicalNames = "stja",
General = "alm",
}

/**
 * One declined form of a name, with its grammatical case expressed as a
 * `Case` enum member rather than a raw case string.
 */
export interface DeclinedName {
// Base form of the word that this declined form belongs to.
base: string;
// The declined form itself.
name: string;
// Grammatical case of `name`.
case: Case;
// Gender code, kept as the raw string from the source data.
gender: string;
// Word category the entry is filed under.
category: WordCategory;
}

/**
 * One entry as read from the source data, before its case string has been
 * converted to a `Case` enum member.
 */
export interface UnprocessedName {
// Base form of the word that this declined form belongs to.
base: string;
// The declined form itself.
name: string;
// Raw case tag from the source data, e.g. "NFET" or "ÞFET2".
case: string;
// Gender code, kept as the raw string from the source data.
gender: string;
// Word category the entry is filed under.
category: WordCategory;
}
19 changes: 11 additions & 8 deletions lib/preprocess/format/case.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
import { Case } from "../../compress/types";

/**
 * Reports whether `caseString` is one of the four unnumbered case tags:
 * "NFET", "ÞFET", "ÞGFET" or "EFET".
 *
 * The comparison is exact, so numbered variants such as "ÞFET2" and any other
 * string (including the empty string) yield `false`.
 *
 * @param caseString Case tag to check.
 * @returns `true` only when the tag exactly matches one of the four tags.
 */
export function isFirstVariationOfCase(caseString: string): boolean {
  const firstVariationTags = ["NFET", "ÞFET", "ÞGFET", "EFET"];
  return firstVariationTags.includes(caseString);
}

export function getCase(caseString: string): Case {
switch (caseString) {
case "NFET":
return Case.Nominative;
case "ÞFET":
case "ÞFET2":
case "ÞFET3":
/** @todo only use one of these, not both */
return Case.Accusative;
case "ÞGFET":
case "ÞGFET2":
case "ÞGFET3":
/** @todo only use one of these, not both */
return Case.Dative;
case "EFET":
case "EFET2":
/** @todo only use one of these, not both */
return Case.Genitive;
default:
throw new Error(`Unexpected case '${caseString}'`);
Expand Down
27 changes: 25 additions & 2 deletions lib/preprocess/format/name.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,34 @@
import { DeclinedName, UnprocessedName } from "../../compress/types";
import {
DeclinedName,
UnprocessedName,
WordCategory,
} from "../../compress/types";
import { getCase } from "./case";

/**
 * Splits one semicolon-separated line of the source data into an
 * `UnprocessedName`.
 *
 * Fields are read by position. Only the base form, gender, category, declined
 * form and case tag are kept; the remaining columns are named for reference
 * but otherwise ignored.
 *
 * No validation is performed: if the line has fewer fields than expected, the
 * corresponding properties are `undefined`, and the category field is trusted
 * to be one of the known `WordCategory` codes.
 *
 * @param line A single line of the source CSV.
 * @returns The raw, untranslated fields of the entry.
 */
export function getRawName(line: string): UnprocessedName {
  // Properties prefixed with '_bin' have not been translated.
  //
  // See https://bin.arnastofnun.is/gogn/k-snid
  const [
    base,
    _id,
    gender,
    category,
    _bin_einkunn,
    _bin_malsnid_ords,
    _bin_malfraedi,
    _bin_millivisun,
    _bin_birting,
    name,
    caseString,
  ] = line.split(";");

  return {
    base,
    case: caseString,
    name,
    // Cast only: the raw field is not checked against the enum's values.
    category: category as WordCategory,
    gender,
  };
}

Expand All @@ -18,5 +39,7 @@ export function formatName(name: UnprocessedName): DeclinedName {
base: name.base,
name: name.name,
case: nameCase,
category: name.category,
gender: name.gender,
};
}
6 changes: 3 additions & 3 deletions scripts/download-words.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const csvFilePath = path.resolve(__dirname, "../data/word-cases.csv");
console.log(`Downloading file\n`);

// Download the zipped KRISTINsnid CSV from BÍN to `zipFilePath`.
//
// Exactly one command string is passed, followed by the options object, so
// that `stdio: "inherit"` is actually applied. The output path and the URL
// are quoted so the shell does not split the path on spaces or treat the
// `?` in the URL as a glob character.
execSync(
  `curl -o "${zipFilePath}" "https://bin.arnastofnun.is/django/api/nidurhal/?file=KRISTINsnid.csv.zip"`,
  { stdio: "inherit" }
);

Expand All @@ -25,8 +25,8 @@ const unzip = async () => {
return;
}
zipfile.on("entry", (entry) => {
if (entry.fileName === "Storasnid_beygm.csv.sha256sum") return;
if (entry.fileName === "Storasnid_beygm.csv") {
if (entry.fileName === "KRISTINsnid.csv.sha256sum") return;
if (entry.fileName === "KRISTINsnid.csv") {
zipfile.openReadStream(entry, (err, readStream) => {
if (err) {
reject(err);
Expand Down
12 changes: 8 additions & 4 deletions scripts/filter-names.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import fsSync from "fs";
import path from "path";
import { getNames } from "../lib/preprocess/data/getNames";
import { logWriteAndSize } from "../lib/preprocess/utils/gzip";
import { getRawName } from "../lib/preprocess/format/name";
import { isFirstVariationOfCase } from "../lib/preprocess/format/case";

const nameCasesCsvFilePath = path.resolve(__dirname, "../out/name-cases.csv");
const wordCasesCsvFilePath = path.resolve(__dirname, "../data/word-cases.csv");
Expand All @@ -25,9 +27,11 @@ async function main() {
for await (const line of inputFile.readLines()) {
nInputLines++;

const name = line.split(";")[0];

if (!nameSet.has(name)) {
const name = getRawName(line);
if (!nameSet.has(name.base)) {
continue;
}
if (!isFirstVariationOfCase(name.case)) {
continue;
}

Expand All @@ -37,7 +41,7 @@ async function main() {

console.log(`Filtered ${nInputLines} entries into ${nOutputLines} entries.`);

await logWriteAndSize(nameCasesCsvFilePath);
logWriteAndSize(nameCasesCsvFilePath);
}

main();
115 changes: 105 additions & 10 deletions scripts/group-names.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,48 @@ import { getNames } from "../lib/preprocess/data/getNames";
import { isDefiniteArticle } from "../lib/preprocess/format/article";
import { isCasePlural } from "../lib/preprocess/format/case";
import { formatName, getRawName } from "../lib/preprocess/format/name";
import { Case, DeclinedName } from "../lib/compress/types";
import { Case, DeclinedName, WordCategory } from "../lib/compress/types";
import { writeAndLogSize } from "../lib/preprocess/utils/gzip";
import assert from "assert";

const nameCasesFilePath = path.resolve(__dirname, "../out/name-cases.csv");

// Some names (e.g. Eldey) can both be a personal name (eigin nafn) and the name
// of a company/organization (stofnunar- eða fyrirtækisheiti).
//
// These are not always declined in the same manner. See the following explainer
// from BÍN for an example:
//
// https://bin.arnastofnun.is/korn/7
//
// There are LOTS of word categories, with various degrees of overlap. All of
// the following categories contain at least one legal Icelandic name.
//
// The following list specifies the "category preference order". If a name
// exists in multiple categories, the first category in the list will be picked.
//
// PS: Aside from the first few elements, this ordering is mostly arbitrary. If
// a reason for preferring one category over another is discovered, then this
// list can be amended.
//
// Category codes from most to least preferred. Declared as plain strings so
// the entries can be compared directly against string keys.
const categoriesInOrderOfPreference: string[] = [
  WordCategory.PersonNames,
  WordCategory.NonIcelandicPersonNames,
  WordCategory.Nicknames,
  WordCategory.MythicalName,
  WordCategory.FamilyNames,
  WordCategory.GeographicalNames,
  WordCategory.PlaceNames,
  WordCategory.AstrologicalNames,
  WordCategory.OtherNames,
  WordCategory.NonIcelandicGeographicalNames,
  WordCategory.CountryNames,
  WordCategory.StreetNames,
  WordCategory.CategoriesOfPeoples,
  WordCategory.CompanyOrOrganizationName,
  WordCategory.General,
];

async function main() {
const fileContent = await fs.readFile(nameCasesFilePath, "utf-8");
const lines = fileContent.split("\n");
Expand Down Expand Up @@ -38,33 +75,87 @@ async function main() {

const out: string[][] = [];

const namesWithMultipleGenders = new Set();

for (const names of Object.values(groups)) {
let nf: DeclinedName | undefined;
let þf: DeclinedName | undefined;
let þgf: DeclinedName | undefined;
let ef: DeclinedName | undefined;
const byCategory: {
[category: string]: {
[gender: string]: {
[_case: string]: DeclinedName;
};
};
} = {};

for (const name of names) {
let _case;
switch (name.case) {
case Case.Nominative:
nf = name;
_case = "nf";
break;
case Case.Accusative:
þf = name;
_case = "þf";
break;
case Case.Dative:
þgf = name;
_case = "þgf";
break;
case Case.Genitive:
ef = name;
_case = "ef";
break;
default:
throw new Error(`Unexpected case '${name.case}'`);
}
byCategory[name.category] ||= {};
byCategory[name.category][name.gender] ||= {};
byCategory[name.category][name.gender][_case] ||= name;
}

let category: string | undefined;
const categories = Object.keys(byCategory);

assert(categories.length > 0, "should have at least 1 category");

if (
categories.length === 1 &&
["gæl,ism", "dýr,hetja"].includes(categories[0])
) {
// This seems like a data entry error, for which BÍN should be contacted.
// These occur for 1 word each.
//
// Anyway, ignore these while they are sorted out in the source.
continue;
}

for (const preferredCategory of categoriesInOrderOfPreference) {
if (categories.includes(preferredCategory)) {
category = preferredCategory;
break;
}
}
if (!category) {
throw new Error(
`No preferred category matched in list [${categories.join(
", "
)}] for name '${names[0].base}'`
);
}

const byGender = byCategory[category];

let gender: string;
const genders = Object.keys(byGender);
assert(genders.length > 0, "should have at least 1 gender");
if (genders.length > 1) {
namesWithMultipleGenders.add(names[0].base);
continue;
} else {
gender = genders[0];
}

const byCase = byGender[gender];
const { nf, þf, þgf, ef } = byCase;

if (!nf || !þf || !þgf || !ef) {
throw new Error(`Missing case for name '${names![0].base}'`);
throw new Error(`Missing case for name '${names[0].base}'`);
}

out.push([nf.name, þf.name, þgf.name, ef.name]);
Expand All @@ -79,6 +170,10 @@ async function main() {
`${excludedNames.length} of ${names.length} names (${percentage}) in 'name-cases.csv' are not present in 'words.csv' and are not included.\n`
);

console.log(
`Found ${namesWithMultipleGenders.size} names with multiple genders. They are omitted from Beygla.\n`
);

const groupedNamesfilePath = path.resolve(
__dirname,
"../out/grouped-names.json"
Expand Down

0 comments on commit 4a06584

Please sign in to comment.