Skip to content

Commit

Permalink
accept duplicates that are merged in OSM
Browse files Browse the repository at this point in the history
  • Loading branch information
k-yle committed Feb 28, 2024
1 parent 2fa9e4d commit 32b7b83
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/stage2-preprocess/maybeTeReoName.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ const bannedWordsRegExp = new RegExp(
// eslint-disable-next-line unicorn/better-regex -- it's more clear written out like this
const anyNonTeReoLetters = /[^-ghkmnprtw aeiouāēīōū]/i;

function removeEnglishPrefixesAndSuffixes(name: string): string | undefined {
export function removeEnglishPrefixesAndSuffixes(
name: string,
): string | undefined {
if (bannedWordsRegExp.test(name)) return undefined;

const newName = name
Expand Down
153 changes: 153 additions & 0 deletions src/stage3-conflate/applyCustomMerges.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/* eslint-disable no-param-reassign */
/* eslint-disable unicorn/prevent-abbreviations -- refs is a common OSM term */
import { distanceBetween } from '../core';
import { removeEnglishPrefixesAndSuffixes } from '../stage2-preprocess/maybeTeReoName';
import type { NZGBSourceData, OSMTempFile } from '../types';

/**
 * The NZGB has duplicate entries, so this function
 * will look at all the OSM features with semicolon-
 * delimited ref:linz:place_id tags and merge any
 * that were merged by an OSM mapper (rather than the
 * dual-name ones we merged automatically).
 *
 * If the names are different, we will allow it, but
 * all the names must be listed in alt_name/old_name etc.
 * There are also various safeguards later in the process
 * to prevent anyone abusing this flexibility as a means
 * to remove te reo names without detection.
 * In the future we may require wikidata's P2959 before we
 * accept these custom merges.
 *
 * A merge is skipped (with a console message) when any of the
 * sub-refs is missing from `nzgb`, or when any of the features
 * is too far away from the first one in the list.
 *
 * Note: although this function is declared `async`, it never
 * awaits anything, so all mutations of `nzgb` have already
 * happened by the time the promise is returned. Callers should
 * still `await` it so that a thrown error is not lost as an
 * unhandled rejection.
 *
 * @param nzgb - NZGB features keyed by ref. Mutated in place: for every
 *   accepted merge, the individual refs are deleted and a single entry
 *   keyed by the semicolon-delimited ref is added.
 * @param osm - OSM features, grouped by category and then by ref
 *   (only the `withRef` keys are read).
 * @returns nothing, the `nzgb` argument is mutated (to improve perf)
 */
export async function applyCustomMerges(
  nzgb: NZGBSourceData,
  osm: OSMTempFile,
): Promise<void> {
  // Merged features further apart than this are never accepted.
  // NOTE(review): presumably metres (i.e. 10km) - confirm against
  // the units returned by `distanceBetween`.
  const maxDistanceBetweenMergedFeatures = 10_000;

  let trivialMerges = 0;

  // semicolon-delimited refs that exist in OSM, but which the
  // previous stage did not create in the NZGB data.
  const unexpectedRefsWithSemiColons = new Set<string>();
  for (const category in osm) {
    for (const ref in osm[category].withRef) {
      if (ref.includes(';') && !nzgb[ref]) {
        unexpectedRefsWithSemiColons.add(ref);
      }
    }
  }

  /**
   * refs that the previous stage has already merged (usually dual names).
   * Maps each individual sub-ref to the merged ref that contains it, and
   * is only used to print a helpful suggestion below.
   */
  const existingNzgbRefsWithSemiColons: Record<string, string> = {};
  for (const ref in nzgb) {
    if (ref.includes(';')) {
      for (const subRef of ref.split(';')) {
        existingNzgbRefsWithSemiColons[subRef] = ref;
      }
    }
  }

  for (const mergedRef of unexpectedRefsWithSemiColons) {
    const refs = mergedRef.split(';');
    const features = refs.map((ref) => nzgb[ref]);

    if (features.some((f) => !f)) {
      const expected = refs.filter((ref) => nzgb[ref]).join(';');
      if (expected) {
        console.error(`(!) Invalid refs: ${mergedRef} --> ${expected}`);
        // in this case, we suggest removing the refs that don't exist
        // anymore. Most likely cause is that the NZGB has noticed the
        // duplicates and deleted one of them.
      } else {
        // none of the refs exist. This is a bit bizarre.
        const possibleOptions = new Set(
          refs.map((ref) => existingNzgbRefsWithSemiColons[ref] || ''),
        );
        possibleOptions.delete('');

        console.error(
          `(!) None of these refs exist: ${mergedRef}. Did you mean ${[...possibleOptions].join(' or ')}?`,
        );
      }
      continue;
    }

    // this check applies to all cases - the merged features
    // have to be reasonably close (measured from the first feature).
    const anyAreFarAway = features.some(
      (f) =>
        distanceBetween(features[0].lat, features[0].lng, f.lat, f.lng) >
        maxDistanceBetweenMergedFeatures,
    );
    if (anyAreFarAway) {
      // It's possible that the location used to be the same, but
      // then the NZGB fixed the location on one node. In which case
      // they should not be merged
      console.warn(`Refusing to merge ${refs} since they are too far apart`);
      continue;
    }

    // a common example is "Mt X" and "X Mountain" coëxisting, so we
    // strip out prefixes and suffixes before comparing names.
    //
    // The helper returns undefined for names that it refuses to process.
    // In that case we must fall back to the raw name; otherwise several
    // *different* names would all collapse into a single `undefined`
    // entry, and we would wrongly treat them as the same name (silently
    // dropping the other names instead of adding them to alt_name).
    const uniqueNames = new Set(
      features.map((f) => removeEnglishPrefixesAndSuffixes(f.name) || f.name),
    );

    if (uniqueNames.size === 1) {
      // all the merged features have the same name. This is the easy case

      // no further checks at the moment.

      // We take everything from the first ref in the list, merging only
      // a few selective props.
      nzgb[mergedRef] = {
        ...features[0],
        altNames: [...new Set(features.flatMap((f) => f.altNames || []))],
        oldNames: [...new Set(features.flatMap((f) => f.oldNames || []))],
        oldRefs: [...new Set(features.flatMap((f) => f.oldRefs || []))],
      };
      for (const ref of refs) delete nzgb[ref];
      trivialMerges++;
    } else {
      // some of the merged features have different names.

      // the "main" feature is what we keep. This is the official name
      // if one of the names is official, otherwise it's the first ref
      const mainFeature = features.find((f) => f.official) || features[0];

      // reference equality is safe here
      const nonMainFeatures = features.filter((f) => f !== mainFeature);

      // the main name with all diacritics (e.g. macrons) removed.
      // Computed once, since it is the same for every comparison.
      const mainNameWithoutDiacritics = mainFeature.name
        .normalize('NFD')
        .replaceAll(/\p{Diacritic}/gu, '');

      // non-main names are the names from all the non-main features.
      // these names have to go in alt_name, unless the name is just
      // the main name written without diacritics.
      const nonMainNames = nonMainFeatures
        .map((f) => f.name)
        .filter((name) => name !== mainNameWithoutDiacritics);

      console.log(
        `Accepting “${mainFeature.name}” over “${nonMainNames.join(' & ')}”`,
      );

      nzgb[mergedRef] = {
        ...mainFeature,
        altNames: [
          ...new Set([
            ...features.flatMap((f) => f.altNames || []),
            ...nonMainNames,
          ]),
        ],
        oldNames: [...new Set(features.flatMap((f) => f.oldNames || []))],
        oldRefs: [...new Set(features.flatMap((f) => f.oldRefs || []))],
      };
      for (const ref of refs) delete nzgb[ref];
    }
  }

  console.log(`Accepted ${trivialMerges} trivial merges`);
}
3 changes: 3 additions & 0 deletions src/stage3-conflate/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { compareFeatures } from './compareFeatures/compareFeatures';
import { findMatch } from './findMatch';
import { getPresetTags } from './getPresetTags';
import { checkWikidataRedirects } from './checkWikidataRedirects';
import { applyCustomMerges } from './applyCustomMerges';

// baseline: this took 120sec on the very first run (1k refs in the planet)
function processOneType(
Expand Down Expand Up @@ -234,6 +235,8 @@ async function main() {
);
const osm: OSMTempFile = JSON.parse(await fs.readFile(tempOsmFile, 'utf8'));

applyCustomMerges(nzgb, osm);

const statsObject: Partial<StatsFile> = {};
const extraLayersObject: Record<string, OsmPatchFile> = {};

Expand Down

0 comments on commit 32b7b83

Please sign in to comment.