Skip to content

Commit

Permalink
feat(penalty): add lang meta data in toponym for housenumber penalty (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Joxit authored Nov 24, 2019
1 parent da9266a commit f11875e
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 11 deletions.
12 changes: 11 additions & 1 deletion classifier/CompositeClassifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,17 @@ class CompositeClassifier extends SectionClassifier {
})

// classify each super phrase
superPhrases.forEach(p => p.classify(new s.Class(s.confidence)))
superPhrases.forEach(p => {
  // spread children langs to the parent
  // note: each language code is copied individually; indexing the accumulator
  // with the whole Object.keys() array would coerce it to one joined key
  // (e.g. 'en,fr') instead of separate 'en' and 'fr' entries.
  const langs = {}
  p.graph.findAll('child').forEach(child => {
    Object.values(child.classifications)
      .filter(c => c.meta && c.meta.langs)
      .forEach(c => {
        Object.keys(c.meta.langs).forEach(lang => { langs[lang] = true })
      })
  })
  p.classify(new s.Class(s.confidence, { langs }))
})

// optionally classify individual phrases
composites.forEach(c => {
Expand Down
2 changes: 1 addition & 1 deletion classifier/ToponymClassifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ToponymClassifier extends WordClassifier {

// use an inverted index for full token matching as it's O(1)
if (this.index.hasOwnProperty(span.norm)) {
span.classify(new ToponymClassification(1))
span.classify(new ToponymClassification(1, this.index[span.norm]))
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion classifier/ToponymClassifier.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ module.exports.tests.english_suffix = (test) => {
let s = classify(token)

t.deepEqual(s.classifications, {
ToponymClassification: new ToponymClassification(1)
ToponymClassification: new ToponymClassification(1, { langs: { en: true } })
})
t.end()
})
Expand Down
2 changes: 2 additions & 0 deletions parser/AddressParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const TokenDistanceFilter = require('../solver/TokenDistanceFilter')
const MustNotPreceedFilter = require('../solver/MustNotPreceedFilter')
const MustNotFollowFilter = require('../solver/MustNotFollowFilter')
const SubsetFilter = require('../solver/SubsetFilter')
const HouseNumberPositionPenalty = require('../solver/HouseNumberPositionPenalty')

class AddressParser extends Parser {
constructor (options) {
Expand Down Expand Up @@ -111,6 +112,7 @@ class AddressParser extends Parser {
new MustNotPreceedFilter('CountryClassification', 'HouseNumberClassification'),
new MustNotFollowFilter('LocalityClassification', 'RegionClassification'),
new MustNotFollowFilter('LocalityClassification', 'CountryClassification'),
new HouseNumberPositionPenalty(),
new TokenDistanceFilter(),
new SubsetFilter()
],
Expand Down
11 changes: 6 additions & 5 deletions resources/libpostal/libpostal.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@ function load (index, langs, filename, options) {
if (!fs.existsSync(filepath)) { return }
let dict = fs.readFileSync(filepath, 'utf8')
dict.split('\n').forEach(row => {
row.split('|').forEach(add)
row.split('|').forEach(add.bind(null, lang))
}, this)
}, this)

langs.forEach(lang => {
pelias.load(path.join('libpostal', lang, filename), add, remove)
pelias.load(path.join('libpostal', lang, filename), add.bind(null, lang), remove)
})

langs.forEach(lang => {
custom.load(path.join('libpostal', lang, filename), add, remove)
custom.load(path.join('libpostal', lang, filename), add.bind(null, lang), remove)
})
}

Expand All @@ -42,10 +42,11 @@ function _normalize (cell, options) {
}

function _add (index, options) {
return cell => {
return (lang, cell) => {
const value = _normalize(cell, options)
if (value && value.length) {
index[value] = true
index[value] = index[value] || { langs: {} }
index[value].langs[lang] = true
}
}
}
Expand Down
59 changes: 59 additions & 0 deletions solver/HouseNumberPositionPenalty.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
const BaseSolver = require('./super/BaseSolver')
const HouseNumberClassification = require('../classification/HouseNumberClassification')
const StreetClassification = require('../classification/StreetClassification')
// default amount added to a solution's penalty when the house number
// is found on the unexpected side of the street
const basePenalty = 0.05

// languages where the house number is expected AFTER the street,
// mapped to the penalty applied when it is found before the street instead
// https://github.com/pelias/api/blob/master/middleware/localNamingConventions.js
const numberLastLangs = {
  de: basePenalty,
  sl: basePenalty,
  pl: basePenalty,
  bs: basePenalty,
  hr: basePenalty,
  nl: basePenalty,
  cs: basePenalty,
  da: basePenalty,
  // halved: Guatemala & Honduras do not flip their house numbers
  es: basePenalty / 2,
  fi: basePenalty,
  el: basePenalty,
  is: basePenalty,
  it: basePenalty,
  nb: basePenalty,
  pt: basePenalty,
  sv: basePenalty,
  sk: basePenalty,
  tr: basePenalty,
  ro: basePenalty,
  hu: basePenalty
}

// languages where the house number is expected BEFORE the street,
// mapped to the penalty applied when it is found after the street instead
const numberFirstLangs = {
  en: basePenalty,
  // halved: Switzerland and Andorra have some French streets
  fr: basePenalty / 2
}

class HouseNumberPositionPenalty extends BaseSolver {
  /**
   * Adds a penalty to every solution whose house number sits on the
   * unexpected side of the street, based on the single language recorded
   * in the street classification's meta data.
   *
   * Solutions are updated in place (their `penalty` property is increased);
   * nothing is returned.
   *
   * @param {object} tokenizer - object exposing the `solution` list to inspect
   */
  solve (tokenizer) {
    // first pair of a solution whose classification was built by the given class
    const pairOfType = (solution, Type) =>
      solution.pair.find(candidate => candidate.classification.constructor === Type)

    const owns = (table, key) => Object.prototype.hasOwnProperty.call(table, key)

    tokenizer.solution.forEach(solution => {
      const number = pairOfType(solution, HouseNumberClassification)
      const road = pairOfType(solution, StreetClassification)

      // both a house number and a street are required
      if (!number || !road) { return }

      // the street classification must carry language meta data
      const meta = road.classification.meta
      if (!meta || !meta.langs) { return }

      // multi-lang entries are not supported for now
      const candidates = Object.keys(meta.langs)
      if (candidates.length !== 1) { return }

      const [lang] = candidates
      if (lang === 'all') { return }

      // number expected last (after the street) but found before it
      if (owns(numberLastLangs, lang) && number.span.start < road.span.start) {
        solution.penalty += numberLastLangs[lang]
        return
      }

      // number expected first (before the street) but found after it
      if (owns(numberFirstLangs, lang) && road.span.start < number.span.start) {
        solution.penalty += numberFirstLangs[lang]
      }
    })
  }
}

module.exports = HouseNumberPositionPenalty
3 changes: 2 additions & 1 deletion solver/Solution.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class Solution {
constructor (pairs) {
this.pair = pairs || []
this.score = 0.0 // absolute score
this.penalty = 0.0
}

// create a deep copy of this solution
Expand Down Expand Up @@ -51,7 +52,7 @@ class Solution {

// absolute score
// the average character score covered divided by the total coverage
this.score = (score.confidence / score.coverage) * (score.coverage / tokenizer.coverage)
this.score = (score.confidence / score.coverage) * (score.coverage / tokenizer.coverage) * (1.0 - this.penalty)
}

// return a mask of the input for this solution
Expand Down
3 changes: 1 addition & 2 deletions test/address.usa.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ const testcase = (test, common) => {
assert('1210a California 10', [{ housenumber: '1210a' }, { street: 'California 10' }], true)
assert('1389a IA 42 IA', [{ housenumber: '1389a' }, { street: 'IA 42' }, { region: 'IA' }], true)

// This does not work because of MD
// assert('1111 MD 760, Lusby, MD, USA', [{ housenumber: '1111' }, { street: 'MD 760' }, { locality: 'Lusby' }, { region: 'MD' }, { country: 'USA' }], true)
assert('1111 MD 760, Lusby, MD, USA', [{ housenumber: '1111' }, { street: 'MD 760' }, { locality: 'Lusby' }, { region: 'MD' }, { country: 'USA' }], true)
}

module.exports.all = (tape, common) => {
Expand Down

0 comments on commit f11875e

Please sign in to comment.