Skip to content

Commit

Permalink
Merge pull request #214 from pelias/avoid-transliterated-names
Browse files Browse the repository at this point in the history
skip names we suspect were sourced from machine transliteration
  • Loading branch information
orangejulius authored Feb 10, 2022
2 parents 5430492 + 624a397 commit c3ce64c
Showing 1 changed file with 21 additions and 6 deletions.
27 changes: 21 additions & 6 deletions prototype/wof.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const util = require('util');
const blacklist = require('pelias-blacklist-stream/loader')();
const analysis = require('../lib/analysis');
const language = dir('../config/language');
// population cut-off used by the machine-transliteration heuristic in
// insertWofRecord(): a record whose `population` is below this value
// (a missing population is read as 0) is treated as low/unknown population.
const LOW_POPULATION_THRESHOLD = 2000;

// list of languages / tags we favour in cases of deduplication
const LANG_PREFS = ['eng','und'];
Expand Down Expand Up @@ -97,12 +98,25 @@ function insertWofRecord( wof, next ){
}
}

// add 'name:*'
for( var attr in wof ){
// https://github.com/whosonfirst/whosonfirst-names
// names: preferred|colloquial|variant|unknown
var match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
if( match ){
// note: skip all `name:*` fields when we suspect that they were sourced from
// machine transliteration via WikiData.
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/799
const hasDeadOrObscureLanguages = _.has(wof, 'name:vol_x_preferred');
const isLowOrUnknownPopulation = _.get(doc, 'population', 0) < LOW_POPULATION_THRESHOLD;
const isMegaCity = _.get(doc, 'wof:megacity', 0) === 1;
const isCapitalCity = !_.isEmpty(_.get(doc, 'wof:capital_of'));
const isLikelyTransliterated = (
hasDeadOrObscureLanguages && isLowOrUnknownPopulation && !isMegaCity && !isCapitalCity
);
if (!isLikelyTransliterated) {

// add 'name:*' fields
for( var attr in wof ){
// https://github.com/whosonfirst/whosonfirst-names
// names: preferred|colloquial|variant|unknown
const match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
if (!match) { continue; }

// Fix for https://github.com/pelias/placeholder/pull/126
// Transform iso codes 639-2/B to 639-2/T
const lang = language.alternatives[match[1]] || match[1];
Expand All @@ -127,6 +141,7 @@ function insertWofRecord( wof, next ){
doc.names[ lang ] = wof[ attr ];
}
}

}
}

Expand Down

0 comments on commit c3ce64c

Please sign in to comment.