Skip to content

Commit

Permalink
feat(language_field_trimming): add language_field_trimming post proce…
Browse files Browse the repository at this point in the history
…ssing script
  • Loading branch information
missinglink committed Oct 14, 2020
1 parent abef8cf commit 081240c
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 1 deletion.
1 change: 1 addition & 0 deletions Document.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ function Document( source, layer, source_id ){
this.addPostProcessingScript( require('./post/intersections') );
this.addPostProcessingScript( require('./post/seperable_street_names').post );
this.addPostProcessingScript( require('./post/deduplication') );
this.addPostProcessingScript( require('./post/language_field_trimming') );

// mandatory properties
this.setSource( source );
Expand Down
51 changes: 51 additions & 0 deletions post/language_field_trimming.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/**
* Language field post-processing script ensures that names
* present in the 'default' language are not duplicated in other languages.
*
* By default Pelias searches on the `name.default` field, and in some cases
* it additionally searches on the language of the browser agent.
*
* This means that any name which exists in `name.default` need not additionally
* exist in any of the other language fields.
*
* The benefits of this are that we can reduce the index size and any TF/IDF penalties.
*
* Example: the term 'Berlin' is indexed in *both* `name.default` and `name.de`.
* In this case the `de` copy of the string 'Berlin' can be removed as it offers no value.
*/

const _ = require('lodash');
const prefixes = ['name', 'phrase'];

/**
 * Removes names from non-default languages which already exist in the
 * 'default' language, for each of the fields listed in `prefixes`.
 *
 * The document is modified in place. A field is skipped entirely unless it
 * is a plain object whose 'default' entry is a non-empty array.
 *
 * @param {Object} doc - document whose language-keyed fields are trimmed
 * @returns {undefined}
 */
function deduplication(doc) {
  prefixes.forEach(prefix => {

    // load the field data
    // ie: an object keyed by language codes, each value is either an
    // array of names or a single name string
    const field = doc[prefix];
    if (!_.isPlainObject(field)) { return; }

    // fetch the 'default' language
    const defaults = _.get(field, 'default');
    if (!_.isArray(defaults) || _.isEmpty(defaults)) { return; }

    // iterate over other languages in the field
    _.each(field, (names, lang) => {

      // skip the 'default' language
      if (lang === 'default') { return; }

      if (_.isArray(names)) {
        // filter entries from this language which appear in the 'default' lang
        field[lang] = _.difference(names, defaults);
      } else if (_.isString(names) && _.includes(defaults, names)) {
        // a single name which duplicates a default name offers no value.
        // note: a string must never be passed to _.difference, it returns []
        // for non-array input which would discard names that are NOT duplicates.
        delete field[lang];
        return;
      }

      // clean up empty language entries
      if (_.isEmpty(field[lang])) {
        delete field[lang];
      }
    });
  });
}

module.exports = deduplication;
3 changes: 2 additions & 1 deletion test/document/post.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ const Document = require('../../Document');
const intersections = require('../../post/intersections');
const seperable_street_names = require('../../post/seperable_street_names').post;
const deduplication = require('../../post/deduplication');
const DEFAULT_SCRIPTS = [ intersections, seperable_street_names, deduplication ];
const language_field_trimming = require('../../post/language_field_trimming');
const DEFAULT_SCRIPTS = [intersections, seperable_street_names, deduplication, language_field_trimming ];

module.exports.tests = {};

Expand Down
64 changes: 64 additions & 0 deletions test/post/language_field_trimming.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@

var Document = require('../../Document');
var language_field_trimming = require('../../post/language_field_trimming');

module.exports.tests = {};

module.exports.tests.dedupe = function (test) {

  // Creates the fixture shared by both test cases: a document with names in
  // the 'default', 'en' and 'de' languages, already passed through the
  // language_field_trimming script.
  function createTrimmedDocument() {
    var fixture = new Document('mysource', 'mylayer', 'myid');

    fixture.setName('default', 'test1');
    fixture.setNameAlias('default', 'test2');
    fixture.setNameAlias('default', 'test3');

    // 'en' shares 'test1' and 'test3' with 'default', 'test4' is unique
    fixture.setName('en', 'test1');
    fixture.setNameAlias('en', 'test3');
    fixture.setNameAlias('en', 'test4');

    // every 'de' name is also present in 'default'
    fixture.setName('de', 'test1');
    fixture.setNameAlias('de', 'test2');

    language_field_trimming(fixture);

    return fixture;
  }

  test('dedupe - name', function (t) {
    var trimmed = createTrimmedDocument();

    t.deepEquals(trimmed.name.default, ['test1', 'test2', 'test3']);
    t.deepEquals(trimmed.name.en, ['test4']);
    t.false(trimmed.name.de);

    t.end();
  });

  test('dedupe - phrase', function (t) {
    var trimmed = createTrimmedDocument();

    t.deepEquals(trimmed.phrase.default, ['test1', 'test2', 'test3']);
    t.deepEquals(trimmed.phrase.en, ['test4']);
    t.false(trimmed.phrase.de);

    t.end();
  });
};

/**
 * Runs every test group registered on `module.exports.tests`.
 *
 * @param {Function} tape - test function, called with a prefixed name and the test body
 * @param {*} common - passed through unchanged to each test group
 */
module.exports.all = function (tape, common) {

  // prefix each test name so that output identifies this module
  const test = (name, testFunction) =>
    tape(`post/language_field_trimming: ${name}`, testFunction);

  for (const groupName in module.exports.tests) {
    module.exports.tests[groupName](test, common);
  }
};
1 change: 1 addition & 0 deletions test/run.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ var tests = [
require('./post/intersections.js'),
require('./post/deduplication.js'),
require('./post/seperable_street_names.js'),
require('./post/language_field_trimming.js'),
require('./DocumentMapperStream.js'),
require('./util/transform.js'),
require('./util/valid.js'),
Expand Down

0 comments on commit 081240c

Please sign in to comment.