-
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(language_field_trimming): add language_field_trimming post processing script
- Loading branch information
1 parent
abef8cf
commit 081240c
Showing
5 changed files
with
119 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/** | ||
* Language field post-processing script ensures that language tokens | ||
* present in the 'default' language are not duplicated in other languages. | ||
* | ||
* By default Pelias searches on the `name.default` field, and in some cases | ||
* it additionally searches on the language of the browser agent. | ||
* | ||
* This means that any name which exists in `name.default` need not additionally | ||
* exist in any of the other language fields. | ||
* | ||
* The benefits of this are that we can reduce the index size and any TF/IDF penalties. | ||
* | ||
* Example: the term 'Berlin' is indexed in *both* `name.default` and `name.de`. | ||
* In this case the `de` copy of the string 'Berlin' can be removed as it offers no value. | ||
*/ | ||
|
||
const _ = require('lodash'); | ||
const prefixes = ['name', 'phrase']; | ||
|
||
/**
 * Removes from every non-default language any name that already appears
 * in the 'default' language, for each field listed in `prefixes`.
 *
 * The document is modified in place; languages left with no names are
 * deleted from the field entirely.
 *
 * @param {object} doc - document whose `name` / `phrase` fields are objects
 *   keyed by language code, each value being an array of names.
 * @returns {undefined}
 */
function deduplication(doc) {
  prefixes.forEach(prefix => {

    // load the field data
    // ie: an object keyed by language codes, each value is an array of names
    const field = doc[prefix];
    if (!_.isPlainObject(field)) { return; }

    // fetch the 'default' language; nothing to dedupe against if it is
    // missing, empty, or not an array
    const defaults = _.get(field, 'default');
    if (!_.isArray(defaults) || _.isEmpty(defaults)) { return; }

    // iterate over other languages in the field
    _.each(field, (names, lang) => {

      // skip the 'default' language
      if (lang === 'default') { return; }

      // filter entries from this language which appear in the 'default' lang.
      // Both conditions must hold: _.difference() yields [] for any value that
      // is not array-like, so passing a non-array (e.g. a plain string) here
      // would cause the cleanup below to drop a name that is not a duplicate.
      if (_.isArray(names) && !_.isEmpty(names)) {
        field[lang] = _.difference(names, defaults);
      }

      // clean up empty language arrays
      if (_.isEmpty(field[lang])) {
        delete field[lang];
      }
    });
  });
}
|
||
module.exports = deduplication; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
|
||
var Document = require('../../Document'); | ||
var language_field_trimming = require('../../post/language_field_trimming'); | ||
|
||
module.exports.tests = {}; | ||
|
||
/**
 * Test group: names present in the 'default' language are stripped from
 * the other languages, for both the `name` and `phrase` fields.
 */
module.exports.tests.dedupe = function (test) {

  // builds the shared fixture and runs the post-processing script over it:
  // 'en' keeps one unique name, 'de' only holds names found in 'default'
  function buildTrimmedDoc() {
    var fixture = new Document('mysource', 'mylayer', 'myid');

    fixture.setName('default', 'test1');
    fixture.setNameAlias('default', 'test2');
    fixture.setNameAlias('default', 'test3');

    fixture.setName('en', 'test1');
    fixture.setNameAlias('en', 'test3');
    fixture.setNameAlias('en', 'test4');

    fixture.setName('de', 'test1');
    fixture.setNameAlias('de', 'test2');

    language_field_trimming(fixture);
    return fixture;
  }

  test('dedupe - name', function (t) {
    var names = buildTrimmedDoc().name;

    t.deepEquals(names.default, ['test1', 'test2', 'test3']);
    t.deepEquals(names.en, ['test4']);
    t.false(names.de);

    t.end();
  });

  test('dedupe - phrase', function (t) {
    var phrases = buildTrimmedDoc().phrase;

    t.deepEquals(phrases.default, ['test1', 'test2', 'test3']);
    t.deepEquals(phrases.en, ['test4']);
    t.false(phrases.de);

    t.end();
  });
};
|
||
/**
 * Runs every test group registered on `module.exports.tests`.
 *
 * @param {function} tape - called as tape(name, testFunction) for each test
 * @param {*} common - passed through unchanged to every test group
 */
module.exports.all = function (tape, common) {

  // wrapper which prefixes each test name with this module's label
  var test = function (name, testFunction) {
    return tape('post/language_field_trimming: ' + name, testFunction);
  };

  var groups = module.exports.tests;
  for (var groupName in groups) {
    groups[groupName](test, common);
  }
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters