Skip to content

Commit da06836

Browse files
authored
fix(full-text-search): fix fuzzy's prefix length, edit distance and idf (#68)
1 parent ff74219 commit da06836

File tree

6 files changed: 234 additions and 48 deletions.

packages/full-text-search/spec/generic/search/fuzzy.spec.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -103,13 +103,13 @@ describe("fuzzy query", () => {
103103
query = new QB().fuzzy("body", "ddddx").prefixLength(5).fuzziness(2).build();
104104
assertMatches(fts, query);
105105

106-
// Without prefix length (default should be 2).
106+
// Without prefix length (default should be 0).
107107
query = new QB().fuzzy("body", "aaaab").fuzziness(2).build();
108108
assertMatches(fts, query, [0, 1, 2, 3]);
109109
query = new QB().fuzzy("body", "aaabb").fuzziness(2).build();
110-
assertMatches(fts, query, [0, 1, 2, 3]);
111-
query = new QB().fuzzy("body", "aabbb").fuzziness(2).build();
112-
assertMatches(fts, query, [1, 2, 3]);
110+
assertMatches(fts, query, [0, 1, 2, 3, 4]);
111+
query = new QB().fuzzy("body", "abbbb").fuzziness(2).build();
112+
assertMatches(fts, query, [2, 3, 4, 5]);
113113

114114
// Empty.
115115
query = new QB().fuzzy("body", "").build();

packages/full-text-search/spec/node/MOCK_DATA.ts

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
export const DATA = [
22
{
33
"id": 1,
4-
"msg": "varius nulla facilisi cras non velit nec nisi vulputate nonummy maecenas tincidunt lacus at velit vivamus vel nulla eget eros elementum pellentesque quisque porta volutpat erat quisque erat eros viverra eget congue eget semper rutrum nulla nunc purus"
4+
"msg": "varius nulla facilisi cras non velit nec nisi vulputate nonummy maecenas tincidunt lacus at velit vivamus vel nulla eget eros elementum pellentesque quisque porta volutpat erat quisque erat eros viverra eget congue eget semper rutrum nulla nunc purus "
55
},
66
{
77
"id": 2,
@@ -658,67 +658,67 @@ export const DATA = [
658658
},
659659
{
660660
"id": 184,
661-
"msg": "ultrices phasellus id sapien in sapien iaculis congue vivamus metus arcu adipiscing molestie hendrerit at vulputate vitae nisl aenean lectus pellentesque eget nunc donec"
661+
"msg": "ultrices justns phasellus id sapien in sapien iaculis congue vivamus metus arcu adipiscing molestie hendrerit at vulputate vitae nisl aenean lectus pellentesque eget nunc donec"
662662
},
663663
{
664664
"id": 185,
665-
"msg": "pede malesuada in imperdiet et commodo vulputate justo in blandit ultrices enim lorem ipsum dolor sit amet consectetuer adipiscing elit proin interdum mauris non ligula pellentesque ultrices phasellus id"
665+
"msg": "pede jusnt malesuada in imperdiet et commodo vulputate justo in blandit ultrices enim lorem ipsum dolor sit amet consectetuer adipiscing elit proin interdum mauris non ligula pellentesque ultrices phasellus id"
666666
},
667667
{
668668
"id": 186,
669-
"msg": "lorem ipsum dolor sit amet consectetuer adipiscing elit proin interdum mauris non ligula pellentesque ultrices phasellus id sapien in sapien iaculis congue vivamus metus arcu adipiscing molestie hendrerit at vulputate vitae"
669+
"msg": "lorem justn ipsum dolor sit amet consectetuer adipiscing elit proin interdum mauris non ligula pellentesque ultrices phasellus id sapien in sapien iaculis congue vivamus metus arcu adipiscing molestie hendrerit at vulputate vitae"
670670
},
671671
{
672672
"id": 187,
673-
"msg": "congue risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse potenti cras in purus eu magna vulputate luctus cum sociis natoque penatibus et magnis dis"
673+
"msg": "congue ous risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse potenti cras in purus eu magna vulputate luctus cum sociis natoque penatibus et magnis dis"
674674
},
675675
{
676676
"id": 188,
677-
"msg": "in felis eu sapien cursus vestibulum proin eu mi nulla ac enim in tempor turpis nec euismod scelerisque quam turpis adipiscing lorem vitae mattis nibh ligula nec sem duis aliquam convallis nunc proin at turpis a pede posuere nonummy integer non velit donec diam neque vestibulum eget vulputate"
677+
"msg": "in felis jsu eu sapien cursus vestibulum proin eu mi nulla ac enim in tempor turpis nec euismod scelerisque quam turpis adipiscing lorem vitae mattis nibh ligula nec sem duis aliquam convallis nunc proin at turpis a pede posuere nonummy integer non velit donec diam neque vestibulum eget vulputate"
678678
},
679679
{
680680
"id": 189,
681-
"msg": "est quam pharetra magna ac consequat metus sapien ut nunc vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae mauris viverra diam vitae"
681+
"msg": "est justo quam pharetra magna ac consequat metus sapien ut nunc vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae mauris viverra diam vitae"
682682
},
683683
{
684684
"id": 190,
685-
"msg": "a suscipit nulla elit ac nulla sed vel enim sit amet nunc viverra dapibus nulla suscipit ligula in lacus curabitur at ipsum ac"
685+
"msg": "a suscipit jus nulla elit ac nulla sed vel enim sit amet nunc viverra dapibus nulla suscipit ligula in lacus curabitur at ipsum ac"
686686
},
687687
{
688688
"id": 191,
689-
"msg": "et magnis dis parturient montes nascetur ridiculus mus etiam vel augue vestibulum rutrum rutrum neque aenean auctor gravida sem praesent id massa id nisl venenatis lacinia aenean sit amet justo morbi ut odio cras mi pede malesuada in imperdiet et commodo"
689+
"msg": "et magnis xus dis parturient montes nascetur ridiculus mus etiam vel augue vestibulum rutrum rutrum neque aenean auctor gravida sem praesent id massa id nisl venenatis lacinia aenean sit amet justo morbi ut odio cras mi pede malesuada in imperdiet et commodo"
690690
},
691691
{
692692
"id": 192,
693-
"msg": "lacus morbi sem mauris laoreet ut rhoncus aliquet pulvinar sed nisl nunc rhoncus dui vel sem sed sagittis nam congue risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse"
693+
"msg": "lacus eu morbi sem mauris laoreet ut rhoncus aliquet pulvinar sed nisl nunc rhoncus dui vel sem sed sagittis nam congue risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse"
694694
},
695695
{
696696
"id": 193,
697-
"msg": "massa donec dapibus duis at velit eu est congue elementum in hac habitasse platea dictumst morbi vestibulum velit id pretium iaculis diam erat fermentum justo nec condimentum neque sapien placerat ante nulla justo aliquam quis turpis eget elit sodales scelerisque"
697+
"msg": "massa usx donec dapibus duis at velit eu est congue elementum in hac habitasse platea dictumst morbi vestibulum velit id pretium iaculis diam erat fermentum justo nec condimentum neque sapien placerat ante nulla justo aliquam quis turpis eget elit sodales scelerisque"
698698
},
699699
{
700700
"id": 194,
701-
"msg": "convallis nunc proin at turpis a pede posuere nonummy integer non velit donec diam neque vestibulum eget vulputate ut ultrices vel augue vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere"
701+
"msg": "convallis js js jus nunc proin at turpis a pede posuere nonummy integer non velit donec diam neque vestibulum eget vulputate ut ultrices vel augue vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere"
702702
},
703703
{
704704
"id": 195,
705-
"msg": "augue luctus tincidunt nulla mollis molestie lorem quisque ut erat curabitur gravida nisi at nibh in hac habitasse platea"
705+
"msg": "augue luctus us tincidunt nulla mollis molestie lorem quisque ut erat curabitur gravida nisi at nibh in hac habitasse platea"
706706
},
707707
{"id": 196, "msg": "proin leo odio porttitor id consequat in consequat ut nulla sed accumsan"},
708708
{
709709
"id": 197,
710-
"msg": "ut rhoncus aliquet pulvinar sed nisl nunc rhoncus dui vel sem sed sagittis nam congue risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse potenti cras"
710+
"msg": "ut rhoncus just aliquet pulvinar sed nisl nunc rhoncus dui vel sem sed sagittis nam congue risus semper porta volutpat quam pede lobortis ligula sit amet eleifend pede libero quis orci nullam molestie nibh in lectus pellentesque at nulla suspendisse potenti cras"
711711
},
712712
{
713713
"id": 198,
714-
"msg": "donec semper sapien a libero nam dui proin leo odio porttitor id consequat in consequat ut nulla sed accumsan felis ut at dolor quis odio consequat"
714+
"msg": "donec semper jur sapien a libero nam dui proin leo odio porttitor id consequat in consequat ut nulla sed accumsan felis ut at dolor quis odio consequat"
715715
},
716716
{
717717
"id": 199,
718-
"msg": "pede justo eu massa donec dapibus duis at velit eu Est congue elementum in hac habitasse platea dictumst morbi vestibulum velit id pretium iaculis diam erat"
718+
"msg": "pede jt massa donec dapibus duis at velit eu Est congue elementum in hac habitasse platea dictumst morbi vestibulum velit id pretium iaculis diam erat"
719719
},
720720
{
721721
"id": 200,
722-
"msg": "sagittis dui vel nisl duis ac nibh fusce lacus purus aliquet at feugiat non pretium quis lectus suspendisse potenti in eleifend quam"
722+
"msg": "sagittis jut dui vel nisl duis ac nibh fusce lacus purus aliquet at feugiat non pretium quis lectus suspendisse potenti in eleifend quam"
723723
}
724724
];

packages/full-text-search/spec/node/QUERIES.ts

Lines changed: 135 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,10 @@ export const QUERIES = [
3636
.build(),
3737
es: {
3838
fuzzy: {
39-
[FIELD_NAME_1]: "a"
39+
[FIELD_NAME_1]: {
40+
value: "a",
41+
transpositions: true
42+
}
4043
}
4144
}
4245
},
@@ -46,7 +49,10 @@ export const QUERIES = [
4649
.build(),
4750
es: {
4851
fuzzy: {
49-
[FIELD_NAME_1]: "este"
52+
[FIELD_NAME_1]: {
53+
value: "este",
54+
transpositions: true
55+
}
5056
}
5157
}
5258
},
@@ -58,7 +64,8 @@ export const QUERIES = [
5864
fuzzy: {
5965
[FIELD_NAME_1]: {
6066
value: "est",
61-
prefix_length: 3
67+
prefix_length: 3,
68+
transpositions: true
6269
}
6370
}
6471
}
@@ -72,7 +79,8 @@ export const QUERIES = [
7279
[FIELD_NAME_1]: {
7380
value: "ege",
7481
prefix_length: 3,
75-
fuzziness: 2
82+
fuzziness: 2,
83+
transpositions: true
7684
}
7785
}
7886
},
@@ -86,7 +94,69 @@ export const QUERIES = [
8694
fuzzy: {
8795
[FIELD_NAME_1]: {
8896
value: "est",
89-
fuzziness: 0
97+
fuzziness: 0,
98+
transpositions: true
99+
}
100+
}
101+
}
102+
},
103+
{
104+
fts: new QB()
105+
.fuzzy(FIELD_NAME_1, "just").fuzziness(2)
106+
.build(),
107+
es: {
108+
fuzzy: {
109+
[FIELD_NAME_1]: {
110+
value: "just",
111+
fuzziness: 2,
112+
transpositions: true
113+
}
114+
}
115+
}
116+
},
117+
{
118+
fts: new QB()
119+
.explain(true)
120+
.fuzzy(FIELD_NAME_1, "jus").fuzziness(1)
121+
.build(),
122+
es: {
123+
fuzzy: {
124+
[FIELD_NAME_1]: {
125+
value: "jus",
126+
fuzziness: 1,
127+
transpositions: true
128+
}
129+
}
130+
}
131+
},
132+
{
133+
fts: new QB()
134+
.explain(true)
135+
.fuzzy(FIELD_NAME_1, "jus").fuzziness(2).prefixLength(1)
136+
.build(),
137+
es: {
138+
fuzzy: {
139+
[FIELD_NAME_1]: {
140+
value: "jus",
141+
fuzziness: 2,
142+
prefix_length: 1,
143+
transpositions: true
144+
}
145+
}
146+
}
147+
},
148+
{
149+
fts: new QB()
150+
.explain(true)
151+
.fuzzy(FIELD_NAME_1, "js").fuzziness(2)
152+
.build(),
153+
es: {
154+
fuzzy: {
155+
[FIELD_NAME_1]: {
156+
value: "js",
157+
fuzziness: 2,
158+
prefix_length: 0,
159+
transpositions: true
90160
}
91161
}
92162
}
@@ -206,6 +276,7 @@ export const QUERIES = [
206276
},
207277
{
208278
fts: new QB()
279+
.explain(true)
209280
.bool()
210281
.beginMust().term(FIELD_NAME_1, "a").term(FIELD_NAME_1, "ac").endMust()
211282
.build(),
@@ -226,6 +297,65 @@ export const QUERIES = [
226297
}
227298
}
228299
},
300+
{
301+
fts: new QB()
302+
.explain(true)
303+
.bool()
304+
.beginMust().term(FIELD_NAME_1, "a").fuzzy(FIELD_NAME_1, "just").term(FIELD_NAME_1, "ac").endMust()
305+
.build(),
306+
es: {
307+
bool: {
308+
must: [
309+
{
310+
term: {
311+
[FIELD_NAME_1]: "a"
312+
}
313+
},
314+
{
315+
fuzzy: {
316+
[FIELD_NAME_1]: "just"
317+
}
318+
},
319+
{
320+
term: {
321+
[FIELD_NAME_1]: "ac"
322+
}
323+
}
324+
]
325+
}
326+
}
327+
},
328+
{
329+
fts: new QB()
330+
.explain(true)
331+
.bool()
332+
.beginMust().term(FIELD_NAME_1, "a").wildcard(FIELD_NAME_1, "j*").enableScoring(true).term(FIELD_NAME_1, "ac").endMust()
333+
.build(),
334+
es: {
335+
bool: {
336+
must: [
337+
{
338+
term: {
339+
[FIELD_NAME_1]: "a"
340+
}
341+
},
342+
{
343+
wildcard: {
344+
[FIELD_NAME_1]: {
345+
value: "j*",
346+
rewrite: "scoring_boolean"
347+
}
348+
}
349+
},
350+
{
351+
term: {
352+
[FIELD_NAME_1]: "ac"
353+
}
354+
}
355+
]
356+
}
357+
}
358+
},
229359
{
230360
fts: new QB()
231361
.bool()

packages/full-text-search/spec/node/elasticsearch.spec.ts

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,12 +5,11 @@ import {FullTextSearch} from "../../src/full_text_search";
55
import {Tokenizer} from "../../src/tokenizer";
66
import {Client} from "elasticsearch";
77
import {Scorer} from "../../src/scorer";
8-
import * as util from "util";
98

109
const INDEX_NAME = "test_index";
1110
const INDEX_TYPE = "MockUp";
1211
const FIELD_NAME_1 = "msg";
13-
const COMPARE_PRECISION = 1e4;
12+
const COMPARE_PRECISION = 1e3;
1413

1514
function fieldLengthES5(fieldLength: number) {
1615
// Lucene 5 uses a SmallFloat (size of 1 byte) to store the field length in scoring.
@@ -178,7 +177,7 @@ describe("Compare scoring against elasticsearch", () => {
178177
}
179178

180179
// Check if esHits should be empty.
181-
if (query.hasOwnProperty("empty") && query.empty === true) {
180+
if (query.empty === true) {
182181
expect(esHits.length).toEqual(0);
183182
done();
184183
return;
@@ -198,14 +197,14 @@ describe("Compare scoring against elasticsearch", () => {
198197
continue;
199198
}
200199

201-
let esScore = Math.round(esHits[j]._score * COMPARE_PRECISION) / COMPARE_PRECISION;
202200
let ftsScore = Math.round(ftsHits[esID].score * COMPARE_PRECISION) / COMPARE_PRECISION;
201+
let esScore = Math.round(esHits[j]._score * COMPARE_PRECISION) / COMPARE_PRECISION;
203202

204-
expect(esScore).toEqual(ftsScore);
203+
expect(ftsScore).toEqual(esScore);
205204
}
206205
done();
207206
})
208-
.catch(() => {
207+
.catch((e) => {
209208
expect(false).toBe(true);
210209
done();
211210
});
@@ -237,6 +236,8 @@ describe("Compare scoring against elasticsearch", () => {
237236
}
238237
},
239238
settings: {
239+
number_of_shards: 1,
240+
number_of_replicas: 1,
240241
analysis: {
241242
analyzer: {
242243
my_analyzer: {

0 commit comments

Comments
 (0)