Skip to content

Commit

Permalink
phone-search analyzer: don't emit int'l prefix
Browse files Browse the repository at this point in the history
this was an oversight in the initial implementation: if the tokenizer
emits the international calling prefix in the search analyzer then all
documents with the same international calling prefix will match.

e.g. when searching for `+1-555-123-4567` not only documents with this
number would match but also any other document with a `1` token (i.e.
any other number with this prefix).

thus the search functionality is currently broken for this analyzer,
making it useless.

the test coverage has now been extended to cover these and other
use-cases.

Signed-off-by: Ralph Ursprung <Ralph.Ursprung@avaloq.com>
  • Loading branch information
rursprung committed Jan 10, 2025
1 parent bbcbd21 commit 9e120b7
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix remote shards balance ([#15335](https://github.com/opensearch-project/OpenSearch/pull/15335))
- Always use `constant_score` query for `match_only_text` field ([#16964](https://github.com/opensearch-project/OpenSearch/pull/16964))
- Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868))
- The `phone-search` analyzer no longer emits the international calling code as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,12 @@ private Set<String> getTokens() throws IOException {
countryCode = Optional.of(String.valueOf(numberProto.getCountryCode()));
input = String.valueOf(numberProto.getNationalNumber());

// Add Country code, extension, and the number as tokens
tokens.add(countryCode.get());
if (addNgrams) {
// Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the
// same country code
tokens.add(countryCode.get());
}
// Add extension, and the number as tokens
tokens.add(countryCode.get() + input);
if (!Strings.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public void testEuropeDetailledSearch() throws IOException {
assertTokensAreInAnyOrder(
phoneSearchAnalyzer,
"tel:+441344840400",
Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400")
Arrays.asList("tel:+441344840400", "tel:", "441344840400", "1344840400")
);
}

Expand Down Expand Up @@ -189,21 +189,21 @@ public void testLocalNumberWithCH() throws IOException {
}

public void testSearchInternationalPrefixWithZZ() throws IOException {
assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010"));
}

public void testSearchInternationalPrefixWithCH() throws IOException {
assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010"));
}

public void testSearchNationalPrefixWithCH() throws IOException {
// + is equivalent to 00 in Switzerland
assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41583161010", "583161010"));
}

public void testSearchLocalNumberWithCH() throws IOException {
// when omitting the international prefix swiss numbers must start with '0'
assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010"));
assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41583161010", "583161010"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,20 @@
index: test
id: 1
body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" }
- do:
index:
index: test
id: 2
body: { "phone": "+41 58 316 99 99", "phone-ch": "058 316 99 99" }
- do:
index:
index: test
id: 2
body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" }
- do:
indices.refresh: {}

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
Expand All @@ -45,6 +56,7 @@
"phone": "+41583161010"
- match: { hits.total: 1 }

# correct national format & international format in search will always work
- do:
search:
rest_total_hits_as_int: true
Expand All @@ -54,3 +66,58 @@
match:
"phone-ch": "+41583161010"
- match: { hits.total: 1 }

# national format without country specified won't work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "0583161010"
- match: { hits.total: 0 }

# correct national format with country specified in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "0583161010"
- match: { hits.total: 1 }

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "+1 888 280 4331"
- match: { hits.total: 1 }

# international format in document & search will always work
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone-ch": "+1 888 280 4331"
- match: { hits.total: 1 }

# national format in search won't work if no country is specified
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
match:
"phone": "888 280 4331"
- match: { hits.total: 0 }

0 comments on commit 9e120b7

Please sign in to comment.