From cb2c36bb46810d7680047db2f0d7de8e598cef8c Mon Sep 17 00:00:00 2001 From: Ralph Ursprung Date: Fri, 10 Jan 2025 19:36:47 +0100 Subject: [PATCH] `phone-search` analyzer: don't emit sip/tel prefix, int'l prefix, extension & unformatted input (#16993) * `phone-search` analyzer: don't emit int'l prefix this was an oversight in the initial implementation: if the tokenizer emits the international calling prefix in the search analyzer then all documents with the same international calling prefix will match. e.g. when searching for `+1-555-123-4567` not only documents with this number would match but also any other document with a `1` token (i.e. any other number with this prefix). thus the search functionality is currently broken for this analyzer, making it useless. the test coverage has now been extended to cover these and other use-cases. Signed-off-by: Ralph Ursprung * `phone-search` analyzer: don't emit extension & unformatted input if these tokens are emitted it meant that phone numbers with other international dialling prefixes still matched. e.g. searching for `+1 1234` would also match a number stored as `+2 1234`, which was wrong. the tokens still need to be emited for the `phone` analyzer, e.g. when the user only enters the extension / local number it should still match, the same is with the other ngrams: these are needed for search-as-you-type style queries where the user input needs to match against partial phone numbers. Signed-off-by: Ralph Ursprung * `phone-search` analyzer: don't emit sip/tel prefix in line with the previous two commits, this is something else the search analyzer shouldn't emit since otherwise searching for any number with such a prefix will match _any_ document with the same prefix. Signed-off-by: Ralph Ursprung --------- Signed-off-by: Ralph Ursprung --- CHANGELOG.md | 1 + .../phone/PhoneNumberTermTokenizer.java | 23 ++- .../phone/PhoneNumberAnalyzerTests.java | 20 +-- .../test/analysis-phone/20_search.yml | 139 ++++++++++++++++++ 4 files changed, 167 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ded9553f88de5..2585e7fbb6816 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,6 +86,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Always use `constant_score` query for `match_only_text` field ([#16964](https://github.com/opensearch-project/OpenSearch/pull/16964)) - Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868)) - Fix multi-value sort for unsigned long ([#16732](https://github.com/opensearch-project/OpenSearch/pull/16732)) +- The `phone-search` analyzer no longer emits the tel/sip prefix, international calling code, extension numbers and unformatted input as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993)) ### Security diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java index 6b95594204eb4..e0541755a2b3e 100644 --- a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java @@ -98,7 +98,9 @@ private Set getTokens() throws IOException { // Rip off the "tel:" or "sip:" prefix if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { - tokens.add(input.substring(0, 4)); + if (addNgrams) { + tokens.add(input.substring(0, 4)); + } input = input.substring(4); } @@ -128,14 +130,23 @@ private Set getTokens() throws IOException { countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); input = String.valueOf(numberProto.getNationalNumber()); - // Add Country code, extension, and the number as tokens - tokens.add(countryCode.get()); + // add full number as tokens tokens.add(countryCode.get() + input); - if (!Strings.isEmpty(numberProto.getExtension())) { - tokens.add(numberProto.getExtension()); + + if (addNgrams) { + // Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the + // same country code + tokens.add(countryCode.get()); + + // Add extension without country code (not done for search analyzer as that might match numbers in other countries as + // well!) + if (!Strings.isEmpty(numberProto.getExtension())) { + tokens.add(numberProto.getExtension()); + } + // Add unformatted input (most likely the same as the extension now since the prefix has been removed) + tokens.add(input); } - tokens.add(input); } } catch (final NumberParseException | StringIndexOutOfBoundsException e) { // Libphone didn't like it, no biggie. We'll just ngram the number as it is. diff --git a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java index 332f6d21f47d6..503cee9cc710f 100644 --- a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java +++ b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java @@ -87,11 +87,7 @@ public void testEuropeDetailled() throws IOException { * Test for all tokens which are emitted by the "phone" analyzer. */ public void testEuropeDetailledSearch() throws IOException { - assertTokensAreInAnyOrder( - phoneSearchAnalyzer, - "tel:+441344840400", - Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400") - ); + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+441344840400", Arrays.asList("tel:+441344840400", "441344840400")); } public void testEurope() throws IOException { @@ -163,7 +159,11 @@ public void testSipWithoutDomainPart() throws IOException { } public void testTelPrefix() throws IOException { - assertTokensInclude("tel:+1228", Arrays.asList("1228", "122", "228")); + assertTokensInclude(phoneAnalyzer, "tel:+1228", Arrays.asList("tel:+1228", "tel:", "1228", "122", "228")); + } + + public void testTelPrefixSearch() throws IOException { + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "tel:+1228", Arrays.asList("tel:+1228", "1228")); } public void testNumberPrefix() throws IOException { @@ -189,21 +189,21 @@ public void testLocalNumberWithCH() throws IOException { } public void testSearchInternationalPrefixWithZZ() throws IOException { - assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010")); } public void testSearchInternationalPrefixWithCH() throws IOException { - assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("+41583161010", "41583161010")); } public void testSearchNationalPrefixWithCH() throws IOException { // + is equivalent to 00 in Switzerland - assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("0041583161010", "41583161010")); } public void testSearchLocalNumberWithCH() throws IOException { // when omitting the international prefix swiss numbers must start with '0' - assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensAreInAnyOrder(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("0583161010", "41583161010")); } /** diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml index 0bd7d2c371bfc..1c51bfa3c5347 100644 --- a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml +++ b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml @@ -32,9 +32,37 @@ index: test id: 1 body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" } + - do: + index: + index: test + id: 2 + body: { "phone": "+41 58 316 99 99", "phone-ch": "058 316 99 99" } + - do: + index: + index: test + id: 3 + # number not used in the examples below, just present to make sure that it's never matched + body: { "phone": "+41 12 345 67 89", "phone-ch": "012 345 67 89" } + - do: + index: + index: test + id: 4 + # germany has a different phone number length, but for this test we ignore it and pretend they're the same + body: { "phone": "+49 58 316 10 10", "phone-ch": "+49 58 316 10 10" } + - do: + index: + index: test + id: 5 + body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" } + - do: + index: + index: test + id: 6 + body: { "phone": "tel:+441344840400", "phone-ch": "tel:+441344840400" } - do: indices.refresh: {} + # international format in document & search will always work - do: search: rest_total_hits_as_int: true @@ -45,6 +73,7 @@ "phone": "+41583161010" - match: { hits.total: 1 } + # correct national format & international format in search will always work - do: search: rest_total_hits_as_int: true @@ -54,3 +83,113 @@ match: "phone-ch": "+41583161010" - match: { hits.total: 1 } + + # national format without country specified won't work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "0583161010" + - match: { hits.total: 0 } + + # correct national format with country specified in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "0583161010" + - match: { hits.total: 1 } + + # search-as-you-type style query + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+4158316" + - match: { hits.total: 2 } + + # search-as-you-type style query + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "058316" + - match: { hits.total: 2 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+1 888 280 4331" + - match: { hits.total: 1 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "+1 888 280 4331" + - match: { hits.total: 1 } + + # national format in search won't work if no country is specified + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "888 280 4331" + - match: { hits.total: 0 } + + # document & search have a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "tel:+441344840400" + - match: { hits.total: 1 } + + # only document has a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+441344840400" + - match: { hits.total: 1 } + + # only search has a tel: prefix + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "tel:+1 888 280 4331" + - match: { hits.total: 1 }