Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 1cbc3f1

Browse files
reivilibreDavid Robertson
andauthored
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. (#15079)
Co-authored-by: David Robertson <davidr@element.io>
1 parent 7ee7f49 commit 1cbc3f1

File tree

4 files changed

+90
-5
lines changed

4 files changed

+90
-5
lines changed

changelog.d/15079.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error.

synapse/storage/databases/main/user_directory.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
918918
We use this so that we can add prefix matching, which isn't something
919919
that is supported by default.
920920
"""
921-
results = _parse_words(search_term)
921+
escaped_words = []
922+
for word in _parse_words(search_term):
923+
# Postgres tsvector and tsquery quoting rules:
924+
# words potentially containing punctuation should be quoted
925+
# and then existing quotes and backslashes should be doubled
926+
# See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY
927+
928+
quoted_word = word.replace("'", "''").replace("\\", "\\\\")
929+
escaped_words.append(f"'{quoted_word}'")
922930

923-
both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
924-
exact = " & ".join("%s" % (result,) for result in results)
925-
prefix = " & ".join("%s:*" % (result,) for result in results)
931+
both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
932+
exact = " & ".join("%s" % (word,) for word in escaped_words)
933+
prefix = " & ".join("%s:*" % (word,) for word in escaped_words)
926934

927935
return both, exact, prefix
928936

@@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
944952
if USE_ICU:
945953
return _parse_words_with_icu(search_term)
946954

955+
return _parse_words_with_regex(search_term)
956+
957+
958+
def _parse_words_with_regex(search_term: str) -> List[str]:
959+
"""
960+
Break down search term into words, when we don't have ICU available.
961+
See: `_parse_words`
962+
"""
947963
return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
948964

949965

tests/handlers/test_user_directory.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,13 @@ def test_excludes_appservice_sender(self) -> None:
192192
self.helper.join(room, self.appservice.sender, tok=self.appservice.token)
193193
self._check_only_one_user_in_directory(user, room)
194194

195+
def test_search_term_with_colon_in_it_does_not_raise(self) -> None:
196+
"""
197+
Regression test: Test that search terms with colons in them are acceptable.
198+
"""
199+
u1 = self.register_user("user1", "pass")
200+
self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10))
201+
195202
def test_user_not_in_users_table(self) -> None:
196203
"""Unclear how it happens, but on matrix.org we've seen join events
197204
for users who aren't in the users table. Test that we don't fall over

tests/storage/test_user_directory.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
from synapse.server import HomeServer
2626
from synapse.storage import DataStore
2727
from synapse.storage.background_updates import _BackgroundUpdateHandler
28+
from synapse.storage.databases.main import user_directory
29+
from synapse.storage.databases.main.user_directory import (
30+
_parse_words_with_icu,
31+
_parse_words_with_regex,
32+
)
2833
from synapse.storage.roommember import ProfileInfo
2934
from synapse.util import Clock
3035

@@ -42,7 +47,7 @@
4247
BOB = "@bob:b"
4348
BOBBY = "@bobby:a"
4449
# The localpart isn't 'Bela' on purpose so we can test looking up display names.
45-
BELA = "@somenickname:a"
50+
BELA = "@somenickname:example.org"
4651

4752

4853
class GetUserDirectoryTables:
@@ -423,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int:
423428

424429

425430
class UserDirectoryStoreTestCase(HomeserverTestCase):
431+
use_icu = False
432+
426433
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
427434
self.store = hs.get_datastores().main
428435

@@ -434,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
434441
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
435442
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))
436443

444+
self._restore_use_icu = user_directory.USE_ICU
445+
user_directory.USE_ICU = self.use_icu
446+
447+
def tearDown(self) -> None:
448+
user_directory.USE_ICU = self._restore_use_icu
449+
437450
def test_search_user_dir(self) -> None:
438451
# normally when alice searches the directory she should just find
439452
# bob because bobby doesn't share a room with her.
@@ -478,6 +491,26 @@ def test_search_user_dir_stop_words(self) -> None:
478491
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
479492
)
480493

494+
@override_config({"user_directory": {"search_all_users": True}})
495+
def test_search_user_dir_start_of_user_id(self) -> None:
496+
"""Tests that a user can look up another user by searching for the start
497+
of their user ID.
498+
"""
499+
r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10))
500+
self.assertFalse(r["limited"])
501+
self.assertEqual(1, len(r["results"]))
502+
self.assertDictEqual(
503+
r["results"][0],
504+
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
505+
)
506+
507+
508+
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
509+
use_icu = True
510+
511+
if not icu:
512+
skip = "Requires PyICU"
513+
481514

482515
class UserDirectoryICUTestCase(HomeserverTestCase):
483516
if not icu:
@@ -513,3 +546,31 @@ def test_icu_word_boundary(self) -> None:
513546
r["results"][0],
514547
{"user_id": ALICE, "display_name": display_name, "avatar_url": None},
515548
)
549+
550+
def test_icu_word_boundary_punctuation(self) -> None:
551+
"""
552+
Tests the behaviour of punctuation with the ICU tokeniser.
553+
554+
Seems to depend on underlying version of ICU.
555+
"""
556+
557+
# Note: either tokenisation is fine, because Postgres actually splits
558+
# words itself afterwards.
559+
self.assertIn(
560+
_parse_words_with_icu("lazy'fox jumped:over the.dog"),
561+
(
562+
# ICU 66 on Ubuntu 20.04
563+
["lazy'fox", "jumped", "over", "the", "dog"],
564+
# ICU 70 on Ubuntu 22.04
565+
["lazy'fox", "jumped:over", "the.dog"],
566+
),
567+
)
568+
569+
def test_regex_word_boundary_punctuation(self) -> None:
570+
"""
571+
Tests the behaviour of punctuation with the non-ICU tokeniser
572+
"""
573+
self.assertEqual(
574+
_parse_words_with_regex("lazy'fox jumped:over the.dog"),
575+
["lazy", "fox", "jumped", "over", "the", "dog"],
576+
)

0 commit comments

Comments
 (0)