Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve handling of non-ASCII characters in user directory search #15143

Merged
merged 8 commits into from
Feb 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/15143.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug where the user directory search was not case-insensitive for accented characters.
52 changes: 50 additions & 2 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import logging
import re
import unicodedata
from typing import (
TYPE_CHECKING,
Iterable,
Expand Down Expand Up @@ -491,6 +492,11 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None:
values={"display_name": display_name, "avatar_url": avatar_url},
)

# The display name that goes into the database index.
index_display_name = display_name
if index_display_name is not None:
index_display_name = _filter_text_for_index(index_display_name)

if isinstance(self.database_engine, PostgresEngine):
# We weight the localpart most highly, then display name and finally
# server name
Expand All @@ -508,11 +514,15 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None:
user_id,
get_localpart_from_id(user_id),
get_domain_from_id(user_id),
display_name,
index_display_name,
),
)
elif isinstance(self.database_engine, Sqlite3Engine):
value = "%s %s" % (user_id, display_name) if display_name else user_id
value = (
"%s %s" % (user_id, index_display_name)
if index_display_name
else user_id
)
self.db_pool.simple_upsert_txn(
txn,
table="user_directory_search",
Expand Down Expand Up @@ -897,6 +907,41 @@ async def search_user_dir(
return {"limited": limited, "results": results[0:limit]}


def _filter_text_for_index(text: str) -> str:
"""Transforms text before it is inserted into the user directory index, or searched
for in the user directory index.

Note that the user directory search table needs to be rebuilt whenever this function
changes.
"""
# Lowercase the text, to make searches case-insensitive.
# This is necessary for both PostgreSQL and SQLite. PostgreSQL's
# `to_tsquery/to_tsvector` functions don't lowercase non-ASCII characters when using
# the "C" collation, while SQLite just doesn't lowercase non-ASCII characters at
# all.
text = text.lower()

# Normalize the text. NFKC normalization has two effects:
# 1. It canonicalizes the text, ie. maps all visually identical strings to the same
# string. For example, ["e", "◌́"] is mapped to ["é"].
# 2. It maps strings that are roughly equivalent to the same string.
# For example, ["dž"] is mapped to ["d", "ž"], ["①"] to ["1"] and ["i⁹"] to
# ["i", "9"].
text = unicodedata.normalize("NFKC", text)

# Note that nothing is done to make searches accent-insensitive.
# That could be achieved by converting to NFKD form instead (with combining accents
# split out) and filtering out combining accents using `unicodedata.combining(c)`.
# The downside of this may be noisier search results, since search terms with
# explicit accents will match characters with no accents, or completely different
# accents.
#
# text = unicodedata.normalize("NFKD", text)
# text = "".join([c for c in text if not unicodedata.combining(c)])

return text


def _parse_query_sqlite(search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to database.
Expand All @@ -906,6 +951,7 @@ def _parse_query_sqlite(search_term: str) -> str:
We specifically add both a prefix and non prefix matching term so that
exact matches get ranked higher.
"""
search_term = _filter_text_for_index(search_term)

# Pull out the individual words, discarding any non-word characters.
results = _parse_words(search_term)
Expand All @@ -918,6 +964,8 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""
search_term = _filter_text_for_index(search_term)

escaped_words = []
for word in _parse_words(search_term):
# Postgres tsvector and tsquery quoting rules:
Expand Down
133 changes: 133 additions & 0 deletions tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,139 @@ def test_search_user_dir_start_of_user_id(self) -> None:
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_ascii_case_insensitivity(self) -> None:
"""Tests that a user can look up another user by searching for their name in a
different case.
"""
CHARLIE = "@someuser:example.org"
self.get_success(
self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None)
)

r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10))
self.assertFalse(r["limited"])
self.assertEqual(1, len(r["results"]))
self.assertDictEqual(
r["results"][0],
{"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_unicode_case_insensitivity(self) -> None:
"""Tests that a user can look up another user by searching for their name in a
different case.
"""
IVAN = "@someuser:example.org"
self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None))

r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10))
self.assertFalse(r["limited"])
self.assertEqual(1, len(r["results"]))
self.assertDictEqual(
r["results"][0],
{"user_id": IVAN, "display_name": "Иван", "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None:
"""Tests that a user can look up another user by searching for their name in a
different case, when their name contains dotted or dotless "i"s.

Some languages have dotted and dotless versions of "i", which are considered to
be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the
ASCII "i" and "I" code points, despite having different lowercase / uppercase
forms.
"""
USER = "@someuser:example.org"

expected_matches = [
# (search_term, display_name)
# A search for "i" should match "İ".
("iiiii", "İİİİİ"),
# A search for "I" should match "ı".
("IIIII", "ııııı"),
# A search for "ı" should match "I".
("ııııı", "IIIII"),
# A search for "İ" should match "i".
("İİİİİ", "iiiii"),
]

for search_term, display_name in expected_matches:
self.get_success(
self.store.update_profile_in_user_dir(USER, display_name, None)
)

r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
self.assertFalse(r["limited"])
self.assertEqual(
1,
len(r["results"]),
f"searching for {search_term!r} did not match {display_name!r}",
)
self.assertDictEqual(
r["results"][0],
{"user_id": USER, "display_name": display_name, "avatar_url": None},
)

# We don't test for negative matches, to allow implementations that consider all
# the i variants to be the same.

test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_unicode_normalization(self) -> None:
"""Tests that a user can look up another user by searching for their name with
either composed or decomposed accents.
"""
AMELIE = "@someuser:example.org"

expected_matches = [
# (search_term, display_name)
("Ame\u0301lie", "Amélie"),
("Amélie", "Ame\u0301lie"),
]

for search_term, display_name in expected_matches:
self.get_success(
self.store.update_profile_in_user_dir(AMELIE, display_name, None)
)

r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
self.assertFalse(r["limited"])
self.assertEqual(
1,
len(r["results"]),
f"searching for {search_term!r} did not match {display_name!r}",
squahtx marked this conversation as resolved.
Show resolved Hide resolved
)
self.assertDictEqual(
r["results"][0],
{"user_id": AMELIE, "display_name": display_name, "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_accent_insensitivity(self) -> None:
"""Tests that a user can look up another user by searching for their name
without any accents.
"""
AMELIE = "@someuser:example.org"
self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None))

r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10))
self.assertFalse(r["limited"])
self.assertEqual(1, len(r["results"]))
self.assertDictEqual(
r["results"][0],
{"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None},
)

# It may be desirable for "é"s in search terms to not match plain "e"s and we
# really don't want "é"s in search terms to match "e"s with different accents.
# But we don't test for this to allow implementations that consider all
# "e"-lookalikes to be the same.

test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore


class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
use_icu = True
Expand Down