Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Unified search query syntax using the full-text search capabilities o…
Browse files Browse the repository at this point in the history
…f the underlying DB. (#11635)

Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.

Supports, with the same syntax across Postgresql 11+ and Sqlite:

- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
  documents containing "dogs". 

This is achieved by 

- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
  query syntax.

Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.

Multiple terms separated by a space are implicitly ANDed.

Note that:

1. There is no escaping of full-text syntax that might be supported by the database;
  e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
  as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
  language needs to be known at the time of indexing the message (via room metadata,
  or otherwise), or a separate index for each language supported could be created.

Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
  • Loading branch information
novocaine authored Oct 25, 2022
1 parent 85fcbba commit d902181
Show file tree
Hide file tree
Showing 5 changed files with 454 additions and 35 deletions.
1 change: 1 addition & 0 deletions changelog.d/11635.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Allow use of postgres and sqllite full-text search operators in search queries.
197 changes: 162 additions & 35 deletions synapse/storage/databases/main/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import enum
import logging
import re
from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple
from collections import deque
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Collection,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)

import attr

Expand All @@ -27,7 +39,7 @@
LoggingTransaction,
)
from synapse.storage.databases.main.events_worker import EventRedactBehaviour
from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
from synapse.storage.engines import PostgresEngine, Sqlite3Engine
from synapse.types import JsonDict

if TYPE_CHECKING:
Expand Down Expand Up @@ -421,8 +433,6 @@ async def search_msgs(
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args: List[Any] = []

# Make sure we don't explode because the person is in too many rooms.
Expand All @@ -444,20 +454,24 @@ async def search_msgs(
count_clauses = clauses

if isinstance(self.database_engine, PostgresEngine):
search_query = search_term
tsquery_func = self.database_engine.tsquery_func
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank,"
" room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):
search_query = _parse_query_for_sqlite(search_term)

sql = (
"SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
" FROM event_search"
Expand All @@ -469,7 +483,7 @@ async def search_msgs(
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ?"
)
count_args = [search_term] + count_args
count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
Expand Down Expand Up @@ -501,7 +515,9 @@ async def search_msgs(

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

Expand All @@ -510,7 +526,6 @@ async def search_msgs(
)

count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)

return {
"results": [
{"event": event_map[r["event_id"]], "rank": r["rank"]}
Expand Down Expand Up @@ -542,9 +557,6 @@ async def search_rooms(
Each match as a dictionary.
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args: List[Any] = []

# Make sure we don't explode because the person is in too many rooms.
Expand Down Expand Up @@ -582,20 +594,23 @@ async def search_rooms(
args.extend([origin_server_ts, origin_server_ts, stream])

if isinstance(self.database_engine, PostgresEngine):
search_query = search_term
tsquery_func = self.database_engine.tsquery_func
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank,"
" origin_server_ts, stream_ordering, room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):

# We use CROSS JOIN here to ensure we use the right indexes.
# https://sqlite.org/optoverview.html#crossjoin
#
Expand All @@ -614,13 +629,14 @@ async def search_rooms(
" CROSS JOIN events USING (event_id)"
" WHERE "
)
search_query = _parse_query_for_sqlite(search_term)
args = [search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ? AND "
)
count_args = [search_term] + count_args
count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
Expand Down Expand Up @@ -660,7 +676,9 @@ async def search_rooms(

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

Expand All @@ -686,7 +704,7 @@ async def search_rooms(
}

async def _find_highlights_in_postgres(
self, search_query: str, events: List[EventBase]
self, search_query: str, events: List[EventBase], tsquery_func: str
) -> Set[str]:
"""Given a list of events and a search term, return a list of words
that match from the content of the event.
Expand All @@ -697,6 +715,7 @@ async def _find_highlights_in_postgres(
Args:
search_query
events: A list of events
tsquery_func: The tsquery_* function to use when making queries
Returns:
A set of strings.
Expand Down Expand Up @@ -729,7 +748,7 @@ def f(txn: LoggingTransaction) -> Set[str]:
while stop_sel in value:
stop_sel += ">"

query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % (
_to_postgres_options(
{
"StartSel": start_sel,
Expand Down Expand Up @@ -760,20 +779,128 @@ def _to_postgres_options(options_dict: JsonDict) -> str:
return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)


def _parse_query(database_engine: BaseDatabaseEngine, search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to database.
We use this so that we can add prefix matching, which isn't something
that is supported by default.
@dataclass
class Phrase:
phrase: List[str]


class SearchToken(enum.Enum):
Not = enum.auto()
Or = enum.auto()
And = enum.auto()


Token = Union[str, Phrase, SearchToken]
TokenList = List[Token]


def _is_stop_word(word: str) -> bool:
# TODO Pull these out of the dictionary:
# https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop
return word in {"the", "a", "you", "me", "and", "but"}


def _tokenize_query(query: str) -> TokenList:
"""
Convert the user-supplied `query` into a TokenList, which can be translated into
some DB-specific syntax.
The following constructs are supported:
- phrase queries using "double quotes"
- case-insensitive `or` and `and` operators
- negation of a keyword via unary `-`
- unary hyphen to denote NOT e.g. 'include -exclude'
The following differs from websearch_to_tsquery:
- Stop words are not removed.
- Unclosed phrases are treated differently.
"""
tokens: TokenList = []

# Find phrases.
in_phrase = False
parts = deque(query.split('"'))
for i, part in enumerate(parts):
# The contents inside double quotes is treated as a phrase, a trailing
# double quote is not implied.
in_phrase = bool(i % 2) and i != (len(parts) - 1)

# Pull out the individual words, discarding any non-word characters.
words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE))

# Phrases have simplified handling of words.
if in_phrase:
# Skip stop words.
phrase = [word for word in words if not _is_stop_word(word)]

# Consecutive words are implicitly ANDed together.
if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
tokens.append(SearchToken.And)

# Add the phrase.
tokens.append(Phrase(phrase))
continue

# Otherwise, not in a phrase.
while words:
word = words.popleft()

if word.startswith("-"):
tokens.append(SearchToken.Not)

# If there's more word, put it back to be processed again.
word = word[1:]
if word:
words.appendleft(word)
elif word.lower() == "or":
tokens.append(SearchToken.Or)
else:
# Skip stop words.
if _is_stop_word(word):
continue

# Consecutive words are implicitly ANDed together.
if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
tokens.append(SearchToken.And)

# Add the search term.
tokens.append(word)

return tokens


def _tokens_to_sqlite_match_query(tokens: TokenList) -> str:
"""
Convert the list of tokens to a string suitable for passing to sqlite's MATCH.
Assume sqlite was compiled with enhanced query syntax.
Ref: https://www.sqlite.org/fts3.html#full_text_index_queries
"""
match_query = []
for token in tokens:
if isinstance(token, str):
match_query.append(token)
elif isinstance(token, Phrase):
match_query.append('"' + " ".join(token.phrase) + '"')
elif token == SearchToken.Not:
# TODO: SQLite treats NOT as a *binary* operator. Hopefully a search
# term has already been added before this.
match_query.append(" NOT ")
elif token == SearchToken.Or:
match_query.append(" OR ")
elif token == SearchToken.And:
match_query.append(" AND ")
else:
raise ValueError(f"unknown token {token}")

return "".join(match_query)

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)

if isinstance(database_engine, PostgresEngine):
return " & ".join(result + ":*" for result in results)
elif isinstance(database_engine, Sqlite3Engine):
return " & ".join(result + "*" for result in results)
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
def _parse_query_for_sqlite(search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to sqllite's matchinfo().
"""
return _tokens_to_sqlite_match_query(_tokenize_query(search_term))
16 changes: 16 additions & 0 deletions synapse/storage/engines/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,22 @@ def supports_returning(self) -> bool:
"""Do we support the `RETURNING` clause in insert/update/delete?"""
return True

@property
def tsquery_func(self) -> str:
"""
Selects a tsquery_* func to use.
Ref: https://www.postgresql.org/docs/current/textsearch-controls.html
Returns:
The function name.
"""
# Postgres 11 added support for websearch_to_tsquery.
assert self._version is not None
if self._version >= 110000:
return "websearch_to_tsquery"
return "plainto_tsquery"

def is_deadlock(self, error: Exception) -> bool:
if isinstance(error, psycopg2.DatabaseError):
# https://www.postgresql.org/docs/current/static/errcodes-appendix.html
Expand Down
Loading

0 comments on commit d902181

Please sign in to comment.