Skip to content

Commit 1815843

Browse files
committed
feat: add Typesense backend for search
Also run `make upgrade` to be able to successfully compile dependencies. Private-ref: https://tasks.opencraft.com/browse/BB-9975
1 parent 51ec507 commit 1815843

File tree

11 files changed

+1344
-511
lines changed

11 files changed

+1344
-511
lines changed

forum/search/typesense.py

Lines changed: 370 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
"""
2+
Typesense backend for searching comments and threads.
3+
"""
4+
5+
from typing import Any, Optional
6+
7+
from bs4 import BeautifulSoup
8+
from django.conf import settings
9+
from django.core.paginator import Paginator
10+
from typesense import Client
11+
from typesense.types.collection import CollectionCreateSchema
12+
from typesense.types.document import DocumentSchema, SearchParameters
13+
14+
from forum.backends.mysql.models import Comment, CommentThread
15+
from forum.constants import FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT
16+
from forum.search.base import (
17+
BaseDocumentSearchBackend,
18+
BaseIndexSearchBackend,
19+
BaseSearchBackend,
20+
BaseThreadSearchBackend,
21+
)
22+
23+
# Module-level cache for the Typesense client. It starts out as None and is
# populated on the first call to get_typesense_client().
_TYPESENSE_CLIENT: Client | None = None
24+
25+
26+
def get_typesense_client() -> Client:
    """
    Return the shared Typesense client, creating it on first use.

    The client is configured from the ``TYPESENSE_API_KEY`` and
    ``TYPESENSE_URL`` Django settings and then cached in the module-level
    ``_TYPESENSE_CLIENT`` variable, so later calls return the same instance.
    """
    global _TYPESENSE_CLIENT
    if _TYPESENSE_CLIENT is not None:
        return _TYPESENSE_CLIENT

    _TYPESENSE_CLIENT = Client(
        {
            "api_key": settings.TYPESENSE_API_KEY,
            "nodes": [settings.TYPESENSE_URL],
        }
    )
    return _TYPESENSE_CLIENT
39+
40+
41+
class CommentsIndex:
    """
    Definition of the Typesense index that stores forum comments.

    Bundles the Django model, the collection name and schema, and the helpers
    that turn comment data into documents and search requests.
    """

    model = Comment

    @staticmethod
    def name() -> str:
        """
        Return the name of the comments index, including the configured prefix.
        """
        prefix = settings.TYPESENSE_COLLECTION_PREFIX
        return prefix + "comments"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the schema used to create the comments index.

        Every field is a plain string field.
        """
        field_names = ("course_id", "comment_thread_id", "body")
        return {
            "name": cls.name(),
            "fields": [
                {"name": field_name, "type": "string"} for field_name in field_names
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Convert comment data into a document for the comments index.

        The body is reduced to its text content (markup is stripped with
        BeautifulSoup); a missing or empty body becomes an empty string.
        """
        # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course".
        if data.get("body"):
            body_text = BeautifulSoup(data["body"], features="html.parser").get_text()
        else:
            body_text = ""

        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "comment_thread_id": str(data.get("comment_thread_id", "")),
            "body": body_text,
        }

    @staticmethod
    def build_search_parameters(
        *, search_text: str, course_id: str | None
    ) -> SearchParameters:
        """
        Build the search request for a text query against comment bodies.

        When a course_id is given, results are restricted to that course;
        otherwise the filter is left empty.
        """
        course_filter = ""
        if course_id:
            course_filter = f"course_id:={quote_filter_value(course_id)}"

        return {
            "q": search_text,
            "query_by": "body",
            "filter_by": course_filter,
            "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
        }
98+
99+
100+
class CommentThreadsIndex:
    """
    Common data and operations relating to the comment threads index.
    """

    model = CommentThread

    @staticmethod
    def name() -> str:
        """
        Return the Typesense index name for the index.
        """
        return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the schema used to create the comment threads index.

        All fields are plain string fields. The field names here must match
        the keys produced by build_document and the field names used in
        build_search_parameters.
        """
        return {
            "name": cls.name(),
            "fields": [
                {"name": "course_id", "type": "string"},
                {"name": "commentable_id", "type": "string"},
                {"name": "context", "type": "string"},
                {"name": "title", "type": "string"},
                {"name": "body", "type": "string"},
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Build a Typesense document for this index.

        The body is reduced to its text content (markup is stripped with
        BeautifulSoup); a missing or empty body becomes an empty string.
        """
        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "commentable_id": str(data.get("commentable_id", "")),
            "context": str(data.get("context", "")),
            "title": str(data.get("title", "")),
            "body": (
                BeautifulSoup(data["body"], features="html.parser").get_text()
                if data.get("body")
                else ""
            ),
        }

    @staticmethod
    def build_search_parameters(
        *,
        search_text: str,
        course_id: str | None,
        context: str,
        commentable_ids: list[str] | None,
    ) -> SearchParameters:
        """
        Build Typesense search parameters for this index.

        The query runs against the title and body fields. Results are always
        filtered by context, and additionally by commentable ID and course ID
        when those are provided.
        """
        # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy.
        filters = [f"context:{quote_filter_value(context)}"]
        if commentable_ids:
            safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids)
            # The indexed field is `commentable_id` (singular, see schema()); filtering
            # on the parameter's plural name would target a field that does not exist.
            filters.append(f"commentable_id:[{safe_ids}]")
        if course_id:
            filters.append(f"course_id:={quote_filter_value(course_id)}")

        return {
            "q": search_text,
            "query_by": "title,body",
            "filter_by": " && ".join(filters),
            "per_page": FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
        }
170+
171+
172+
# Registry of the index definitions handled by this backend, keyed by the
# index name that callers pass to the document backend.
INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = dict(
    comments=CommentsIndex,
    comment_threads=CommentThreadsIndex,
)
176+
177+
178+
class TypesenseDocumentBackend(BaseDocumentSearchBackend):
    """
    Typesense implementation of the per-document operations (index, update, delete).
    """

    def index_document(
        self, index_name: str, doc_id: str | int, document: dict[str, Any]
    ) -> None:
        """
        Upsert a single document into the collection registered under index_name.

        Raises KeyError if index_name is not a key of INDICES.
        """
        client = get_typesense_client()
        index = INDICES[index_name]
        collection = client.collections[index.name()]
        payload = index.build_document(doc_id, document)
        collection.documents.upsert(payload)

    def update_document(
        self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
    ) -> None:
        """
        Update a document by delegating to index_document, which upserts.

        NOTE(review): build_document fills keys missing from its input with
        empty strings, so if callers pass only the changed fields here the
        remaining fields would presumably be overwritten with empty values.
        Confirm that callers always pass the complete document.
        """
        return self.index_document(index_name, doc_id, update_data)

    def delete_document(self, index_name: str, doc_id: str | int) -> None:
        """
        Remove a document from the collection registered under index_name.

        The request is sent with ``ignore_not_found`` enabled.
        """
        client = get_typesense_client()
        index = INDICES[index_name]
        collection = client.collections[index.name()]
        collection.documents[str(doc_id)].delete(
            delete_parameters={"ignore_not_found": True},
        )
211+
212+
213+
class TypesenseIndexBackend(BaseIndexSearchBackend):
    """
    Manage indexes for the Typesense backend.

    Typesense calls these "collections". https://typesense.org/docs/29.0/api/collections.html
    """

    def initialize_indices(self, force_new_index: bool = False) -> None:
        """
        Initialize the indices in Typesense.

        If force_new_index is True, the indexes will be dropped before being recreated.
        Otherwise, collections that already exist are left untouched, so this
        method can safely be called more than once.
        """
        # Local import: keeps this fix self-contained without touching the
        # module-level import block.
        from typesense.exceptions import ObjectAlreadyExists, ObjectNotFound

        client = get_typesense_client()
        for index in INDICES.values():
            if force_new_index:
                try:
                    client.collections[index.name()].delete()
                except ObjectNotFound:
                    # Nothing to drop yet (e.g. very first initialization).
                    pass
            try:
                client.collections.create(index.schema())
            except ObjectAlreadyExists:
                # A forced rebuild must never silently reuse an old collection;
                # in the non-forced case an existing collection is the desired state.
                if force_new_index:
                    raise

    def rebuild_indices(
        self, batch_size: int = 500, extra_catchup_minutes: int = 5
    ) -> None:
        """
        Reindex everything in Typesense

        The Typesense collections are dropped and recreated,
        and data is reindexed from the MySQL database.

        Only MySQL-backed instances are supported.
        Note that the `extra_catchup_minutes` argument is ignored.
        """
        client = get_typesense_client()
        self.initialize_indices(force_new_index=True)
        for index in INDICES.values():
            # Paginating an unordered queryset does not guarantee stable pages
            # (rows may be repeated or skipped), so order by primary key.
            queryset = index.model.objects.all().order_by("pk")
            paginator = Paginator(queryset, per_page=batch_size)
            for page_number in paginator.page_range:
                page = paginator.get_page(page_number)
                documents = [
                    index.build_document(obj.pk, obj.doc_to_hash())
                    for obj in page.object_list
                ]
                if documents:
                    # NOTE(review): the return value of import_ is not inspected here;
                    # presumably it reports per-document success - confirm whether
                    # failures should be surfaced.
                    client.collections[index.name()].documents.import_(
                        documents, {"action": "upsert"}
                    )

    def validate_indices(self) -> None:
        """
        Check if the indices exist and are valid.

        Raise an exception if any do not exist or if any are not valid.

        A collection is considered valid when its name and its mapping of
        field name to field type match the expected schema. Other attributes
        returned by the server are ignored, because the retrieved collection
        contains more information than the schema it was created from.
        """
        client = get_typesense_client()
        for index in INDICES.values():
            expected = index.schema()
            collection = client.collections[index.name()].retrieve()

            expected_fields = {
                field["name"]: field["type"] for field in expected["fields"]
            }
            found_fields = {
                field.get("name"): field.get("type")
                for field in collection.get("fields", [])
            }
            name_matches = collection.get("name") == expected["name"]
            if not name_matches or found_fields != expected_fields:
                print(f"Expected schema: {expected}")
                print(f"Found schema: {collection}")
                raise AssertionError(f"Collection {index.name()} exists, but schema does not match expected.")

    def refresh_indices(self) -> None:
        """
        Noop on Typesense, as all write API operations are synchronous.

        See https://typesense.org/docs/guide/migrating-from-algolia.html#synchronous-write-apis for more information.
        """
        return None

    def delete_unused_indices(self) -> int:
        """
        Noop on Typesense.

        Always returns 0.
        """
        return 0
289+
290+
291+
def quote_filter_value(value: str) -> str:
    """
    Return value wrapped in backticks so it can be embedded in a Typesense filter.

    Any backticks already present in value are removed first, so the result
    always contains exactly one opening and one closing backtick.

    https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters
    """
    # TODO: It may be possible to escape backticks, rather than removing them.
    # It also may be possible that there are issues with the backtick method.
    # See https://github.com/typesense/typesense/issues/196
    without_backticks = value.replace("`", "")
    return f"`{without_backticks}`"
301+
302+
303+
class TypesenseThreadSearchBackend(BaseThreadSearchBackend):
    """
    Typesense implementation of thread search.
    """

    def get_thread_ids(
        self,
        context: str,
        # This argument is unsupported. Anyway, its only role was to boost some results,
        # which did not have much effect because they are shuffled anyway downstream.
        group_ids: list[int],
        search_text: str,
        # This parameter is unsupported, but as far as we know it's not used anywhere.
        sort_criteria: Optional[list[dict[str, str]]] = None,
        commentable_ids: Optional[list[str]] = None,
        course_id: Optional[str] = None,
    ) -> list[str]:
        """
        Return the IDs of threads matching the search text.

        Threads are matched directly through the comment threads index and,
        for the "course" context without commentable IDs, also indirectly
        through their comments. Each thread ID appears once in the result;
        the order of the returned list is not meaningful.
        """
        client = get_typesense_client()
        matching_ids: set[str] = set()

        # All comments have "course" as their context, and none of them have a commentable_id.
        search_comments = context == "course" and not commentable_ids
        if search_comments:
            comment_documents = client.collections[CommentsIndex.name()].documents
            comment_response = comment_documents.search(
                CommentsIndex.build_search_parameters(
                    search_text=search_text, course_id=course_id
                )
            )
            matching_ids.update(
                hit["document"]["comment_thread_id"]
                for hit in comment_response.get("hits", [])
            )

        thread_documents = client.collections[CommentThreadsIndex.name()].documents
        thread_response = thread_documents.search(
            CommentThreadsIndex.build_search_parameters(
                search_text=search_text,
                course_id=course_id,
                context=context,
                commentable_ids=commentable_ids,
            )
        )
        matching_ids.update(
            hit["document"]["id"] for hit in thread_response.get("hits", [])
        )

        return list(matching_ids)

    def get_suggested_text(self, search_text: str) -> Optional[str]:
        """
        Return a suggested replacement for the search text.

        Suggestions are not implemented for Typesense yet, so this always
        returns None.

        :param search_text: Text to search for suggestions
        :return: Suggested text or None
        """
        # TODO: https://typesense.org/docs/guide/query-suggestions.html
        # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text?
        return None
361+
362+
363+
class TypesenseBackend(BaseSearchBackend):
    """
    Search backend that wires together the Typesense implementations.

    Each attribute names the class responsible for one area: thread search,
    index management, and per-document operations.
    """

    THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend
    INDEX_SEARCH_CLASS = TypesenseIndexBackend
    DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend

forum/settings/common.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ def plugin_settings(settings: Any) -> None:
1010
Common settings for forum app
1111
"""
1212
# Search backend
13-
if getattr(settings, "MEILISEARCH_ENABLED", False):
13+
if getattr(settings, "TYPESENSE_ENABLED", False):
14+
settings.FORUM_SEARCH_BACKEND = getattr(
15+
settings,
16+
"FORUM_SEARCH_BACKEND",
17+
"forum.search.typesense.TypesenseBackend",
18+
)
19+
elif getattr(settings, "MEILISEARCH_ENABLED", False):
1420
settings.FORUM_SEARCH_BACKEND = getattr(
1521
settings,
1622
"FORUM_SEARCH_BACKEND",

requirements/base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ requests
1111
pymongo
1212
elasticsearch
1313
edx-search # meilisearch backend
14+
typesense
1415
mysqlclient

0 commit comments

Comments
 (0)