Skip to content

Commit b1c75f2

Browse files
committed
wip: feat: add Typesense backend for search
Also run `make upgrade` to be able to successfully compile dependencies. Private-ref: https://tasks.opencraft.com/browse/BB-9975
1 parent 51ec507 commit b1c75f2

File tree

11 files changed

+1336
-510
lines changed

11 files changed

+1336
-510
lines changed

forum/search/typesense.py

Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
"""
2+
Typesense backend for searching comments and threads.
3+
"""
4+
5+
from typing import Any, Optional
6+
7+
from bs4 import BeautifulSoup
8+
from django.conf import settings
9+
from django.core.paginator import Paginator
10+
from typesense import Client
11+
from typesense.types.collection import CollectionCreateSchema
12+
from typesense.types.document import DocumentSchema, SearchParameters
13+
14+
from forum.backends.mysql.models import Comment, CommentThread
15+
from forum.constants import FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT
16+
from forum.search.base import (
17+
BaseDocumentSearchBackend,
18+
BaseIndexSearchBackend,
19+
BaseSearchBackend,
20+
BaseThreadSearchBackend,
21+
)
22+
23+
_TYPESENSE_CLIENT: Client | None = None
24+
25+
26+
def get_typesense_client() -> Client:
    """
    Return the shared Typesense client, creating it on first use.

    The client is built from ``settings.TYPESENSE_API_KEY`` and ``settings.TYPESENSE_URL``
    and stored in the module-level ``_TYPESENSE_CLIENT``, so later calls reuse the same instance.
    """
    global _TYPESENSE_CLIENT
    # Fast path: a client was already built by an earlier call.
    if _TYPESENSE_CLIENT is not None:
        return _TYPESENSE_CLIENT

    _TYPESENSE_CLIENT = Client(
        {
            "api_key": settings.TYPESENSE_API_KEY,
            "nodes": [settings.TYPESENSE_URL],
        }
    )
    return _TYPESENSE_CLIENT
39+
40+
41+
class CommentsIndex:
    """
    Common data and operations relating to the comments index.
    """

    model = Comment

    # Typesense only returns up to 250 hits per page, so the requested page size is capped at this value.
    MAX_PER_PAGE = 250

    @staticmethod
    def name() -> str:
        """
        Return the Typesense index name for the index.

        The name is the configured ``TYPESENSE_COLLECTION_PREFIX`` followed by "comments".
        """
        return settings.TYPESENSE_COLLECTION_PREFIX + "comments"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the Typesense collection schema for this index.
        """
        return {
            "name": cls.name(),
            "fields": [
                {"name": "course_id", "type": "string"},
                {"name": "comment_thread_id", "type": "string"},
                {"name": "body", "type": "string"},
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Build a Typesense document for this index.

        :param doc_id: Identifier of the comment; stored as a string in the "id" field.
        :param data: Mapping that may hold "course_id", "comment_thread_id" and "body".
            Missing keys are indexed as empty strings.
        :return: The document to send to Typesense, with HTML markup stripped from the body.
        """
        # TODO: what happens if any fields are empty strings?
        # NOTE: Comments have no commentable_id or title, and the context is hardcoded to "course".
        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "comment_thread_id": str(data.get("comment_thread_id", "")),
            "body": (
                BeautifulSoup(data["body"], features="html.parser").get_text()
                if data.get("body")
                else ""
            ),
        }

    @staticmethod
    def build_search_parameters(
        *, search_text: str, course_id: str | None
    ) -> SearchParameters:
        """
        Build Typesense search parameters for this index.

        :param search_text: Text to look for in the comment bodies.
        :param course_id: When given, restrict the results to this exact course id.
        :return: Search parameters; the page size never exceeds ``MAX_PER_PAGE``.
        """
        return {
            "q": search_text,
            "query_by": "body",
            "filter_by": (
                f"course_id:={quote_filter_value(course_id)}" if course_id else ""
            ),
            # Cap the page size: Typesense rejects requests for more than 250 hits per page.
            "per_page": min(
                FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, CommentsIndex.MAX_PER_PAGE
            ),
        }
99+
100+
101+
class CommentThreadsIndex:
    """
    Common data and operations relating to the comment threads index.
    """

    model = CommentThread

    # Typesense only returns up to 250 hits per page, so the requested page size is capped at this value.
    MAX_PER_PAGE = 250

    @staticmethod
    def name() -> str:
        """
        Return the Typesense index name for the index.

        The name is the configured ``TYPESENSE_COLLECTION_PREFIX`` followed by "comment_threads".
        """
        return settings.TYPESENSE_COLLECTION_PREFIX + "comment_threads"

    @classmethod
    def schema(cls) -> CollectionCreateSchema:
        """
        Return the Typesense collection schema for this index.
        """
        return {
            "name": cls.name(),
            "fields": [
                {"name": "course_id", "type": "string"},
                {"name": "commentable_id", "type": "string"},
                {"name": "context", "type": "string"},
                {"name": "title", "type": "string"},
                {"name": "body", "type": "string"},
            ],
        }

    @staticmethod
    def build_document(doc_id: str | int, data: dict[str, Any]) -> DocumentSchema:
        """
        Build a Typesense document for this index.

        :param doc_id: Identifier of the thread; stored as a string in the "id" field.
        :param data: Mapping that may hold "course_id", "commentable_id", "context", "title" and "body".
            Missing keys are indexed as empty strings.
        :return: The document to send to Typesense, with HTML markup stripped from the body.
        """
        return {
            "id": str(doc_id),
            "course_id": str(data.get("course_id", "")),
            "commentable_id": str(data.get("commentable_id", "")),
            "context": str(data.get("context", "")),
            "title": str(data.get("title", "")),
            "body": (
                BeautifulSoup(data["body"], features="html.parser").get_text()
                if data.get("body")
                else ""
            ),
        }

    @staticmethod
    def build_search_parameters(
        *,
        search_text: str,
        course_id: str | None,
        context: str,
        commentable_ids: list[str] | None,
    ) -> SearchParameters:
        """
        Build Typesense search parameters for this index.

        :param search_text: Text to look for in the thread titles and bodies.
        :param course_id: When given, restrict the results to this exact course id.
        :param context: Thread context to filter on; always applied.
        :param commentable_ids: When non-empty, restrict the results to threads whose
            commentable id is one of these values.
        :return: Search parameters; the page size never exceeds ``MAX_PER_PAGE``.
        """
        # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy.
        filters = [f"context:{quote_filter_value(context)}"]
        if commentable_ids:
            safe_ids = ", ".join(quote_filter_value(value) for value in commentable_ids)
            # The filter must name the field exactly as the schema declares it: "commentable_id".
            filters.append(f"commentable_id:[{safe_ids}]")
        if course_id:
            filters.append(f"course_id:={quote_filter_value(course_id)}")

        return {
            "q": search_text,
            "query_by": "title,body",
            "filter_by": " && ".join(filters),
            # Cap the page size: Typesense rejects requests for more than 250 hits per page.
            "per_page": min(
                FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT, CommentThreadsIndex.MAX_PER_PAGE
            ),
        }
171+
172+
173+
# Registry of the available indexes, keyed by the index name that callers pass to the backends.
INDICES: dict[str, type[CommentsIndex] | type[CommentThreadsIndex]] = dict(
    comments=CommentsIndex,
    comment_threads=CommentThreadsIndex,
)
177+
178+
179+
class TypesenseDocumentBackend(BaseDocumentSearchBackend):
    """
    Typesense implementation of the per-document search operations.
    """

    def index_document(
        self, index_name: str, doc_id: str | int, document: dict[str, Any]
    ) -> None:
        """
        Write a single document to the Typesense collection behind `index_name`.

        The document is upserted, so an existing document with the same id is replaced.
        """
        typesense = get_typesense_client()
        target_index = INDICES[index_name]
        payload = target_index.build_document(doc_id, document)
        collection = typesense.collections[target_index.name()]
        collection.documents.upsert(payload)

    def update_document(
        self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
    ) -> None:
        """
        Update a document; delegates to index_document, which upserts.
        """
        self.index_document(index_name, doc_id, update_data)

    def delete_document(self, index_name: str, doc_id: str | int) -> None:
        """
        Remove a document from the Typesense collection behind `index_name`.

        The request asks Typesense to ignore documents that are not found.
        """
        typesense = get_typesense_client()
        target_index = INDICES[index_name]
        collection = typesense.collections[target_index.name()]
        collection.documents[str(doc_id)].delete(
            delete_parameters={"ignore_not_found": True},
        )
212+
213+
214+
class TypesenseIndexBackend(BaseIndexSearchBackend):
    """
    Manage indexes for the Typesense backend.

    Typesense calls these "collections". https://typesense.org/docs/29.0/api/collections.html
    """

    def initialize_indices(self, force_new_index: bool = False) -> None:
        """
        Initialize the indices in Typesense.

        Collections that already exist are left untouched, so this method can be called repeatedly.
        If force_new_index is True, the indexes will be dropped before being recreated.
        """
        # Imported here so that this method carries its own dependency on the client's exception types.
        # pylint: disable-next=import-outside-toplevel
        from typesense.exceptions import ObjectAlreadyExists, ObjectNotFound

        client = get_typesense_client()
        for index in INDICES.values():
            if force_new_index:
                try:
                    client.collections[index.name()].delete()
                except ObjectNotFound:
                    # Nothing to drop: the collection has not been created yet.
                    pass
            try:
                client.collections.create(index.schema())
            except ObjectAlreadyExists:
                # The collection is already present; keep it and its documents as they are.
                pass

    def rebuild_indices(
        self, batch_size: int = 500, extra_catchup_minutes: int = 5
    ) -> None:
        """
        Parse model instances and insert them in Typesense.

        Only MySQL-backed instances are supported.
        Note that the `extra_catchup_minutes` argument is ignored.

        :param batch_size: Number of model instances sent to Typesense per import request.
        :param extra_catchup_minutes: Unused; accepted for interface compatibility.
        """
        client = get_typesense_client()
        # TODO: Do we want to force recreate the indices here perhaps?
        # Does 'rebuild' here imply we would also want to drop documents that are no longer present in the database?
        self.initialize_indices()
        for index in INDICES.values():
            # Paginating needs a stable ordering; without one, pages may overlap or skip rows.
            queryset = index.model.objects.all().order_by("pk")
            paginator = Paginator(queryset, per_page=batch_size)
            for page_number in paginator.page_range:
                page = paginator.get_page(page_number)
                documents = [
                    index.build_document(obj.pk, obj.doc_to_hash())
                    for obj in page.object_list
                ]
                if documents:
                    # NOTE(review): the per-document import results are not inspected here,
                    # so individual failures would go unnoticed -- confirm whether that is acceptable.
                    client.collections[index.name()].documents.import_(
                        documents, {"action": "upsert"}
                    )

    def validate_indices(self) -> None:
        """
        Check if the indices exist and are valid.

        Raise an exception if any do not exist or if any are not valid.
        Not implemented yet: this currently always raises NotImplementedError.
        """
        # TODO: check if each index exists. Can we also check the schema matches too?
        raise NotImplementedError

    def refresh_indices(self) -> None:
        """
        Noop on Typesense, as all write API operations are synchronous.

        See https://typesense.org/docs/guide/migrating-from-algolia.html#synchronous-write-apis for more information.
        """
        return None

    def delete_unused_indices(self) -> int:
        """
        Noop on Typesense.

        :return: Always 0, as nothing is deleted.
        """
        return 0
282+
283+
284+
def quote_filter_value(value: str) -> str:
    """
    Wrap a value in backticks so it can be embedded in a Typesense filter.

    Any backticks inside the value are dropped first, so the value cannot end the quoting early.

    https://typesense.org/docs/guide/tips-for-filtering.html#escaping-special-characters
    """
    # TODO: It may be possible to escape backticks, rather than removing them.
    # It also may be possible that there are issues with the backtick method.
    # See https://github.com/typesense/typesense/issues/196
    without_backticks = "".join(value.split("`"))
    return f"`{without_backticks}`"
294+
295+
296+
class TypesenseThreadSearchBackend(BaseThreadSearchBackend):
    """
    Typesense implementation of the thread search operations.
    """

    def get_thread_ids(
        self,
        context: str,
        # This argument is unsupported. Anyway, its only role was to boost some results,
        # which did not have much effect because they are shuffled anyway downstream.
        group_ids: list[int],
        search_text: str,
        # This parameter is unsupported, but as far as we know it's not used anywhere.
        sort_criteria: Optional[list[dict[str, str]]] = None,
        commentable_ids: Optional[list[str]] = None,
        course_id: Optional[str] = None,
    ) -> list[str]:
        """
        Return the ids of the threads matching the search criteria.

        Threads are matched directly on their own title and body, and also through their comments
        when the search is for the "course" context without commentable ids.
        The `group_ids` and `sort_criteria` arguments are ignored.
        """
        typesense = get_typesense_client()
        matching_ids: set[str] = set()

        # All comments have "course" as their context, and none of them have a commentable_id.
        if context == "course" and not commentable_ids:
            comment_documents = typesense.collections[CommentsIndex.name()].documents
            comment_response = comment_documents.search(
                CommentsIndex.build_search_parameters(
                    search_text=search_text, course_id=course_id
                )
            )
            # A matching comment contributes the id of the thread it belongs to.
            matching_ids.update(
                hit["document"]["comment_thread_id"]
                for hit in comment_response.get("hits", [])
            )

        thread_documents = typesense.collections[CommentThreadsIndex.name()].documents
        thread_response = thread_documents.search(
            CommentThreadsIndex.build_search_parameters(
                search_text=search_text,
                course_id=course_id,
                context=context,
                commentable_ids=commentable_ids,
            )
        )
        matching_ids.update(
            hit["document"]["id"] for hit in thread_response.get("hits", [])
        )

        return list(matching_ids)

    def get_suggested_text(self, search_text: str) -> Optional[str]:
        """
        Retrieve text suggestions for a given search query.

        Not implemented yet: this currently always raises NotImplementedError.

        :param search_text: Text to search for suggestions
        :return: Suggested text or None
        """
        # TODO: https://typesense.org/docs/guide/query-suggestions.html
        # TODO: if this is implemented, do we need to also implement get_thread_ids_with_corrected_text?
        raise NotImplementedError
354+
355+
356+
class TypesenseBackend(BaseSearchBackend):
    """
    Search backend powered by Typesense.

    Bundles the Typesense implementations of the thread search, index management
    and document operations.
    """

    # Thread id lookups.
    THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend
    # Collection (index) management.
    INDEX_SEARCH_CLASS = TypesenseIndexBackend
    # Single-document indexing, updating and deletion.
    DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend

forum/settings/common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ def plugin_settings(settings: Any) -> None:
1616
"FORUM_SEARCH_BACKEND",
1717
"forum.search.meilisearch.MeilisearchBackend",
1818
)
19+
elif getattr(settings, "TYPESENSE_ENABLED", False):
20+
settings.FORUM_SEARCH_BACKEND = getattr(
21+
settings,
22+
"FORUM_SEARCH_BACKEND",
23+
"forum.search.typesense.TypesenseBackend",
24+
)
1925
else:
2026
settings.FORUM_SEARCH_BACKEND = getattr(
2127
settings, "FORUM_SEARCH_BACKEND", "forum.search.es.ElasticsearchBackend"

requirements/base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ requests
1111
pymongo
1212
elasticsearch
1313
edx-search # meilisearch backend
14+
typesense
1415
mysqlclient

0 commit comments

Comments
 (0)