Skip to content

Commit afbb4a7

Browse files
committed
wip: feat: add Typesense backend for search
- run `make upgrade` - this may need to be reverted Private-ref: https://tasks.opencraft.com/browse/BB-9975
1 parent 51ec507 commit afbb4a7

File tree

11 files changed

+1203
-510
lines changed

11 files changed

+1203
-510
lines changed

forum/search/typesense.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
"""
2+
Typesense backend for search comment and thread objects.
3+
"""
4+
5+
import typing as t

import meilisearch
import search.meilisearch as m
from bs4 import BeautifulSoup
from django.core.paginator import Paginator

from forum import constants
from forum.backends.mysql import MODEL_INDICES
from forum.search import base
13+
14+
FILTERABLE_FIELDS = [
15+
"context",
16+
"course_id",
17+
"commentable_id",
18+
]
19+
INDEXED_FIELDS = ["body", "title", "comment_thread_id"]
20+
ALL_FIELDS = FILTERABLE_FIELDS + INDEXED_FIELDS
21+
22+
23+
class MeilisearchClientMixin:
    """
    Give search backends access to one shared Meilisearch client.

    The client is created lazily on first access and cached on this mixin class, so
    that all backend instances reuse the same object.
    """

    # Cache for the shared client; stays None until the first access.
    CLIENT: meilisearch.Client | None = None

    @property
    def meilisearch_client(self) -> meilisearch.Client:
        """
        Return the shared Meilisearch client, creating it on first access.
        """
        # Read through `self` so that an explicit instance/subclass override of
        # CLIENT is still honoured.
        client = self.CLIENT
        if client is None:
            client = m.get_meilisearch_client()
            # Store on the class, not on `self`: assigning through the instance
            # would only shadow the class attribute, and every new backend instance
            # would then build its own client instead of sharing one.
            MeilisearchClientMixin.CLIENT = client
        return client

    def get_index(self, index_name: str) -> meilisearch.index.Index:
        """
        Get the Meilisearch index associated to a (non-prefixed) name. This index should
        already exist.
        """
        prefixed_name = m.get_meilisearch_index_name(index_name)
        return self.meilisearch_client.get_index(prefixed_name)
43+
44+
45+
def create_document(document: dict[str, t.Any], doc_id: str) -> dict[str, t.Any]:
    """
    Build the small document that gets indexed in Meilisearch.

    The result always carries "id" and the primary key field derived from `doc_id`,
    plus whichever of ALL_FIELDS are present in `document`. The "body" field, when
    non-empty, is reduced to its plain text content.
    """
    indexed: dict[str, t.Any] = {
        "id": doc_id,
        m.PRIMARY_KEY_FIELD_NAME: m.id2pk(doc_id),
    }
    indexed.update({name: document[name] for name in ALL_FIELDS if name in document})

    # HTML markup breaks search in some places: for instance "<p>Word" will not
    # match "Word". So we only keep the text content of the body.
    raw_body = indexed.get("body")
    if raw_body:
        indexed["body"] = BeautifulSoup(raw_body, features="html.parser").get_text()
    return indexed
58+
59+
60+
class TypesenseDocumentBackend(
    base.BaseDocumentSearchBackend, MeilisearchClientMixin
):
    """
    Single-document operations: insert, update and delete.

    Despite its name, this class currently talks to Meilisearch through
    MeilisearchClientMixin.
    """

    def index_document(
        self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
    ) -> None:
        """
        Insert a single document in the Meilisearch index.

        The document is first reduced to its indexed fields by `create_document`.
        """
        index = self.get_index(index_name)
        index.add_documents([create_document(document, str(doc_id))])

    def update_document(
        self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
    ) -> None:
        """
        Update a document by re-inserting it: this simply delegates to
        `index_document`.
        """
        # NOTE(review): if callers pass only the changed fields in `update_data`,
        # re-inserting may drop the fields that were not passed -- confirm against
        # callers and the Meilisearch `add_documents` semantics.
        return self.index_document(index_name, doc_id, update_data)

    def delete_document(self, index_name: str, doc_id: str | int) -> None:
        """
        Delete a single document, identified by its ID.

        The ID is converted to the index primary key before deletion.
        """
        self.get_index(index_name).delete_document(m.id2pk(str(doc_id)))
92+
93+
94+
class TypesenseIndexBackend(base.BaseIndexSearchBackend, MeilisearchClientMixin):
    """
    Meilisearch index management.
    """

    def initialize_indices(self, force_new_index: bool = False) -> None:
        """
        Create the index of every model in MODEL_INDICES, with its filterable fields.

        When `force_new_index` is true, each index is deleted first, and we wait for
        the deletion task before moving on to the creation step.
        """
        filterable_fields = [m.PRIMARY_KEY_FIELD_NAME] + FILTERABLE_FIELDS
        index_filterables = {
            Model.index_name: filterable_fields for Model in MODEL_INDICES
        }
        if force_new_index:
            for index_name in index_filterables:
                meilisearch_index_name = m.get_meilisearch_index_name(index_name)
                task_info = self.meilisearch_client.delete_index(meilisearch_index_name)
                m.wait_for_task_to_succeed(self.meilisearch_client, task_info)
        m.create_indexes(index_filterables=index_filterables)

    def rebuild_indices(
        self, batch_size: int = 500, extra_catchup_minutes: int = 5
    ) -> None:
        """
        Parse model instances and insert them in Meilisearch. Only MySQL-backed
        instances are supported.

        Documents are sent in batches of `batch_size` objects.

        Note that the `extra_catchup_minutes` argument is ignored.
        """
        self.initialize_indices()
        for Model in MODEL_INDICES:
            meilisearch_index = self.get_index(Model.index_name)
            # Paginating an unordered queryset is not reliable: the database is free
            # to return rows in a different order for each page, which may skip or
            # duplicate objects. Ordering by primary key makes pages deterministic.
            queryset = Model.objects.all().order_by("pk")
            paginator = Paginator(queryset, per_page=batch_size)
            for page_number in paginator.page_range:
                page = paginator.get_page(page_number)
                documents = [
                    create_document(obj.doc_to_hash(), str(obj.id))
                    for obj in page.object_list
                ]
                if documents:
                    meilisearch_index.add_documents(documents)

    def delete_unused_indices(self) -> int:
        """
        This is a no-op, because this search backend does not handle indices like
        Elasticsearch.
        """
        return 0

    def refresh_indices(self) -> None:
        """
        In Meilisearch, this command consists of waiting for pending tasks.

        Raises RuntimeError as soon as a task ends in a status other than
        "succeeded".
        """
        # NOTE(review): only the tasks returned by this single `get_tasks` call are
        # awaited; if the API paginates its results, later pages are not covered --
        # confirm against the client documentation.
        pending_tasks = self.meilisearch_client.get_tasks(
            {"statuses": ["enqueued", "processing"]}
        ).results
        for enqueued_task in pending_tasks:
            task = self.meilisearch_client.wait_for_task(
                enqueued_task.uid, timeout_in_ms=5000
            )
            if task.status != "succeeded":
                raise RuntimeError(f"Failed meilisearch task: {task}")

    def validate_indices(self) -> None:
        """
        Initialization is in charge of defining filterable fields, so all validation is
        done there.
        """
        self.initialize_indices(force_new_index=False)
159+
160+
161+
class TypesenseThreadSearchBackend(
    base.BaseThreadSearchBackend, MeilisearchClientMixin
):
    """
    Thread search backend.

    This class is actually much simpler than its ES equivalent, because it does not
    support text suggestion, nor some of the search parameters (which have little
    effect anyway).
    """

    def get_thread_ids(
        self,
        context: str,
        # This argument is unsupported. Anyway, its only role was to boost some results,
        # which did not have much effect because they are shuffled anyway downstream.
        group_ids: list[int],
        search_text: str,
        # This parameter is unsupported, but as far as we know it's not used anywhere.
        sort_criteria: t.Optional[list[dict[str, str]]] = None,
        commentable_ids: t.Optional[list[str]] = None,
        course_id: t.Optional[str] = None,
    ) -> list[str]:
        """
        Return the IDs of the threads matching `search_text`, in no particular order.

        Every index from MODEL_INDICES is queried on the "title" and "body"
        attributes. A hit contributes its "comment_thread_id" when it has a truthy
        one, and its own "id" otherwise. Results are filtered by `context`, and by
        `course_id` and `commentable_ids` when they are provided.
        """
        # Filters applied to every query
        filters: dict[str, t.Any] = {"context": context}
        if course_id:
            filters["course_id"] = course_id
        if commentable_ids:
            filters["commentable_id"] = commentable_ids

        query_options = m.get_search_params(
            size=constants.FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
            field_dictionary=filters,
        )
        query_options["attributesToSearchOn"] = ["title", "body"]

        # There is no point in sorting threads by score, because results are
        # shuffled downstream. A set is enough, and it removes duplicates across
        # indices.
        matched_ids: set[str] = set()
        for Model in MODEL_INDICES:
            index = self.get_index(Model.index_name)
            hits = index.search(search_text, opt_params=query_options)["hits"]
            matched_ids.update(
                hit.get("comment_thread_id") or hit["id"] for hit in hits
            )

        return list(matched_ids)

    def get_suggested_text(self, search_text: str) -> t.Optional[str]:
        """
        Always return None: Meilisearch does not support query suggestion.
        https://github.com/orgs/meilisearch/discussions/740
        """
        return None
221+
222+
223+
class TypesenseBackend(base.BaseSearchBackend):
    """
    Search backend entry point, which bundles the document, index and thread search
    classes defined in this module.

    Despite its name, the bundled classes are currently powered by Meilisearch.
    """

    # Single-document insert/update/delete
    DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend
    # Index creation, rebuild and validation
    INDEX_SEARCH_CLASS = TypesenseIndexBackend
    # Thread ID lookup from a text query
    THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend

forum/settings/common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ def plugin_settings(settings: Any) -> None:
1616
"FORUM_SEARCH_BACKEND",
1717
"forum.search.meilisearch.MeilisearchBackend",
1818
)
19+
elif getattr(settings, "TYPESENSE_ENABLED", False):
20+
settings.FORUM_SEARCH_BACKEND = getattr(
21+
settings,
22+
"FORUM_SEARCH_BACKEND",
23+
"forum.search.typesense.TypesenseBackend",
24+
)
1925
else:
2026
settings.FORUM_SEARCH_BACKEND = getattr(
2127
settings, "FORUM_SEARCH_BACKEND", "forum.search.es.ElasticsearchBackend"

requirements/base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ requests
1111
pymongo
1212
elasticsearch
1313
edx-search # meilisearch backend
14+
typesense
1415
mysqlclient

0 commit comments

Comments
 (0)