|
"""
Meilisearch backend for searching comment and thread objects.
"""
| 4 | + |
import typing as t

import meilisearch
import search.meilisearch as m
from bs4 import BeautifulSoup
from django.core.paginator import Paginator

from forum import constants
from forum.backends.mysql import MODEL_INDICES
from forum.search import base
| 13 | + |
# Fields that search queries may filter on (used when indices are initialized).
FILTERABLE_FIELDS = ["context", "course_id", "commentable_id"]

# Remaining fields copied into each indexed document.
INDEXED_FIELDS = ["body", "title", "comment_thread_id"]

# Every field that `create_document` copies from a source document.
ALL_FIELDS = [*FILTERABLE_FIELDS, *INDEXED_FIELDS]
| 21 | + |
| 22 | + |
class MeilisearchClientMixin:
    """
    Expose a lazily-created Meilisearch client that is shared between instances.

    The client is cached on the mixin class itself, so every backend object that
    inherits from this mixin reuses the same client once it has been created.
    """

    # Shared client cache; stays `None` until `meilisearch_client` is first read.
    CLIENT: meilisearch.Client | None = None

    @property
    def meilisearch_client(self) -> meilisearch.Client:
        """
        Return the shared Meilisearch client, creating it on first access.
        """
        # Reading through `self` keeps honouring a client preset on an instance or
        # on a subclass.
        client = self.CLIENT
        if client is None:
            # Store on the mixin class rather than on `self`: assigning to
            # `self.CLIENT` would only create an instance attribute, and every new
            # backend object would then build its own client.
            client = MeilisearchClientMixin.CLIENT = m.get_meilisearch_client()
        return client

    def get_index(self, index_name: str) -> meilisearch.index.Index:
        """
        Get the Meilisearch index associated to a (non-prefixed) name. This index
        should already exist.
        """
        # Translate the bare name into the actual Meilisearch index name.
        meilisearch_index_name = m.get_meilisearch_index_name(index_name)
        return self.meilisearch_client.get_index(meilisearch_index_name)
| 43 | + |
| 44 | + |
def create_document(document: dict[str, t.Any], doc_id: str) -> dict[str, t.Any]:
    """
    Build the reduced document that is sent to Meilisearch.

    The result always carries the original ID under "id" and the derived primary
    key under `m.PRIMARY_KEY_FIELD_NAME`. On top of that, only the fields listed in
    `ALL_FIELDS` that are present in `document` are copied over.
    """
    slim = {"id": doc_id, m.PRIMARY_KEY_FIELD_NAME: m.id2pk(doc_id)}
    slim.update({name: document[name] for name in ALL_FIELDS if name in document})

    # Strip html markup from the body, because it breaks search in some places:
    # "<p>Word" would not match "Word".
    raw_body = slim.get("body")
    if raw_body:
        slim["body"] = BeautifulSoup(raw_body, features="html.parser").get_text()
    return slim
| 58 | + |
| 59 | + |
class TypesenseDocumentBackend(
    base.BaseDocumentSearchBackend, MeilisearchClientMixin
):
    """
    Simple document management: insert, update and delete single documents.

    Despite the "Typesense" prefix in its name, this class talks to Meilisearch.
    """

    def index_document(
        self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
    ) -> None:
        """
        Insert a single document in the Meilisearch index.

        The document is first reduced by `create_document`. If a document with the
        same primary key already exists, Meilisearch replaces it entirely.
        """
        meilisearch_index = self.get_index(index_name)
        processed = create_document(document, str(doc_id))
        meilisearch_index.add_documents([processed])

    def update_document(
        self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
    ) -> None:
        """
        Update a single document with the fields found in `update_data`.

        `update_data` may hold only a subset of the indexed fields. We use
        Meilisearch's "add or update" operation so that fields absent from
        `update_data` keep their stored value. Going through `add_documents` would
        replace the whole document and silently drop them. When the document does
        not exist yet, it is created.
        """
        meilisearch_index = self.get_index(index_name)
        processed = create_document(update_data, str(doc_id))
        meilisearch_index.update_documents([processed])

    def delete_document(self, index_name: str, doc_id: str | int) -> None:
        """
        Delete a single document, identified by its ID.
        """
        meilisearch_index = self.get_index(index_name)
        # Documents are keyed by the primary key derived from the ID.
        doc_pk = m.id2pk(str(doc_id))
        meilisearch_index.delete_document(doc_pk)
| 92 | + |
| 93 | + |
class TypesenseIndexBackend(base.BaseIndexSearchBackend, MeilisearchClientMixin):
    """
    Meilisearch index management.

    Despite the "Typesense" prefix in its name, this class talks to Meilisearch.
    """

    def initialize_indices(self, force_new_index: bool = False) -> None:
        """
        Create the index of every model in `MODEL_INDICES`.

        Each index is declared with the primary key field and `FILTERABLE_FIELDS`
        as filterable fields. When `force_new_index` is true, every index is first
        deleted, and we wait for the deletion task before creating anything.
        """
        filterable_fields = [m.PRIMARY_KEY_FIELD_NAME] + FILTERABLE_FIELDS
        index_filterables = {
            Model.index_name: filterable_fields for Model in MODEL_INDICES
        }
        if force_new_index:
            for index_name in index_filterables:
                meilisearch_index_name = m.get_meilisearch_index_name(index_name)
                task_info = self.meilisearch_client.delete_index(meilisearch_index_name)
                m.wait_for_task_to_succeed(self.meilisearch_client, task_info)
        m.create_indexes(index_filterables=index_filterables)

    def rebuild_indices(
        self, batch_size: int = 500, extra_catchup_minutes: int = 5
    ) -> None:
        """
        Parse model instances and insert them in Meilisearch. Only MySQL-backed
        instances are supported.

        Instances are sent in batches of `batch_size` documents.

        Note that the `extra_catchup_minutes` argument is ignored.
        """
        self.initialize_indices()
        for Model in MODEL_INDICES:
            meilisearch_index = self.get_index(Model.index_name)
            # Paginate over an explicitly ordered queryset: without a stable order,
            # consecutive pages may overlap or skip rows, and some documents would
            # be indexed twice or not at all.
            paginator = Paginator(Model.objects.order_by("pk"), per_page=batch_size)
            for page_number in paginator.page_range:
                page = paginator.get_page(page_number)
                documents = [
                    create_document(obj.doc_to_hash(), str(obj.id))
                    for obj in page.object_list
                ]
                if documents:
                    meilisearch_index.add_documents(documents)

    def delete_unused_indices(self) -> int:
        """
        This is a no-op, because this search backend does not handle indices like
        Elasticsearch.
        """
        return 0

    def refresh_indices(self) -> None:
        """
        In Meilisearch, this command consists of waiting for pending tasks.

        Every task reported as "enqueued" or "processing" is waited for, with a
        5 second timeout each. A `RuntimeError` is raised as soon as one of them
        ends in a status other than "succeeded".
        """
        # NOTE(review): only the tasks returned by this single `get_tasks` call are
        # inspected -- confirm whether the task list can be paginated.
        for enqueued_task in self.meilisearch_client.get_tasks(
            {"statuses": ["enqueued", "processing"]}
        ).results:
            task = self.meilisearch_client.wait_for_task(
                enqueued_task.uid, timeout_in_ms=5000
            )
            if task.status != "succeeded":
                raise RuntimeError(f"Failed meilisearch task: {task}")

    def validate_indices(self) -> None:
        """
        Initialization is in charge of defining filterable fields, so all validation
        is done there.
        """
        self.initialize_indices(force_new_index=False)
| 159 | + |
| 160 | + |
class TypesenseThreadSearchBackend(
    base.BaseThreadSearchBackend, MeilisearchClientMixin
):
    """
    Thread search backend.

    This class is much simpler than its ES equivalent: it supports neither text
    suggestion nor some of the search parameters (which have little effect anyway).
    """

    def get_thread_ids(
        self,
        context: str,
        # Unsupported argument. Its only role was to boost some results, which had
        # little effect because results are shuffled downstream anyway.
        group_ids: list[int],
        search_text: str,
        # Unsupported parameter; as far as we know it is not used anywhere.
        sort_criteria: t.Optional[list[dict[str, str]]] = None,
        commentable_ids: t.Optional[list[str]] = None,
        course_id: t.Optional[str] = None,
    ) -> list[str]:
        """
        Return the IDs of the threads matching `search_text`, in no particular order.

        Results are always filtered by `context`, and also by `course_id` and
        `commentable_ids` when these are provided. Every index listed in
        `MODEL_INDICES` is queried, and the text is matched against the "title" and
        "body" attributes.
        """
        # Filters applied to every query.
        filters: dict[str, t.Any] = {"context": context}
        if course_id:
            filters["course_id"] = course_id
        if commentable_ids:
            filters["commentable_id"] = commentable_ids

        params = m.get_search_params(
            field_dictionary=filters,
            size=constants.FORUM_MAX_DEEP_SEARCH_COMMENT_COUNT,
        )
        params["attributesToSearchOn"] = ["title", "body"]

        # Sorting by score would be pointless, because results are shuffled
        # downstream. A set is enough, and it also removes duplicates.
        found: set[str] = set()
        for Model in MODEL_INDICES:
            index = self.get_index(Model.index_name)
            hits = index.search(search_text, opt_params=params)["hits"]
            # A hit with a "comment_thread_id" points to its thread; otherwise the
            # hit's own ID is used.
            found.update(hit.get("comment_thread_id") or hit["id"] for hit in hits)

        return list(found)

    def get_suggested_text(self, search_text: str) -> t.Optional[str]:
        """
        Always return None: Meilisearch does not support query suggestion.
        https://github.com/orgs/meilisearch/discussions/740
        """
        return None
| 221 | + |
| 222 | + |
class TypesenseBackend(base.BaseSearchBackend):
    """
    Meilisearch-powered search backend.

    Ties together the document, index and thread-search classes of this module.
    """

    # Single-document insert/update/delete.
    DOCUMENT_SEARCH_CLASS = TypesenseDocumentBackend
    # Index creation, rebuild and validation.
    INDEX_SEARCH_CLASS = TypesenseIndexBackend
    # Thread ID lookup from a search text.
    THREAD_SEARCH_CLASS = TypesenseThreadSearchBackend
0 commit comments