|
2 | 2 | Typesense backend for searching comments and threads. |
3 | 3 | """ |
4 | 4 |
|
5 | | -from typing import Any, Optional |
| 5 | +from typing import Any, Optional, cast |
6 | 6 |
|
7 | 7 | from bs4 import BeautifulSoup |
8 | 8 | from django.conf import settings |
9 | 9 | from django.core.paginator import Paginator |
10 | 10 |
|
11 | | -# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"' |
12 | | -from typesense import Client # type: ignore |
| 11 | +from typesense.client import Client |
13 | 12 | from typesense.types.collection import CollectionCreateSchema |
14 | 13 | from typesense.types.document import DocumentSchema, SearchParameters |
15 | 14 | from typesense.exceptions import ObjectNotFound |
@@ -79,75 +78,44 @@ def expected_full_collection_schema() -> dict[str, Any]: |
79 | 78 | What is expected to be the full collection schema. |
80 | 79 |
|
81 | 80 | Use this to validate the actual schema from the server. |
| 81 | + Note that Typesense may add new keys to the schema; |
| 82 | + this is ok, and validation should still pass. |
82 | 83 | """ |
| 84 | + field_defaults = { |
| 85 | + "facet": False, |
| 86 | + "index": True, |
| 87 | + "infix": False, |
| 88 | + "locale": "", |
| 89 | + "optional": False, |
| 90 | + "sort": False, |
| 91 | + "stem": False, |
| 92 | + "stem_dictionary": "", |
| 93 | + "store": True, |
| 94 | + "type": "string", |
| 95 | + } |
83 | 96 | return { |
84 | 97 | "default_sorting_field": "", |
85 | 98 | "enable_nested_fields": False, |
86 | 99 | "fields": [ |
87 | 100 | { |
88 | | - "facet": False, |
89 | | - "index": True, |
90 | | - "infix": False, |
91 | | - "locale": "", |
| 101 | + **field_defaults, |
92 | 102 | "name": "thread_id", |
93 | | - "optional": False, |
94 | | - "sort": False, |
95 | | - "stem": False, |
96 | | - "stem_dictionary": "", |
97 | | - "store": True, |
98 | | - "type": "string", |
99 | 103 | }, |
100 | 104 | { |
101 | | - "facet": False, |
102 | | - "index": True, |
103 | | - "infix": False, |
104 | | - "locale": "", |
| 105 | + **field_defaults, |
105 | 106 | "name": "course_id", |
106 | | - "optional": False, |
107 | | - "sort": False, |
108 | | - "stem": False, |
109 | | - "stem_dictionary": "", |
110 | | - "store": True, |
111 | | - "type": "string", |
112 | 107 | }, |
113 | 108 | { |
114 | | - "facet": False, |
115 | | - "index": True, |
116 | | - "infix": False, |
117 | | - "locale": "", |
| 109 | + **field_defaults, |
118 | 110 | "name": "commentable_id", |
119 | | - "optional": False, |
120 | | - "sort": False, |
121 | | - "stem": False, |
122 | | - "stem_dictionary": "", |
123 | | - "store": True, |
124 | | - "type": "string", |
125 | 111 | }, |
126 | 112 | { |
127 | | - "facet": False, |
128 | | - "index": True, |
129 | | - "infix": False, |
130 | | - "locale": "", |
| 113 | + **field_defaults, |
131 | 114 | "name": "context", |
132 | | - "optional": False, |
133 | | - "sort": False, |
134 | | - "stem": False, |
135 | | - "stem_dictionary": "", |
136 | | - "store": True, |
137 | | - "type": "string", |
138 | 115 | }, |
139 | 116 | { |
140 | | - "facet": False, |
141 | | - "index": True, |
142 | | - "infix": False, |
143 | | - "locale": "", |
| 117 | + **field_defaults, |
144 | 118 | "name": "text", |
145 | | - "optional": False, |
146 | | - "sort": False, |
147 | | - "stem": False, |
148 | | - "stem_dictionary": "", |
149 | | - "store": True, |
150 | | - "type": "string", |
151 | 119 | }, |
152 | 120 | ], |
153 | 121 | "name": collection_name(), |
@@ -206,7 +174,9 @@ def build_search_parameters( |
206 | 174 | """ |
207 | 175 | Build Typesense search parameters for searching the index. |
208 | 176 | """ |
209 | | - # Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy. |
| 177 | + # `context` is always a single word, |
| 178 | + # so we can gain performance without losing accuracy by using the faster `:` (non-exact) operator. |
| 179 | + # See https://typesense.org/docs/29.0/api/search.html#filter-parameters for more information. |
210 | 180 | filters = [f"context:{quote_filter_value(context)}"] |
211 | 181 |
|
212 | 182 | if commentable_ids: |
@@ -340,18 +310,66 @@ def validate_indices(self) -> None: |
340 | 310 | Check if the indices exist and are valid. |
341 | 311 |
|
342 | 312 | Raise an exception if any do not exist or if any are not valid. |
| 313 | + Note that the validation is lengthy, |
| 314 | + because Typesense may add new keys to the schema. |
| 315 | + This is fine - we only want to assert that keys we know about are set as expected. |
| 316 | + There are also some fields in the retrieved schema we don't care about - eg. 'created_at' |
343 | 317 | """ |
344 | 318 | client = get_typesense_client() |
345 | | - collection = client.collections[collection_name()].retrieve() |
346 | | - collection.pop("created_at") # type: ignore |
347 | | - collection.pop("num_documents") # type: ignore |
348 | | - if collection != expected_full_collection_schema(): |
349 | | - print(f"Expected schema: {expected_full_collection_schema()}") |
350 | | - print(f"Found schema: {collection}") |
351 | | - raise AssertionError( |
352 | | - f"Collection {collection_name()} exists, but schema does not match expected." |
| 319 | + # cast to a wider type, because we want to use it in a more flexible way than TypedDict normally allows. |
| 320 | + actual_schema = cast( |
| 321 | + dict[str, Any], client.collections[collection_name()].retrieve() |
| 322 | + ) |
| 323 | + expected_schema = expected_full_collection_schema() |
| 324 | + errors: list[str] = [] |
| 325 | + |
| 326 | + expected_field_names = set( |
| 327 | + map(lambda field: field["name"], expected_schema["fields"]) |
| 328 | + ) |
| 329 | + actual_field_names = set( |
| 330 | + map(lambda field: field["name"], actual_schema["fields"]) |
| 331 | + ) |
| 332 | + |
| 333 | + if missing_fields := expected_field_names - actual_field_names: |
| 334 | + errors.append( |
| 335 | + f"ERROR: '{collection_name()}' collection schema 'fields' has missing field(s): {missing_fields}." |
353 | 336 | ) |
354 | 337 |
|
| 338 | + if extra_fields := actual_field_names - expected_field_names: |
| 339 | + errors.append( |
| 340 | + f"ERROR: '{collection_name()}' collection schema 'fields' " |
| 341 | + f"has unexpected extra field(s): {extra_fields}." |
| 342 | + ) |
| 343 | + |
| 344 | + if actual_field_names == expected_field_names: |
| 345 | + for expected_field, actual_field in zip( |
| 346 | + sorted(expected_schema["fields"], key=lambda field: field["name"]), |
| 347 | + sorted(actual_schema["fields"], key=lambda field: field["name"]), |
| 348 | + ): |
| 349 | + for key, expected_value in expected_field.items(): |
| 350 | + if expected_value != actual_field[key]: |
| 351 | + errors.append( |
| 352 | + f"ERROR: in collection '{collection_name()}' fields, field '{expected_field['name']}', " |
| 353 | + f"key '{key}' failed to validate. " |
| 354 | + f"Expected: '{expected_value}', actual '{actual_field[key]}'." |
| 355 | + ) |
| 356 | + |
| 357 | + for key, expected_value in expected_schema.items(): |
| 358 | + if key == "fields": |
| 359 | + # we've already validated fields separately above |
| 360 | + continue |
| 361 | + |
| 362 | + if expected_value != actual_schema[key]: |
| 363 | + errors.append( |
| 364 | + f"ERROR: in collection '{collection_name()}', key '{key}' failed to validate. " |
| 365 | + f"Expected: '{expected_value}', actual '{actual_schema[key]}'." |
| 366 | + ) |
| 367 | + |
| 368 | + if errors: |
| 369 | + for error in errors: |
| 370 | + print(error) |
| 371 | + raise AssertionError("\n".join(errors)) |
| 372 | + |
355 | 373 | def refresh_indices(self) -> None: |
356 | 374 | """ |
357 | 375 | Noop on Typesense, as all write API operations are synchronous. |
|
0 commit comments