Skip to content

Commit 98a92fb

Browse files
committed
fix: improve validation and type checking
in response to review
1 parent d073dce commit 98a92fb

File tree

1 file changed

+80
-62
lines changed

1 file changed

+80
-62
lines changed

forum/search/typesense.py

Lines changed: 80 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22
Typesense backend for searching comments and threads.
33
"""
44

5-
from typing import Any, Optional
5+
from typing import Any, Optional, cast
66

77
from bs4 import BeautifulSoup
88
from django.conf import settings
99
from django.core.paginator import Paginator
1010

11-
# mypy complains: 'Module "typesense" does not explicitly export attribute "Client"'
12-
from typesense import Client # type: ignore
11+
from typesense.client import Client
1312
from typesense.types.collection import CollectionCreateSchema
1413
from typesense.types.document import DocumentSchema, SearchParameters
1514
from typesense.exceptions import ObjectNotFound
@@ -79,75 +78,44 @@ def expected_full_collection_schema() -> dict[str, Any]:
7978
What is expected to be the full collection schema.
8079
8180
Use this to validate the actual schema from the server.
81+
Note that Typesense may add new keys to the schema;
82+
this is ok, and validation should still pass.
8283
"""
84+
field_defaults = {
85+
"facet": False,
86+
"index": True,
87+
"infix": False,
88+
"locale": "",
89+
"optional": False,
90+
"sort": False,
91+
"stem": False,
92+
"stem_dictionary": "",
93+
"store": True,
94+
"type": "string",
95+
}
8396
return {
8497
"default_sorting_field": "",
8598
"enable_nested_fields": False,
8699
"fields": [
87100
{
88-
"facet": False,
89-
"index": True,
90-
"infix": False,
91-
"locale": "",
101+
**field_defaults,
92102
"name": "thread_id",
93-
"optional": False,
94-
"sort": False,
95-
"stem": False,
96-
"stem_dictionary": "",
97-
"store": True,
98-
"type": "string",
99103
},
100104
{
101-
"facet": False,
102-
"index": True,
103-
"infix": False,
104-
"locale": "",
105+
**field_defaults,
105106
"name": "course_id",
106-
"optional": False,
107-
"sort": False,
108-
"stem": False,
109-
"stem_dictionary": "",
110-
"store": True,
111-
"type": "string",
112107
},
113108
{
114-
"facet": False,
115-
"index": True,
116-
"infix": False,
117-
"locale": "",
109+
**field_defaults,
118110
"name": "commentable_id",
119-
"optional": False,
120-
"sort": False,
121-
"stem": False,
122-
"stem_dictionary": "",
123-
"store": True,
124-
"type": "string",
125111
},
126112
{
127-
"facet": False,
128-
"index": True,
129-
"infix": False,
130-
"locale": "",
113+
**field_defaults,
131114
"name": "context",
132-
"optional": False,
133-
"sort": False,
134-
"stem": False,
135-
"stem_dictionary": "",
136-
"store": True,
137-
"type": "string",
138115
},
139116
{
140-
"facet": False,
141-
"index": True,
142-
"infix": False,
143-
"locale": "",
117+
**field_defaults,
144118
"name": "text",
145-
"optional": False,
146-
"sort": False,
147-
"stem": False,
148-
"stem_dictionary": "",
149-
"store": True,
150-
"type": "string",
151119
},
152120
],
153121
"name": collection_name(),
@@ -206,7 +174,9 @@ def build_search_parameters(
206174
"""
207175
Build Typesense search parameters for searching the index.
208176
"""
209-
# Context is always a single word, so we can use the faster `:` operator, without sacrificing accuracy.
177+
# `context` is always a single word,
178+
# so we can gain performance without losing accuracy by using the faster `:` (non-exact) operator.
179+
# See https://typesense.org/docs/29.0/api/search.html#filter-parameters for more information.
210180
filters = [f"context:{quote_filter_value(context)}"]
211181

212182
if commentable_ids:
@@ -340,18 +310,66 @@ def validate_indices(self) -> None:
340310
Check if the indices exist and are valid.
341311
342312
Raise an exception if any do not exist or if any are not valid.
313+
Note that the validation is lengthy,
314+
because Typesense may add new keys to the schema.
315+
This is fine - we only want to assert that keys we know about are set as expected.
316+
There are also some fields in the retrieved schema we don't care about - eg. 'created_at'
343317
"""
344318
client = get_typesense_client()
345-
collection = client.collections[collection_name()].retrieve()
346-
collection.pop("created_at") # type: ignore
347-
collection.pop("num_documents") # type: ignore
348-
if collection != expected_full_collection_schema():
349-
print(f"Expected schema: {expected_full_collection_schema()}")
350-
print(f"Found schema: {collection}")
351-
raise AssertionError(
352-
f"Collection {collection_name()} exists, but schema does not match expected."
319+
# cast to a wider type, because we want to use it in a more flexible way than TypedDict normally allows.
320+
actual_schema = cast(
321+
dict[str, Any], client.collections[collection_name()].retrieve()
322+
)
323+
expected_schema = expected_full_collection_schema()
324+
errors: list[str] = []
325+
326+
expected_field_names = set(
327+
map(lambda field: field["name"], expected_schema["fields"])
328+
)
329+
actual_field_names = set(
330+
map(lambda field: field["name"], actual_schema["fields"])
331+
)
332+
333+
if missing_fields := expected_field_names - actual_field_names:
334+
errors.append(
335+
f"ERROR: '{collection_name()}' collection schema 'fields' has missing field(s): {missing_fields}."
353336
)
354337

338+
if extra_fields := actual_field_names - expected_field_names:
339+
errors.append(
340+
f"ERROR: '{collection_name()}' collection schema 'fields' "
341+
f"has unexpected extra field(s): {extra_fields}."
342+
)
343+
344+
if actual_field_names == expected_field_names:
345+
for expected_field, actual_field in zip(
346+
sorted(expected_schema["fields"], key=lambda field: field["name"]),
347+
sorted(actual_schema["fields"], key=lambda field: field["name"]),
348+
):
349+
for key, expected_value in expected_field.items():
350+
if expected_value != actual_field[key]:
351+
errors.append(
352+
f"ERROR: in collection '{collection_name()}' fields, field '{expected_field['name']}', "
353+
f"key '{key}' failed to validate. "
354+
f"Expected: '{expected_value}', actual '{actual_field[key]}'."
355+
)
356+
357+
for key, expected_value in expected_schema.items():
358+
if key == "fields":
359+
# we've already validated fields separately above
360+
continue
361+
362+
if expected_value != actual_schema[key]:
363+
errors.append(
364+
f"ERROR: in collection '{collection_name()}', key '{key}' failed to validate. "
365+
f"Expected: '{expected_value}', actual '{actual_schema[key]}'."
366+
)
367+
368+
if errors:
369+
for error in errors:
370+
print(error)
371+
raise AssertionError("\n".join(errors))
372+
355373
def refresh_indices(self) -> None:
356374
"""
357375
Noop on Typesense, as all write API operations are synchronous.

0 commit comments

Comments
 (0)