Skip to content

Commit

Permalink
some search index optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
jonaswinkler committed Feb 15, 2021
1 parent 56bd966 commit 8bf4241
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 34 deletions.
10 changes: 6 additions & 4 deletions src/documents/admin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from django.contrib import admin
from whoosh.writing import AsyncWriter

from . import index
from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule

Expand Down Expand Up @@ -84,17 +82,21 @@ def created_(self, obj):
created_.short_description = "Created"

def delete_queryset(self, request, queryset):
ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index

with index.open_index_writer() as writer:
for o in queryset:
index.remove_document(writer, o)

super(DocumentAdmin, self).delete_queryset(request, queryset)

# Admin hook for deleting a single document: the document is removed from the
# search index first, then the actual deletion is delegated to the parent
# admin class.
def delete_model(self, request, obj):
# The index module is imported at call time instead of at module import time.
from documents import index
index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj)

# Admin hook for saving a document: the search index entry is added/updated
# from the in-memory object, then the save itself is delegated to the parent
# admin class.
def save_model(self, request, obj, form, change):
# The index module is imported at call time instead of at module import time.
from documents import index
# NOTE(review): the index is written before the parent save_model() runs —
# confirm this ordering is intended.
index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change)

Expand Down
7 changes: 3 additions & 4 deletions src/documents/bulk_edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

from django.db.models import Q
from django_q.tasks import async_task
from whoosh.writing import AsyncWriter

from documents import index
from documents.models import Document, Correspondent, DocumentType


Expand Down Expand Up @@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete()

ix = index.open_index()
with AsyncWriter(ix) as writer:
from documents import index

with index.open_index_writer() as writer:
for id in doc_ids:
index.remove_document_by_id(writer, id)

Expand Down
22 changes: 18 additions & 4 deletions src/documents/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,22 @@ def open_index(recreate=False):
return create_in(settings.INDEX_DIR, get_schema())


@contextmanager
def open_index_writer(ix=None, optimize=False):
    """Yield an ``AsyncWriter`` for the search index and finalise it on exit.

    :param ix: index to write to; when omitted (or falsy) the index returned
        by ``open_index()`` is used.
    :param optimize: forwarded to ``writer.commit(optimize=...)`` when the
        ``with`` body finishes without raising.

    If the ``with`` body raises an ``Exception``, the error is logged, the
    pending changes are cancelled and the exception is *not* propagated
    (index maintenance stays best-effort for callers). The writer is
    committed only when the body completed successfully; it is never
    committed after it has been cancelled.
    """
    writer = AsyncWriter(ix if ix else open_index())

    try:
        yield writer
    except Exception as e:
        # Best-effort behaviour kept from the original: log and swallow.
        logger.exception(str(e))
        writer.cancel()
    except BaseException:
        # KeyboardInterrupt / SystemExit / GeneratorExit: discard the pending
        # changes so the writer is released, then let the signal propagate.
        writer.cancel()
        raise
    else:
        # Only a fully successful batch is written to the index. Committing in
        # a ``finally`` clause would also run after ``cancel()`` above.
        writer.commit(optimize=optimize)


def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
Expand All @@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):


def add_or_update_document(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
update_document(writer, document)


def remove_document_from_index(document):
ix = open_index()
with AsyncWriter(ix) as writer:
with open_index_writer() as writer:
remove_document(writer, document)


Expand Down
4 changes: 3 additions & 1 deletion src/documents/signals/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.utils import timezone
from filelock import FileLock

from .. import index, matching
from .. import matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, \
generate_unique_filename
Expand Down Expand Up @@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):


# Handler in signals/handlers.py that adds the given document to the search
# index, or updates its existing entry. `sender` and any extra keyword
# arguments are accepted but not used.
def add_to_index(sender, document, **kwargs):
# The index module is imported at call time instead of at module import time.
from documents import index

index.add_or_update_document(document)
36 changes: 26 additions & 10 deletions src/documents/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,60 @@
from django.test import TestCase
from django.utils import timezone

from documents import index
from documents.admin import DocumentAdmin
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


class TestDocumentAdmin(DirectoriesMixin, TestCase):

# Test helper: opens the search index and returns the stored entry whose `id`
# field equals doc.id. The tests in this class rely on the result being None
# when the document is not in the index, and on it supporting item access
# (e.g. ['title']) when it is.
def get_document_from_index(self, doc):
ix = index.open_index()
with ix.searcher() as searcher:
return searcher.document(id=doc.id)

def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())

@mock.patch("documents.admin.index.add_or_update_document")
def test_save_model(self, m):
def test_save_model(self):
doc = Document.objects.create(title="test")

doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once()
self.assertEqual(self.get_document_from_index(doc)['title'], "new title")

@mock.patch("documents.admin.index.remove_document")
def test_delete_model(self, m):
def test_delete_model(self):
doc = Document.objects.create(title="test")
index.add_or_update_document(doc)
self.assertIsNotNone(self.get_document_from_index(doc))

self.doc_admin.delete_model(None, doc)

self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
m.assert_called_once()
self.assertIsNone(self.get_document_from_index(doc))

@mock.patch("documents.admin.index.remove_document")
def test_delete_queryset(self, m):
def test_delete_queryset(self):
docs = []
for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)

self.assertEqual(Document.objects.count(), 42)

for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))

self.doc_admin.delete_queryset(None, Document.objects.all())

self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0)

for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))

def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")
22 changes: 11 additions & 11 deletions src/documents/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
ViewSet
)

import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .classifier import load_classifier
Expand Down Expand Up @@ -176,10 +175,12 @@ def get_serializer(self, *args, **kwargs):
# Performs the regular viewset update, then refreshes the search index entry
# of the updated document and returns the parent's response unchanged.
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
# The index module is imported at call time instead of at module import time.
from documents import index
# The object is fetched again via get_object() after the parent update ran.
index.add_or_update_document(self.get_object())
return response

# Removes the document from the search index, then delegates the actual
# deletion to the parent viewset and returns its response.
def destroy(self, request, *args, **kwargs):
# The index module is imported at call time instead of at module import time.
from documents import index
# The index entry is removed before the parent destroy() is called.
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

Expand Down Expand Up @@ -501,10 +502,6 @@ class SearchView(APIView):

permission_classes = (IsAuthenticated,)

def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()

def add_infos_to_hit(self, r):
try:
doc = Document.objects.get(id=r['id'])
Expand All @@ -525,6 +522,7 @@ def add_infos_to_hit(self, r):
}

def get(self, request, format=None):
from documents import index

if 'query' in request.query_params:
query = request.query_params['query']
Expand Down Expand Up @@ -554,8 +552,10 @@ def get(self, request, format=None):
if page < 1:
page = 1

ix = index.open_index()

try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
Expand All @@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):

permission_classes = (IsAuthenticated,)

def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()

def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
Expand All @@ -587,7 +583,11 @@ def get(self, request, format=None):
else:
limit = 10

return Response(index.autocomplete(self.ix, term, limit))
from documents import index

ix = index.open_index()

return Response(index.autocomplete(ix, term, limit))


class StatisticsView(APIView):
Expand Down

0 comments on commit 8bf4241

Please sign in to comment.