Initial database impl. Including FTS.

blindpandas · Nov 29, 2021 · edf167d · edf167d
1 parent a1b1369
commit edf167d
Show file tree

Hide file tree

Showing 18 changed files with 529 additions and 16 deletions.
diff --git a/bookworm/app.py b/bookworm/app.py
@@ -10,8 +10,8 @@
 description = "The Universally accessible document reader"
 author = "Blind Pandas"
 author_email = "info@blindpandas.com"
-version = "0.4b1"
-version_ex = "0.4.0.1"
+version = "2021.12b1"
+version_ex = "2021.12.0.0"
 url = "https://github.com/blindpandas/bookworm"
 website = "https://getbookworm.com"
 update_url = "https://getbookworm.com/update_info.json"

diff --git a/bookworm/bookshelf/__init__.py b/bookworm/bookshelf/__init__.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+
+from bookworm.service import BookwormService
+from bookworm.signals import reader_book_loaded
+from bookworm.concurrency import process_worker
+from .models import BaseModel, Document, Page, DocumentFTSIndex
+from .tasks import add_document_to_library
+
+
+class LibraryService(BookwormService):
+    """Service that records each document loaded by the reader into the library.
+
+    On start-up it asks ``BaseModel`` to create the library tables and then
+    listens for ``reader_book_loaded`` signals sent by this service's reader.
+    """
+
+    name = "library"
+    has_gui = True
+
+    def __post_init__(self):
+        # Create the schema first so it exists before any document is submitted.
+        BaseModel.create_all()
+        reader_book_loaded.connect(self.on_reader_loaded, sender=self.reader)
+
+    def on_reader_loaded(self, sender):
+        """Submit the freshly loaded document to ``process_worker`` for indexing."""
+        # NOTE(review): the category and tags below look like placeholder
+        # values from early development — confirm before relying on them.
+        task_arguments = {
+            "document": sender.document,
+            "category": "Uncategorized",
+            "tags": ["Hello", "world"],
+        }
+        process_worker.submit(add_document_to_library, **task_arguments)
diff --git a/bookworm/bookshelf/commandline_app.py b/bookworm/bookshelf/commandline_app.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+
+import argparse
+from bookworm.library.tasks import add_document_to_library
+
+
+def main(argv=None):
+    """Parse the command line and print the requested document details.
+
+    Args:
+        argv: Optional list of argument strings. When ``None`` (the default)
+            argparse reads ``sys.argv[1:]``, which is what the original
+            zero-argument call did.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("uri", help="Document URI to open")
+    parser.add_argument("--category", help="Category of the given document", type=str)
+    parser.add_argument("--tags", help="Tags of the given document", type=str)
+    args = parser.parse_args(argv)
+    print(
+        f"Opening document: {args.uri}\n"
+        f"Document category: {args.category}\n"
+        f"Document tags: {args.tags}"
+    )
+
+if __name__ == '__main__':
+    main()
diff --git a/bookworm/bookshelf/database.py b/bookworm/bookshelf/database.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+
+import typing
+from peewee import *
+from peewee import NodeList, EnclosedNodeList, ColumnBase
+from playhouse.apsw_ext import APSWDatabase, BooleanField, DateTimeField
+from bookworm.document.uri import DocumentUri
+from bookworm.image_io import ImageIO
+
+
+class AutoOptimizedAPSWDatabase(APSWDatabase):
+    """APSW database that runs ``PRAGMA optimize`` just before a connection is closed."""
+
+    def close(self):
+        """Optimize the currently open connection, then close it.
+
+        Returns:
+            Whatever the parent ``close()`` returns, so callers that inspect
+            the result keep working.
+        """
+        # ``connection()`` connects on demand in peewee, so calling it on an
+        # already-closed database would open a connection only to optimize and
+        # close it again. Skip the pragma when there is nothing to close.
+        if not self.is_closed():
+            cursor = self.connection().cursor()
+            cursor.execute("PRAGMA optimize")
+        return super().close()
+
+
+class AutoCalculatedField(Field):
+    """Field whose DDL declares a generated column.
+
+    The emitted column definition is the regular field DDL followed by
+    ``[GENERATED ALWAYS] AS (<expression>) VIRTUAL|STORED``.
+    """
+
+    AUTO_GEN_COLUMN_TYPES = ('virtual', 'stored',)
+
+    def __init__(self, *args, auto_gen_data_type: typing.Union[typing.Type[Field], str], auto_gen_expression: ColumnBase, auto_gen_always: bool=True, auto_gen_column_type: str='virtual', **kwargs):
+        """Store the generated-column settings.
+
+        Args:
+            auto_gen_data_type: Either a literal SQL type name or a field
+                class that is instantiated to obtain the column's data type.
+            auto_gen_expression: Expression placed inside ``AS (...)``.
+            auto_gen_always: Whether to emit ``GENERATED ALWAYS``.
+            auto_gen_column_type: One of ``AUTO_GEN_COLUMN_TYPES``.
+
+        Raises:
+            ValueError: If ``auto_gen_column_type`` is not a supported value.
+        """
+        # Raise explicitly: an ``assert`` is stripped under ``python -O`` and
+        # would let an arbitrary string reach the generated DDL.
+        if auto_gen_column_type not in self.AUTO_GEN_COLUMN_TYPES:
+            raise ValueError(f"auto_gen_column_type must be one of {self.AUTO_GEN_COLUMN_TYPES}")
+        super().__init__(*args, **kwargs)
+        self.auto_gen_data_type = auto_gen_data_type
+        self.auto_gen_expression = auto_gen_expression
+        self.auto_gen_always = auto_gen_always
+        self.auto_gen_column_type = auto_gen_column_type
+
+    def ddl_datatype(self, ctx):
+        """Return the column data type: the given string, or the one of the given field class."""
+        if isinstance(self.auto_gen_data_type, str):
+            return self.auto_gen_data_type
+        return self.auto_gen_data_type().ddl_datatype(ctx)
+
+    def ddl(self, ctx):
+        """Return the base column DDL extended with the generated-column clause."""
+        parts = [super().ddl(ctx)]
+        # Only add the prefix when requested, rather than an empty SQL node
+        # that would leave a stray double space in the statement.
+        if self.auto_gen_always:
+            parts.append(SQL("GENERATED ALWAYS"))
+        parts.append(SQL('AS'))
+        parts.append(EnclosedNodeList([self.auto_gen_expression,]))
+        parts.append(SQL(self.auto_gen_column_type.upper()))
+        return NodeList(tuple(parts))
+
+
+
+
+class ImageField(BlobField):
+    """Uses ImageIO to store and retrieve images from the database.
+
+    Images are written as JPEG bytes. ``None`` is passed through unchanged in
+    both directions so the field can be declared with ``null=True``.
+    """
+
+    def db_value(self, value):
+        """Convert an image object to JPEG bytes for storage; ``None`` stays ``None``."""
+        if value is None:
+            return None
+        return value.as_bytes(format="JPEG")
+
+    def python_value(self, value):
+        """Rebuild an ``ImageIO`` from the stored bytes; ``None`` stays ``None``."""
+        if value is None:
+            return None
+        return ImageIO.from_bytes(value)
+
+
+
+class DocumentUriField(TextField):
+    """Stores a ``DocumentUri`` as its URI string.
+
+    ``None`` is passed through unchanged in both directions, so NULL columns
+    and ``IS NULL`` comparisons do not raise ``AttributeError``.
+    """
+
+    def db_value(self, value):
+        """Serialize a ``DocumentUri`` to its string form; ``None`` stays ``None``."""
+        if value is None:
+            return None
+        return value.to_uri_string()
+
+    def python_value(self, value):
+        """Parse the stored string back into a ``DocumentUri``; ``None`` stays ``None``."""
+        if value is None:
+            return None
+        return DocumentUri.from_uri_string(value)
+
+
+class SqliteViewSchemaManager(SchemaManager):
+    """Schema manager that emits ``CREATE VIEW`` instead of ``CREATE TABLE``.
+
+    The managed model must provide a ``view_select_builder`` callable that
+    returns the query defining the view.
+    """
+
+    def _create_table(self, safe=True, **options):
+        """Build ``CREATE [TEMPORARY] VIEW [IF NOT EXISTS] <model> AS <select>``.
+
+        Args:
+            safe: When true, add ``IF NOT EXISTS``.
+            **options: Only ``temporary`` is consulted; other options are ignored.
+
+        Returns:
+            The SQL context holding the statement.
+
+        Raises:
+            TypeError: If the model has no ``view_select_builder``.
+        """
+        if not getattr(self.model, 'view_select_builder', None):
+            raise TypeError("view_select_builder method is required on view tables.")
+        is_temp = options.pop('temporary', False)
+        ctx = self._create_context()
+        ctx.literal('CREATE TEMPORARY VIEW ' if is_temp else 'CREATE VIEW ')
+        if safe:
+            ctx.literal('IF NOT EXISTS ')
+        ctx.sql(self.model).literal(' ')
+        ctx.literal('AS ')
+        ctx.sql(self.model.view_select_builder())
+        return ctx
+
diff --git a/bookworm/bookshelf/interface.py b/bookworm/bookshelf/interface.py
@@ -0,0 +1,2 @@
+# coding: utf-8
+
diff --git a/bookworm/bookshelf/models.py b/bookworm/bookshelf/models.py
@@ -0,0 +1,221 @@
+# coding: utf-8
+
+import os
+import ujson
+from dataclasses import dataclass
+from datetime import datetime
+from peewee import *
+from playhouse.sqlite_ext import (
+    FTSModel,
+    AutoIncrementField,
+    RowIDField,
+    JSONField,
+    SearchField,
+)
+from bookworm.paths import db_path
+from .database import (
+    AutoOptimizedAPSWDatabase,
+    AutoCalculatedField,
+    BooleanField,
+    DateTimeField,
+    DocumentUriField,
+    ImageField,
+    SqliteViewSchemaManager,
+)
+
+
+# Written into the database file by BaseModel.create_all() through
+# ``PRAGMA application_id`` and ``PRAGMA user_version`` respectively.
+BOOKWORM_BOOKSHELF_APP_ID = 10194273
+BOOKWORM_BOOKSHELF_SCHEMA_VERSION = 1
+# Path of the bookshelf database file, resolved through ``db_path``.
+LIBRARY_DATABASE_FILE = db_path("bookshelf.sqlite")
+# Shared database handle; every model reaches it through BaseModel.Meta.
+database = AutoOptimizedAPSWDatabase(
+    os.fspath(LIBRARY_DATABASE_FILE),
+    json_contains=True,
+    pragmas=[
+        # Per the SQLite documentation a negative cache_size is a size in
+        # KiB, so this requests roughly 64 MiB of page cache.
+        ('cache_size', -1024 * 64),
+        # Write-ahead logging journal mode.
+        ('journal_mode', 'wal'),
+        # Enforce the foreign-key constraints declared by the models.
+        ('foreign_keys', 'ON'),
+    ]
+)
+
+
+@dataclass
+class FullTextSearchResult:
+    """One hit produced by a full-text search over indexed pages."""
+
+    # Primary key of the matching document.
+    document_id: int
+    # Page number of the matching page, as stored in the search index.
+    page_index: int
+    # Title of the matching document, when it was selected by the query.
+    document_title: str = None
+    # Text excerpt around the match, when it was selected by the query.
+    snippet: str = None
+
+    @property
+    def document(self):
+        """Fetch the ``Document`` row this result belongs to."""
+        # The original had two more statements after this return; they could
+        # never run and were removed.
+        return Document.get_by_id(self.document_id)
+
+
+class BaseModel(Model):
+    """Common base of all bookshelf models; binds them to the shared database."""
+
+    class Meta:
+        database = database
+        legacy_table_names = False
+
+    @classmethod
+    def create_all(cls):
+        """Create every bookshelf table/view, then stamp the identification pragmas."""
+        db = cls._meta.database
+        # Creation order is kept as-is: referenced tables first, then the
+        # view, the association tables and finally the search index.
+        all_models = (
+            Author,
+            Category,
+            Format,
+            Tag,
+            Document,
+            Page,
+            VwDocumentPage,
+            DocumentAuthor,
+            DocumentTag,
+            DocumentFTSIndex,
+        )
+        db.create_tables(all_models)
+        with db:
+            pragma_cursor = db.connection().cursor()
+            pragma_cursor.execute(f"PRAGMA application_id={BOOKWORM_BOOKSHELF_APP_ID}")
+            pragma_cursor.execute(f"PRAGMA user_version={BOOKWORM_BOOKSHELF_SCHEMA_VERSION}")
+
+
+class Author(BaseModel):
+    """An author, identified by name (indexed, but not declared unique)."""
+    name = TextField(index=True, null=False)
+
+
+class Category(BaseModel):
+    """A document category with a unique, non-null name."""
+    name = TextField(unique=True, null=False)
+
+
+class Format(BaseModel):
+    """A document format with a unique, non-null name."""
+    name = TextField(unique=True, null=False)
+
+
+class Tag(BaseModel):
+    """A tag with a unique, non-null name."""
+    name = TextField(unique=True, null=False)
+
+
+class Document(BaseModel):
+    """A document stored in the bookshelf library."""
+    # Auto-incrementing integer primary key.
+    id = AutoIncrementField()
+    # Location of the document; stored as a URI string and unique per row.
+    uri = DocumentUriField(unique=True, null=False)
+    title = TextField(index=True, null=False)
+    publication_date = DateTimeField(index=True, null=True)
+    # Defaults to the (naive) UTC time at which the row is created.
+    date_added = DateTimeField(default=datetime.utcnow, index=True, null=False)
+    # Optional cover image, converted to and from bytes by ImageField.
+    cover_image = ImageField(null=True)
+    # Reverse accessors: ``Format.documents`` and ``Category.documents``.
+    format = ForeignKeyField(column_name="format_id", field="id", model=Format, backref="documents")
+    category = ForeignKeyField(column_name="category_id", field="id", model=Category, backref="documents")
+    # Optional free-form metadata, (de)serialized with ujson.
+    metadata = JSONField(json_dumps=ujson.dumps, json_loads=ujson.loads, null=True)
+
+
+class Page(BaseModel):
+    """The text content of a single page of a document."""
+    # NOTE(review): whether page numbers are 0- or 1-based is not visible here.
+    number = IntegerField(null=False)
+    content = TextField(null=False)
+    # Owning document; reverse accessor is ``Document.pages``.
+    document = ForeignKeyField(column_name="document_id", field="id", model=Document, backref="pages")
+
+
+class DocumentAuthor(BaseModel):
+    """Association table linking documents and authors."""
+    # Reverse accessors: ``Document.authors`` and ``Author.documents``.
+    document = ForeignKeyField(column_name="document_id", field="id", model=Document, backref="authors")
+    author = ForeignKeyField(column_name="author_id", field="id", model=Author, backref="documents")
+
+    class Meta:
+        # NOTE(review): the composite primary key below already covers
+        # (document, author), so this unique index looks redundant — confirm.
+        indexes = ((("document", "author"), True),)
+        primary_key = CompositeKey("document", "author")
+
+
+class DocumentTag(BaseModel):
+    """Association table linking documents and tags."""
+    # Reverse accessors: ``Document.tags`` and ``Tag.documents``.
+    document = ForeignKeyField(column_name="document_id", field="id", model=Document, backref="tags")
+    tag = ForeignKeyField(column_name="tag_id", field="id", model=Tag, backref="documents")
+
+    class Meta:
+        # NOTE(review): the composite primary key below already covers
+        # (document, tag), so this unique index looks redundant — confirm.
+        indexes = ((("document", "tag"), True),)
+        primary_key = CompositeKey("document", "tag")
+
+
+class VwDocumentPage(BaseModel):
+    """Database view exposing each page together with its document's id and title."""
+
+    page_id = IntegerField()
+    page_number = IntegerField()
+    document_id = IntegerField()
+    document_title = TextField()
+    content = TextField()
+
+    @classmethod
+    def view_select_builder(cls):
+        """Return the SELECT that defines the view: pages joined to their documents."""
+        view_columns = (
+            Page.id.alias("page_id"),
+            Page.number.alias("page_number"),
+            Document.id.alias("document_id"),
+            Document.title.alias("document_title"),
+            Page.content.alias("content"),
+        )
+        page_belongs_to_document = Page.document_id == Document.id
+        return Page.select(*view_columns).join(Document, on=page_belongs_to_document)
+
+    class Meta:
+        # A view has no primary key of its own and is created by the
+        # view-aware schema manager rather than as a regular table.
+        primary_key = False
+        schema_manager_class = SqliteViewSchemaManager
+
+
+class DocumentFTSIndex(BaseModel, FTSModel):
+    """FTS5 full-text index over page content, using VwDocumentPage as its content source.
+
+    Only ``content`` is searchable; the other columns are stored unindexed so
+    that hits can be reported without an extra lookup.
+    """
+    rowid = RowIDField()
+    page_number = SearchField(unindexed=True)
+    document_id = SearchField(unindexed=True)
+    document_title = SearchField(unindexed=True)
+    content = SearchField()
+
+    @classmethod
+    def add_document_to_search_index(cls, document_id):
+        """Build the ``INSERT ... SELECT`` that indexes every page of one document.
+
+        The query is returned as-is; this method does not call ``execute()``
+        on it.
+
+        Args:
+            document_id: Primary key of the document whose pages are indexed.
+        """
+        return DocumentFTSIndex.insert_from(
+            (
+                VwDocumentPage.select(
+                    VwDocumentPage.page_id.alias("rowid"),
+                    VwDocumentPage.page_number.alias("page_number"),
+                    VwDocumentPage.document_id.alias("document_id"),
+                    VwDocumentPage.document_title.alias("document_title"),
+                    VwDocumentPage.content.alias("content"),
+                )
+                .join(Document, on=VwDocumentPage.document_id == Document.id)
+                .join(Page, on=VwDocumentPage.page_id == Page.id)
+                .where(Document.id == document_id)
+            ),
+            fields=[
+                "rowid",
+                "page_number",
+                "document_id",
+                "document_title",
+                "content",
+            ]
+        )
+
+    @classmethod
+    def perform_search(cls, column, term):
+        """Build the SELECT that matches ``term`` against ``column``.
+
+        Selected columns, in order: page number, document id, document title
+        and a snippet of the matching text. Rows are ordered by bm25 rank,
+        then document id, then page number.
+
+        Args:
+            column: The search field to match against and take the snippet from.
+            term: The full-text search expression.
+        """
+        return (
+            cls.select(
+                cls.page_number,
+                cls.document_id,
+                cls.document_title,
+                column.snippet(left='', right='', over_length='', max_tokens=24),
+            )
+            .where(column.match(term))
+            # All sort keys go in one call: in peewee each ``order_by()`` call
+            # replaces the previous ordering, so chaining three calls kept
+            # only the last key and silently dropped the relevance ranking.
+            .order_by(
+                fn.bm25(cls._meta.entity),
+                cls.document_id,
+                cls.page_number.asc(),
+            )
+        )
+
+    @classmethod
+    def search_for_term(cls, term) -> list[FullTextSearchResult]:
+        """Yield a ``FullTextSearchResult`` for every page whose content matches ``term``.
+
+        Despite the return annotation this is a generator: results are
+        produced lazily while the cursor is iterated.
+        """
+        connection = cls._meta.database.connection()
+        with connection:
+            cursor = connection.cursor()
+            # Bind the values as parameters instead of executing ``str(query)``,
+            # which would splice the user-supplied term into the SQL text.
+            query_sql, query_params = cls.perform_search(cls.content, term).sql()
+            content_matches = cursor.execute(query_sql, tuple(query_params))
+            for (page_number, document_id, document_title, snippet) in content_matches:
+                yield FullTextSearchResult(page_index=page_number, document_id=document_id, document_title=document_title, snippet=snippet)
+
+    class Meta:
+        extension_module  = "fts5"
+        options = {
+            'tokenize': 'porter',
+            # External-content index: row data is read from the view, keyed by page id.
+            "content": VwDocumentPage,
+            "content_rowid": VwDocumentPage.page_id,
+        }