-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial database impl. Including FTS.
- Loading branch information
Showing
18 changed files
with
529 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# coding: utf-8 | ||
|
||
from bookworm.service import BookwormService | ||
from bookworm.signals import reader_book_loaded | ||
from bookworm.concurrency import process_worker | ||
from .models import BaseModel, Document, Page, DocumentFTSIndex | ||
from .tasks import add_document_to_library | ||
|
||
|
||
class LibraryService(BookwormService):
    """Service that submits documents loaded in the reader to the library task."""

    name = "library"
    has_gui = True

    def __post_init__(self):
        """Create the library tables and listen for `reader_book_loaded` from this service's reader."""
        BaseModel.create_all()
        reader_book_loaded.connect(self.on_reader_loaded, sender=self.reader)

    def on_reader_loaded(self, sender):
        """Submit `add_document_to_library` to the process worker for the sender's document."""
        # NOTE(review): category and tags look like hard-coded placeholders — confirm.
        task_kwargs = {
            "document": sender.document,
            "category": "Uncategorized",
            "tags": ["Hello", "world"],
        }
        process_worker.submit(add_document_to_library, **task_kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# coding: utf-8 | ||
|
||
import argparse | ||
from bookworm.library.tasks import add_document_to_library | ||
|
||
|
||
def main():
    """Parse the command-line arguments and print a summary of what was given."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("uri", help="Document URI to open")
    arg_parser.add_argument("--category", type=str, help="Category of the given document")
    arg_parser.add_argument("--tags", type=str, help="Tags of the given document")
    parsed = arg_parser.parse_args()
    summary = (
        f"Opening document: {parsed.uri}\n"
        f"Document category: {parsed.category}\n"
        f"Document tags: {parsed.tags}"
    )
    print(summary)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# coding: utf-8 | ||
|
||
import typing | ||
from peewee import * | ||
from peewee import NodeList, EnclosedNodeList, ColumnBase | ||
from playhouse.apsw_ext import APSWDatabase, BooleanField, DateTimeField | ||
from bookworm.document.uri import DocumentUri | ||
from bookworm.image_io import ImageIO | ||
|
||
|
||
class AutoOptimizedAPSWDatabase(APSWDatabase):
    """Runs ``PRAGMA optimize`` on a connection right before closing it, as recommended for SQLite."""

    def close(self):
        """Optimize the open connection (if any), then close it.

        Returns:
            The value returned by the parent ``close()`` (peewee documents it
            as whether a connection was actually closed), so callers relying
            on that result keep working.
        """
        if self.is_closed():
            # Nothing to optimize; calling ``connection()`` here would open a
            # brand-new connection only to close it again.
            return super().close()
        try:
            self.connection().cursor().execute("PRAGMA optimize")
        finally:
            # Close even if the PRAGMA fails so the connection is not leaked.
            was_open = super().close()
        return was_open
|
||
|
||
class AutoCalculatedField(Field):
    """Field whose DDL declares a generated column: ``<column> [GENERATED ALWAYS] AS (<expr>) VIRTUAL|STORED``."""

    AUTO_GEN_COLUMN_TYPES = ('virtual', 'stored',)

    def __init__(
        self,
        *args,
        auto_gen_data_type: typing.Union[typing.Type[Field], str],
        auto_gen_expression: ColumnBase,
        auto_gen_always: bool = True,
        auto_gen_column_type: str = 'virtual',
        **kwargs,
    ):
        """Configure the generated column.

        Args:
            auto_gen_data_type: either a raw SQL type name, or a field class
                that is instantiated to obtain its DDL data type.
            auto_gen_expression: expression placed inside ``AS (...)``.
            auto_gen_always: emit the ``GENERATED ALWAYS`` keywords when true.
            auto_gen_column_type: one of ``AUTO_GEN_COLUMN_TYPES``.

        Raises:
            ValueError: if ``auto_gen_column_type`` is not a supported value.
        """
        # Explicit check instead of ``assert`` so validation survives ``python -O``.
        if auto_gen_column_type not in self.AUTO_GEN_COLUMN_TYPES:
            raise ValueError(
                f"auto_gen_column_type must be one of {self.AUTO_GEN_COLUMN_TYPES}"
            )
        super().__init__(*args, **kwargs)
        self.auto_gen_data_type = auto_gen_data_type
        self.auto_gen_expression = auto_gen_expression
        self.auto_gen_always = auto_gen_always
        self.auto_gen_column_type = auto_gen_column_type

    def ddl_datatype(self, ctx):
        """Return the column's SQL data type, from the raw string or from the given field class."""
        if isinstance(self.auto_gen_data_type, str):
            return self.auto_gen_data_type
        return self.auto_gen_data_type().ddl_datatype(ctx)

    def ddl(self, ctx):
        """Extend the regular column DDL with the generated-column clause."""
        parts = [super().ddl(ctx)]
        if self.auto_gen_always:
            # Only add the keywords when requested; an empty SQL node would
            # merely leave a stray space in the statement.
            parts.append(SQL("GENERATED ALWAYS"))
        parts.append(SQL('AS'))
        parts.append(EnclosedNodeList([self.auto_gen_expression]))
        parts.append(SQL(self.auto_gen_column_type.upper()))
        return NodeList(tuple(parts))
|
||
|
||
|
||
|
||
class ImageField(BlobField):
    """Uses ImageIO to store and retrieve images from the database."""

    def db_value(self, value):
        """Serialize the image to JPEG bytes; ``None`` is passed through for nullable columns."""
        if value is None:
            return None
        return value.as_bytes(format="JPEG")

    def python_value(self, value):
        """Rebuild the image from stored bytes via ``ImageIO.from_bytes``; ``None`` stays ``None``."""
        if value is None:
            return None
        return ImageIO.from_bytes(value)
|
||
|
||
|
||
class DocumentUriField(TextField):
    """Text field that stores a document URI object as its URI string."""

    def db_value(self, value):
        """Convert the URI object with ``to_uri_string()``; ``None`` is passed through unchanged."""
        if value is None:
            return None
        return value.to_uri_string()

    def python_value(self, value):
        """Parse the stored string with ``DocumentUri.from_uri_string``; ``None`` stays ``None``."""
        if value is None:
            return None
        return DocumentUri.from_uri_string(value)
|
||
|
||
class SqliteViewSchemaManager(SchemaManager):
    """Schema manager that emits ``CREATE VIEW`` instead of ``CREATE TABLE`` for its model."""

    def _create_table(self, safe=True, **options):
        """Build ``CREATE [TEMPORARY] VIEW [IF NOT EXISTS] <model> AS <select>``.

        Args:
            safe: add ``IF NOT EXISTS`` when true.
            **options: only ``temporary`` is consulted here.

        Returns:
            The SQL context holding the statement.

        Raises:
            TypeError: if the model does not provide ``view_select_builder``.
        """
        if not getattr(self.model, 'view_select_builder', None):
            raise TypeError("view_select_builder method is required on view tables.")
        is_temp = options.pop('temporary', False)
        ctx = self._create_context()
        ctx.literal('CREATE TEMPORARY VIEW ' if is_temp else 'CREATE VIEW ')
        if safe:
            ctx.literal('IF NOT EXISTS ')
        ctx.sql(self.model).literal(' ')
        ctx.literal('AS ')
        # The model supplies the SELECT statement that defines the view.
        ctx.sql(self.model.view_select_builder())
        return ctx
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# coding: utf-8 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
# coding: utf-8 | ||
|
||
import os | ||
import ujson | ||
from dataclasses import dataclass | ||
from datetime import datetime | ||
from peewee import * | ||
from playhouse.sqlite_ext import ( | ||
FTSModel, | ||
AutoIncrementField, | ||
RowIDField, | ||
JSONField, | ||
SearchField, | ||
) | ||
from bookworm.paths import db_path | ||
from .database import ( | ||
AutoOptimizedAPSWDatabase, | ||
AutoCalculatedField, | ||
BooleanField, | ||
DateTimeField, | ||
DocumentUriField, | ||
ImageField, | ||
SqliteViewSchemaManager, | ||
) | ||
|
||
|
||
# Written to ``PRAGMA application_id`` / ``PRAGMA user_version`` by
# ``BaseModel.create_all`` to stamp the database file.
BOOKWORM_BOOKSHELF_APP_ID = 10194273
BOOKWORM_BOOKSHELF_SCHEMA_VERSION = 1
# Path of the SQLite file that backs the library.
LIBRARY_DATABASE_FILE = db_path("bookshelf.sqlite")
# Shared database object used by every model through ``BaseModel.Meta``.
database = AutoOptimizedAPSWDatabase(
    os.fspath(LIBRARY_DATABASE_FILE),
    json_contains=True,
    pragmas=[
        # Negative cache_size is interpreted by SQLite as KiB (about 64 MiB here).
        ('cache_size', -1024 * 64),
        ('journal_mode', 'wal'),
        ('foreign_keys', 'ON'),
    ]
)
|
||
|
||
@dataclass
class FullTextSearchResult:
    """One full-text search hit: the matching page of a document plus an optional snippet."""

    document_id: int
    page_index: int
    document_title: str = None
    snippet: str = None

    @property
    def document(self):
        """Return the ``Document`` row whose id is ``document_id``."""
        # The statements that used to follow this return were unreachable
        # and have been removed.
        return Document.get_by_id(self.document_id)
|
||
|
||
class BaseModel(Model):
    """Common base that binds every library model to the shared ``database`` object."""

    class Meta:
        database = database
        legacy_table_names = False

    @classmethod
    def create_all(cls):
        """Create all library tables and views, then set the application id and schema version pragmas."""
        db = cls._meta.database
        library_models = (
            Author,
            Category,
            Format,
            Tag,
            Document,
            Page,
            VwDocumentPage,
            DocumentAuthor,
            DocumentTag,
            DocumentFTSIndex,
        )
        db.create_tables(library_models)
        with db:
            pragma_cursor = db.connection().cursor()
            for pragma_sql in (
                f"PRAGMA application_id={BOOKWORM_BOOKSHELF_APP_ID}",
                f"PRAGMA user_version={BOOKWORM_BOOKSHELF_SCHEMA_VERSION}",
            ):
                pragma_cursor.execute(pragma_sql)
|
||
|
||
class Author(BaseModel):
    """An author, stored by name (indexed, not required to be unique)."""

    name = TextField(null=False, index=True)
|
||
|
||
class Category(BaseModel):
    """A document category with a unique, non-null name."""

    name = TextField(null=False, unique=True)
|
||
|
||
class Format(BaseModel):
    """A document format with a unique, non-null name."""

    name = TextField(null=False, unique=True)
|
||
|
||
class Tag(BaseModel):
    """A tag with a unique, non-null name."""

    name = TextField(null=False, unique=True)
|
||
|
||
class Document(BaseModel):
    """A library document: URI, title, dates, optional cover image, format, category and JSON metadata."""

    id = AutoIncrementField()
    uri = DocumentUriField(null=False, unique=True)
    title = TextField(null=False, index=True)
    publication_date = DateTimeField(null=True, index=True)
    # The callable is passed uncalled, so it is evaluated when a default is needed.
    date_added = DateTimeField(null=False, index=True, default=datetime.utcnow)
    cover_image = ImageField(null=True)
    format = ForeignKeyField(
        model=Format,
        field="id",
        column_name="format_id",
        backref="documents",
    )
    category = ForeignKeyField(
        model=Category,
        field="id",
        column_name="category_id",
        backref="documents",
    )
    metadata = JSONField(null=True, json_dumps=ujson.dumps, json_loads=ujson.loads)
|
||
|
||
class Page(BaseModel):
    """A single page of a document: its number and text content."""

    number = IntegerField(null=False)
    content = TextField(null=False)
    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="pages",
    )
|
||
|
||
class DocumentAuthor(BaseModel):
    """Link table associating documents with authors."""

    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="authors",
    )
    author = ForeignKeyField(
        model=Author,
        field="id",
        column_name="author_id",
        backref="documents",
    )

    class Meta:
        # The (document, author) pair is both the primary key and a unique index.
        primary_key = CompositeKey("document", "author")
        indexes = ((("document", "author"), True),)
|
||
|
||
class DocumentTag(BaseModel):
    """Link table associating documents with tags."""

    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="tags",
    )
    tag = ForeignKeyField(
        model=Tag,
        field="id",
        column_name="tag_id",
        backref="documents",
    )

    class Meta:
        # The (document, tag) pair is both the primary key and a unique index.
        primary_key = CompositeKey("document", "tag")
        indexes = ((("document", "tag"), True),)
|
||
|
||
class VwDocumentPage(BaseModel):
    """View model joining each page with the id and title of its document."""

    page_id = IntegerField()
    page_number = IntegerField()
    document_id = IntegerField()
    document_title = TextField()
    content = TextField()

    @classmethod
    def view_select_builder(cls):
        """Return the SELECT (pages joined to their documents) that defines this view."""
        view_columns = (
            Page.id.alias("page_id"),
            Page.number.alias("page_number"),
            Document.id.alias("document_id"),
            Document.title.alias("document_title"),
            Page.content.alias("content"),
        )
        join_condition = Page.document_id == Document.id
        return Page.select(*view_columns).join(Document, on=join_condition)

    class Meta:
        primary_key = False
        # Emits CREATE VIEW rather than CREATE TABLE for this model.
        schema_manager_class = SqliteViewSchemaManager
|
||
|
||
class DocumentFTSIndex(BaseModel, FTSModel):
    """FTS5 index over page content, declared with ``VwDocumentPage`` as its content source."""

    rowid = RowIDField()
    page_number = SearchField(unindexed=True)
    document_id = SearchField(unindexed=True)
    document_title = SearchField(unindexed=True)
    content = SearchField()

    @classmethod
    def add_document_to_search_index(cls, document_id):
        """Build the ``INSERT ... SELECT`` that copies one document's pages into the index.

        Args:
            document_id: id of the document whose pages are selected.

        Returns:
            The insert query; this method does not execute it.
        """
        return DocumentFTSIndex.insert_from(
            (
                VwDocumentPage.select(
                    VwDocumentPage.page_id.alias("rowid"),
                    VwDocumentPage.page_number.alias("page_number"),
                    VwDocumentPage.document_id.alias("document_id"),
                    VwDocumentPage.document_title.alias("document_title"),
                    VwDocumentPage.content.alias("content"),
                )
                .join(Document, on=VwDocumentPage.document_id == Document.id)
                .join(Page, on=VwDocumentPage.page_id == Page.id)
                .where(Document.id == document_id)
            ),
            fields=[
                "rowid",
                "page_number",
                "document_id",
                "document_title",
                "content",
            ]
        )

    @classmethod
    def perform_search(cls, column, term):
        """Build the query matching ``term`` against ``column``.

        Rows are ordered by bm25 rank, then document id, then page number.

        Args:
            column: the search field to match and take the snippet from.
            term: the full-text search expression.
        """
        return (
            cls.select(
                cls.page_number,
                cls.document_id,
                cls.document_title,
                column.snippet(left='', right='', over_length='', max_tokens=24),
            )
            .where(column.match(term))
            # A single order_by call: in peewee each further ``order_by``
            # replaces the previous one, which would drop the bm25 ranking.
            .order_by(
                fn.bm25(cls._meta.entity),
                cls.document_id,
                cls.page_number.asc(),
            )
        )

    @classmethod
    def search_for_term(cls, term) -> list[FullTextSearchResult]:
        """Yield a ``FullTextSearchResult`` for every page whose content matches ``term``.

        NOTE: despite the return annotation this is a generator; results are
        produced lazily while the cursor is consumed.
        """
        # Execute with bound parameters instead of ``str(query)`` so the
        # user-supplied term is never spliced into the SQL text.
        sql, params = cls.perform_search(cls.content, term).sql()
        connection = cls._meta.database.connection()
        with connection:
            cursor = connection.cursor()
            content_matches = cursor.execute(sql, tuple(params))
            for (page_number, document_id, document_title, snippet) in content_matches:
                yield FullTextSearchResult(
                    page_index=page_number,
                    document_id=document_id,
                    document_title=document_title,
                    snippet=snippet,
                )

    class Meta:
        extension_module = "fts5"
        options = {
            'tokenize': 'porter',
            "content": VwDocumentPage,
            "content_rowid": VwDocumentPage.page_id,
        }
Oops, something went wrong.