Skip to content

Commit

Permalink
Initial database impl. Including FTS.
Browse files Browse the repository at this point in the history
  • Loading branch information
mush42 committed Nov 29, 2021
1 parent a1b1369 commit edf167d
Show file tree
Hide file tree
Showing 18 changed files with 529 additions and 16 deletions.
4 changes: 2 additions & 2 deletions bookworm/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
# Application metadata: descriptive strings, version identifiers and project URLs.
description = "The Universally accessible document reader"
author = "Blind Pandas"
author_email = "info@blindpandas.com"
# NOTE(review): `version` and `version_ex` are each assigned twice below; as
# written, the later 2021.12 values are the ones that take effect. The 0.4 pair
# looks like the removed side of a diff hunk -- confirm and drop it if so.
version = "0.4b1"
version_ex = "0.4.0.1"
version = "2021.12b1"
version_ex = "2021.12.0.0"
url = "https://github.com/blindpandas/bookworm"
website = "https://getbookworm.com"
update_url = "https://getbookworm.com/update_info.json"
Expand Down
27 changes: 27 additions & 0 deletions bookworm/bookshelf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# coding: utf-8

from bookworm.service import BookwormService
from bookworm.signals import reader_book_loaded
from bookworm.concurrency import process_worker
from .models import BaseModel, Document, Page, DocumentFTSIndex
from .tasks import add_document_to_library


class LibraryService(BookwormService):
    """Service that submits every loaded document for addition to the library."""

    name = "library"
    has_gui = True

    def __post_init__(self):
        """Create the bookshelf tables and subscribe to the reader's load signal."""
        BaseModel.create_all()
        reader_book_loaded.connect(self.on_reader_loaded, sender=self.reader)

    def on_reader_loaded(self, sender):
        """Submit ``add_document_to_library`` to the process worker.

        ``sender`` is the object that emitted ``reader_book_loaded``; its
        ``document`` attribute is what gets added.
        """
        task_arguments = {
            "document": sender.document,
            "category": "Uncategorized",
            "tags": ["Hello", "world"],
        }
        process_worker.submit(add_document_to_library, **task_arguments)
20 changes: 20 additions & 0 deletions bookworm/bookshelf/commandline_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# coding: utf-8

import argparse
from bookworm.library.tasks import add_document_to_library


def main(argv=None):
    """Parse the command line and print the requested document details.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` falls back to ``sys.argv[1:]``, so
            existing ``main()`` callers behave exactly as before.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("uri", help="Document URI to open")
    parser.add_argument("--category", help="Category of the given document", type=str)
    parser.add_argument("--tags", help="Tags of the given document", type=str)
    args = parser.parse_args(argv)
    print(
        f"Opening document: {args.uri}\n"
        f"Document category: {args.category}\n"
        f"Document tags: {args.tags}"
    )


if __name__ == '__main__':
    main()
92 changes: 92 additions & 0 deletions bookworm/bookshelf/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# coding: utf-8

import typing
from peewee import *
from peewee import NodeList, EnclosedNodeList, ColumnBase
from playhouse.apsw_ext import APSWDatabase, BooleanField, DateTimeField
from bookworm.document.uri import DocumentUri
from bookworm.image_io import ImageIO


class AutoOptimizedAPSWDatabase(APSWDatabase):
    """Runs ``PRAGMA optimize`` on a connection right before it is closed,
    as per recommended practices for sqlite3."""

    def close(self):
        """Optimize the open connection (if any), then close it.

        Returns:
            The value returned by the parent ``close()`` (peewee reports
            whether a connection was actually open).
        """
        # Only optimize when a connection is already open: asking for
        # ``connection()`` on a closed database may open a brand-new
        # connection just to optimize and close it again.
        if not self.is_closed():
            cursor = self.connection().cursor()
            cursor.execute("PRAGMA optimize")
        return super().close()


class AutoCalculatedField(Field):
    """Field whose column definition is a generated (computed) column.

    The emitted column definition has the shape::

        <regular field ddl> [GENERATED ALWAYS] AS (<expression>) VIRTUAL|STORED
    """

    AUTO_GEN_COLUMN_TYPES = ('virtual', 'stored',)

    def __init__(
        self,
        *args,
        auto_gen_data_type: typing.Union[typing.Type[Field], Field, str],
        auto_gen_expression: ColumnBase,
        auto_gen_always: bool = True,
        auto_gen_column_type: str = 'virtual',
        **kwargs,
    ):
        """Create the field.

        Args:
            auto_gen_data_type: Data type of the column: a literal SQL type
                name, a field class, or a field instance.
            auto_gen_expression: Expression placed inside ``AS (...)``.
            auto_gen_always: Whether to emit the ``GENERATED ALWAYS`` keywords.
            auto_gen_column_type: ``'virtual'`` or ``'stored'``.

        Raises:
            ValueError: If ``auto_gen_column_type`` is not a supported kind.
        """
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if auto_gen_column_type not in self.AUTO_GEN_COLUMN_TYPES:
            raise ValueError(
                f"auto_gen_column_type must be one of {self.AUTO_GEN_COLUMN_TYPES}"
            )
        super().__init__(*args, **kwargs)
        self.auto_gen_data_type = auto_gen_data_type
        self.auto_gen_expression = auto_gen_expression
        self.auto_gen_always = auto_gen_always
        self.auto_gen_column_type = auto_gen_column_type

    def ddl_datatype(self, ctx):
        """Return the SQL node describing the column's data type."""
        data_type = self.auto_gen_data_type
        if isinstance(data_type, str):
            # Wrap in SQL() so the type name is emitted verbatim; a bare
            # string would be treated as a bound value by the SQL context.
            return SQL(data_type)
        if isinstance(data_type, type):
            data_type = data_type()
        return data_type.ddl_datatype(ctx)

    def ddl(self, ctx):
        """Return the full column definition including the generation clause."""
        nodes = [super().ddl(ctx)]
        if self.auto_gen_always:
            nodes.append(SQL("GENERATED ALWAYS"))
        nodes.extend((
            SQL('AS'),
            EnclosedNodeList([self.auto_gen_expression]),
            SQL(self.auto_gen_column_type.upper()),
        ))
        return NodeList(nodes)




class ImageField(BlobField):
    """Uses ImageIO to store and retrieve images from the database."""

    def db_value(self, value):
        """Return ``value`` encoded as JPEG bytes; ``None`` is stored as NULL."""
        # The field is used with ``null=True``, so ``None`` must pass through
        # instead of failing on ``None.as_bytes``.
        if value is None:
            return None
        return value.as_bytes(format="JPEG")

    def python_value(self, value):
        """Return an ``ImageIO`` built from the stored bytes, or ``None`` for NULL."""
        if value is None:
            return None
        return ImageIO.from_bytes(value)



class DocumentUriField(TextField):
    """Text column that stores a ``DocumentUri`` as its URI string."""

    def db_value(self, value):
        """Return the URI string for ``value``; ``None`` passes through unchanged."""
        if value is None:
            return None
        return value.to_uri_string()

    def python_value(self, value):
        """Return a ``DocumentUri`` parsed from the stored string, or ``None`` for NULL."""
        if value is None:
            return None
        return DocumentUri.from_uri_string(value)


class SqliteViewSchemaManager(SchemaManager):
    """Schema manager that emits ``CREATE VIEW`` in place of ``CREATE TABLE``.

    The model must expose a ``view_select_builder`` callable returning the
    SELECT query that defines the view.
    """

    def _create_table(self, safe=True, **options):
        """Build the ``CREATE [TEMPORARY] VIEW [IF NOT EXISTS] <name> AS <select>`` context.

        Args:
            safe: When true, add ``IF NOT EXISTS``.
            **options: Only ``temporary`` is consulted here.

        Raises:
            TypeError: If the model does not provide ``view_select_builder``.
        """
        select_builder = getattr(self.model, 'view_select_builder', None)
        if not select_builder:
            raise TypeError("view_select_builder method is required on view tables.")
        is_temp = options.pop('temporary', False)
        ctx = self._create_context()
        ctx.literal('CREATE TEMPORARY VIEW ' if is_temp else 'CREATE VIEW ')
        if safe:
            ctx.literal('IF NOT EXISTS ')
        ctx.sql(self.model).literal(' ')
        ctx.literal('AS ')
        ctx.sql(select_builder())
        return ctx

2 changes: 2 additions & 0 deletions bookworm/bookshelf/interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# coding: utf-8

221 changes: 221 additions & 0 deletions bookworm/bookshelf/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# coding: utf-8

import os
from collections.abc import Iterator
from dataclasses import dataclass
from datetime import datetime

import ujson
from peewee import *
from playhouse.sqlite_ext import (
    FTSModel,
    AutoIncrementField,
    RowIDField,
    JSONField,
    SearchField,
)

from bookworm.paths import db_path
from .database import (
    AutoOptimizedAPSWDatabase,
    AutoCalculatedField,
    BooleanField,
    DateTimeField,
    DocumentUriField,
    ImageField,
    SqliteViewSchemaManager,
)


# Identifiers written into the database by BaseModel.create_all via
# ``PRAGMA application_id`` and ``PRAGMA user_version``.
BOOKWORM_BOOKSHELF_APP_ID = 10194273
BOOKWORM_BOOKSHELF_SCHEMA_VERSION = 1
# Location of the bookshelf SQLite file, resolved through ``db_path``.
LIBRARY_DATABASE_FILE = db_path("bookshelf.sqlite")
# Shared database object that every bookshelf model binds to.
database = AutoOptimizedAPSWDatabase(
    os.fspath(LIBRARY_DATABASE_FILE),
    # NOTE(review): presumably registers the ``json_contains`` SQL helper -- confirm against playhouse docs.
    json_contains=True,
    pragmas=[
        # Negative cache_size is a size in KiB per the SQLite docs: 64 * 1024 KiB = 64 MiB.
        ('cache_size', -1024 * 64),
        # Write-ahead logging journal mode.
        ('journal_mode', 'wal'),
        # Enforce foreign-key constraints.
        ('foreign_keys', 'ON'),
    ]
)


@dataclass
class FullTextSearchResult:
    """A single hit produced by a full-text search."""

    # Primary key of the matching document.
    document_id: int
    # Page identifier of the hit (filled from the index's ``page_number`` column).
    page_index: int
    # Title of the matching document, when available.
    document_title: str = None
    # Text excerpt around the match, when available.
    snippet: str = None

    @property
    def document(self):
        """Return the ``Document`` row identified by ``document_id``."""
        return Document.get_by_id(self.document_id)


class BaseModel(Model):
    """Base class that binds every bookshelf model to the shared database."""

    class Meta:
        database = database
        legacy_table_names = False

    @classmethod
    def create_all(cls):
        """Create all bookshelf tables and views, then stamp the database
        with the application id and schema version."""
        db = cls._meta.database
        all_models = (
            Author,
            Category,
            Format,
            Tag,
            Document,
            Page,
            VwDocumentPage,
            DocumentAuthor,
            DocumentTag,
            DocumentFTSIndex,
        )
        db.create_tables(all_models)
        stamp_statements = (
            f"PRAGMA application_id={BOOKWORM_BOOKSHELF_APP_ID}",
            f"PRAGMA user_version={BOOKWORM_BOOKSHELF_SCHEMA_VERSION}",
        )
        with db:
            cursor = db.connection().cursor()
            for statement in stamp_statements:
                cursor.execute(statement)


class Author(BaseModel):
    """A document author; ``name`` is required and indexed."""

    name = TextField(null=False, index=True)


class Category(BaseModel):
    """A document category; ``name`` is required and unique."""

    name = TextField(null=False, unique=True)


class Format(BaseModel):
    """A document format; ``name`` is required and unique."""

    name = TextField(null=False, unique=True)


class Tag(BaseModel):
    """A document tag; ``name`` is required and unique."""

    name = TextField(null=False, unique=True)


class Document(BaseModel):
    """A document stored in the bookshelf, with its format, category and metadata."""

    id = AutoIncrementField()
    uri = DocumentUriField(null=False, unique=True)
    title = TextField(null=False, index=True)
    publication_date = DateTimeField(null=True, index=True)
    # ``date_added`` defaults to the (naive) UTC time at row creation.
    date_added = DateTimeField(null=False, index=True, default=datetime.utcnow)
    cover_image = ImageField(null=True)
    format = ForeignKeyField(
        model=Format,
        field="id",
        column_name="format_id",
        backref="documents",
    )
    category = ForeignKeyField(
        model=Category,
        field="id",
        column_name="category_id",
        backref="documents",
    )
    # Free-form JSON payload, (de)serialized with ujson.
    metadata = JSONField(null=True, json_dumps=ujson.dumps, json_loads=ujson.loads)


class Page(BaseModel):
    """The text content of one page of a document."""

    number = IntegerField(null=False)
    content = TextField(null=False)
    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="pages",
    )


class DocumentAuthor(BaseModel):
    """Association between a document and one of its authors.

    The (document, author) pair is the composite primary key and is also
    declared as a unique index.
    """

    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="authors",
    )
    author = ForeignKeyField(
        model=Author,
        field="id",
        column_name="author_id",
        backref="documents",
    )

    class Meta:
        indexes = ((("document", "author"), True),)
        primary_key = CompositeKey("document", "author")


class DocumentTag(BaseModel):
    """Association between a document and one of its tags.

    The (document, tag) pair is the composite primary key and is also
    declared as a unique index.
    """

    document = ForeignKeyField(
        model=Document,
        field="id",
        column_name="document_id",
        backref="tags",
    )
    tag = ForeignKeyField(
        model=Tag,
        field="id",
        column_name="tag_id",
        backref="documents",
    )

    class Meta:
        indexes = ((("document", "tag"), True),)
        primary_key = CompositeKey("document", "tag")


class VwDocumentPage(BaseModel):
    """Database view that flattens each page together with its document's id and title."""

    page_id = IntegerField()
    page_number = IntegerField()
    document_id = IntegerField()
    document_title = TextField()
    content = TextField()

    @classmethod
    def view_select_builder(cls):
        """Return the SELECT query that defines this view."""
        view_columns = (
            Page.id.alias("page_id"),
            Page.number.alias("page_number"),
            Document.id.alias("document_id"),
            Document.title.alias("document_title"),
            Page.content.alias("content"),
        )
        page_belongs_to_document = Page.document_id == Document.id
        return Page.select(*view_columns).join(Document, on=page_belongs_to_document)

    class Meta:
        primary_key = False
        schema_manager_class = SqliteViewSchemaManager


class DocumentFTSIndex(BaseModel, FTSModel):
    """Full-text search index over page content, backed by ``VwDocumentPage``."""

    rowid = RowIDField()
    page_number = SearchField(unindexed=True)
    document_id = SearchField(unindexed=True)
    document_title = SearchField(unindexed=True)
    content = SearchField()

    @classmethod
    def add_document_to_search_index(cls, document_id):
        """Build the INSERT ... SELECT query that indexes one document's pages.

        The query is returned, not executed.

        Args:
            document_id: Primary key of the document whose pages are indexed.
        """
        return DocumentFTSIndex.insert_from(
            (
                VwDocumentPage.select(
                    VwDocumentPage.page_id.alias("rowid"),
                    VwDocumentPage.page_number.alias("page_number"),
                    VwDocumentPage.document_id.alias("document_id"),
                    VwDocumentPage.document_title.alias("document_title"),
                    VwDocumentPage.content.alias("content"),
                )
                .join(Document, on=VwDocumentPage.document_id == Document.id)
                .join(Page, on=VwDocumentPage.page_id == Page.id)
                .where(Document.id == document_id)
            ),
            fields=[
                "rowid",
                "page_number",
                "document_id",
                "document_title",
                "content",
            ]
        )

    @classmethod
    def perform_search(cls, column, term):
        """Build the SELECT query matching ``term`` against ``column``.

        Selects page number, document id, document title and a snippet of
        the match, ordered by bm25 rank, then document id, then page number.
        """
        return (
            cls.select(
                cls.page_number,
                cls.document_id,
                cls.document_title,
                column.snippet(left='', right='', over_length='', max_tokens=24),
            )
            .where(column.match(term))
            # One order_by() call with every key: each separate order_by()
            # call replaces the previous ordering instead of extending it.
            .order_by(
                fn.bm25(cls._meta.entity),
                cls.document_id,
                cls.page_number.asc(),
            )
        )

    @classmethod
    def search_for_term(cls, term) -> Iterator[FullTextSearchResult]:
        """Yield a ``FullTextSearchResult`` for every page whose content matches ``term``.

        This is a generator: the query runs when iteration starts.
        """
        # Execute with bound parameters rather than the stringified query so
        # the search term is never interpolated into the SQL text.
        sql, params = cls.perform_search(cls.content, term).sql()
        connection = cls._meta.database.connection()
        with connection:
            cursor = connection.cursor()
            content_matches = cursor.execute(sql, tuple(params))
            for (page_number, document_id, document_title, snippet) in content_matches:
                yield FullTextSearchResult(
                    page_index=page_number,
                    document_id=document_id,
                    document_title=document_title,
                    snippet=snippet,
                )

    class Meta:
        extension_module = "fts5"
        options = {
            'tokenize': 'porter',
            "content": VwDocumentPage,
            "content_rowid": VwDocumentPage.page_id,
        }
Loading

0 comments on commit edf167d

Please sign in to comment.