Initial daisy implementation

pauliyobo · pauliyobo · commit 7d8ba6fab613 · 2025-06-25T01:47:29.000+02:00
diff --git a/bookworm/document/formats/__init__.py b/bookworm/document/formats/__init__.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 
-from .archive import ArchivedDocument
+#from .archive import ArchivedDocument
+from .daisy import DaisyDocument
 from .epub import EpubDocument
 from .fb2 import FB2Document, FitzFB2Document
 from .html import FileSystemHtmlDocument, WebHtmlDocument
diff --git a/bookworm/document/formats/daisy.py b/bookworm/document/formats/daisy.py
@@ -1,19 +1,30 @@
 """Daisy 3.0  document format """
+from collections import OrderedDict
 from dataclasses import dataclass
+import glob
 from pathlib import Path
 from typing import Dict, List
+import zipfile
 from zipfile import ZipFile
 
 from lxml import etree
 
+from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder
+from bookworm.document import BookMetadata, DocumentCapability as DC, Section
+from bookworm.logger import logger
+from bookworm.structured_text import TextRange
+from bookworm.structured_text.structured_html_parser import StructuredHtmlParser
+
+log = logger.getChild(__name__)
+
 @dataclass
 class DaisyMetadata:
     """metadata of a daisy book"""
     title: str
     author: str
     publisher: str
     language: str
-    path: str
+    path: Path | zipfile.Path
 
 @dataclass
 class DaisyNavPoint:
@@ -22,14 +33,15 @@ class DaisyNavPoint:
     content: str
     label: str
 
-
-def _parse_opf(path: Path) -> DaisyMetadata:
-    entries = list(path.glob("*.opf"))
+def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata:
+    """Parses the OPF file of a daisy3 book in order to obtain its book metadata"""
+    # we have to use path.iterdir() instead of path.glob() because we want to be generic over the type of path this is
+    # ZipFile.Path() does not support glob
+    entries = [x for x in list(path.iterdir()) if x.name.endswith('.opf')]
     if not entries:
         raise FileNotFoundError("Could not find daisy OPF file")
     opf = entries[0]
-    with open(opf, 'rb') as f:
-        tree = etree.fromstring(f.read())
+    tree = etree.fromstring(opf.read_bytes())
     dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap)
     nsmap = dc_metadata.nsmap
     # We can now obtain the book's information
@@ -44,16 +56,20 @@ def _parse_opf(path: Path) -> DaisyMetadata:
 
 @dataclass
 class DaisyBook:
+    """A daisy3 book representation"""
     metadata: DaisyMetadata
     toc: List[DaisyNavPoint]
     nav_ref: Dict[str, str]
 
-def _parse_ncx(path: Path) -> List[DaisyNavPoint]:
-    entries = list(path.glob("*.ncx"))
+def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]:
+    """
+    Parses a daisy NCX file in order to extract the book's table of content
+    """
+    entries = [x for x in list(path.iterdir()) if x.name.endswith('.ncx')]
     if not entries:
+        # We return an empty list if no NCX file is found
         return []
-    with open(entries[0], 'rb') as f:
-        tree = etree.fromstring(f.read())
+    tree = etree.fromstring(entries[0].read_bytes())
     # navPoints are all nested inside the navMap
     # We are not interested in the navInfo element, which means that findall() will likely suffice
     nav_points = tree.findall('navMap/navPoint', tree.nsmap)
@@ -71,15 +87,21 @@ def parse_point(element) -> DaisyNavPoint:
 
 
 def read_daisy(path: Path) -> DaisyBook:
+    """
+    Reads a daisy book either from an extracted directory, or from a zipfile
+    """
+    # TODO: Is it ok to just read from the zipfile rather than extracting it and be done with it?
+    if path.is_file() and zipfile.is_zipfile(path):
+            zip = ZipFile(path)
+            path = zipfile.Path(zip)
     metadata = _parse_opf(path)
     toc = _parse_ncx(path)
     tree_cache = {}
     nav_ref = {}
     def get_smil(file: str):
         entry = tree_cache.get(file)
         if not entry:
-            with open(path / file, 'rb') as f:
-                entry = etree.parse(f)
+            entry = etree.fromstring((path / file).read_bytes())
             tree_cache[file] = entry
         return entry
     for point in toc:
@@ -94,3 +116,71 @@ def get_smil(file: str):
         toc=toc,
         nav_ref=nav_ref
     )
+
+class DaisyDocument(SinglePageDocument):
+    """Daisy document"""
+    format = "daisy"
+    name = _("Daisy")
+    extensions = ("*.zip",)
+    capabilities = (
+        DC.TOC_TREE
+        | DC.METADATA
+        | DC.SINGLE_PAGE
+    )
+
+    def read(self) -> None:
+        super().read()
+        self._book: DaisyBook = read_daisy(self.get_file_system_path())
+        self.structure = StructuredHtmlParser.from_string(self._get_xml())
+        self._toc = self._build_toc()
+
+    def get_content(self) -> str:
+        return self.structure.get_text()
+
+    @property
+    def toc_tree(self) -> Section:
+        return self._toc
+
+    @property
+    def metadata(self) -> BookMetadata:
+        return BookMetadata(
+            title=self._book.metadata.title,
+            author=self._book.metadata.author,
+            publisher=self._book.metadata.publisher,
+        )
+
+    def _get_xml(self) -> str:        
+        fragments: set[str] = {self._book.nav_ref[x.content].split('#')[0] for x in self._book.toc}
+        content: list[str] = []
+        for text_file in fragments:
+            try:
+                text_path = self._book.metadata.path / text_file
+                if text_path.exists():
+                    log.debug(f"Reading from {text_file}")
+                    html_content = text_path.read_text(encoding='utf-8')
+                    content.append(html_content)
+            except (KeyError, FileNotFoundError):
+                continue
+        return '\n'.join(content)
+
+    def _build_toc(self) -> Section:
+        root = Section(
+            title=self._book.metadata.title,
+            pager = SINGLE_PAGE_DOCUMENT_PAGER,
+            level=1,
+            text_range=TextRange(0, len(self.structure.get_text())),
+        )
+        stack = TreeStackBuilder(root)
+        for entry in self._book.toc:
+            item_ref = self._book.nav_ref[entry.content].split('#')[1]
+            item_range = self.structure.html_id_ranges.get(item_ref)
+            if item_range:
+                s = Section(
+                    title=entry.label,
+                    pager = SINGLE_PAGE_DOCUMENT_PAGER,
+                    level = 2,
+                    text_range=TextRange(*item_range)
+                )
+                stack.push(s)
+        return root
+