Skip to content

Commit 7d8ba6f

Browse files
committed
Initial daisy implementation
1 parent 711b891 commit 7d8ba6f

File tree

2 files changed

+104
-13
lines changed

2 files changed

+104
-13
lines changed

bookworm/document/formats/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coding: utf-8
22

3-
from .archive import ArchivedDocument
3+
#from .archive import ArchivedDocument
4+
from .daisy import DaisyDocument
45
from .epub import EpubDocument
56
from .fb2 import FB2Document, FitzFB2Document
67
from .html import FileSystemHtmlDocument, WebHtmlDocument

bookworm/document/formats/daisy.py

Lines changed: 102 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,30 @@
11
"""Daisy 3.0 document format """
2+
from collections import OrderedDict
23
from dataclasses import dataclass
4+
import glob
35
from pathlib import Path
46
from typing import Dict, List
7+
import zipfile
58
from zipfile import ZipFile
69

710
from lxml import etree
811

12+
from bookworm.document.base import SinglePageDocument, SINGLE_PAGE_DOCUMENT_PAGER, TreeStackBuilder
13+
from bookworm.document import BookMetadata, DocumentCapability as DC, Section
14+
from bookworm.logger import logger
15+
from bookworm.structured_text import TextRange
16+
from bookworm.structured_text.structured_html_parser import StructuredHtmlParser
17+
18+
log = logger.getChild(__name__)
19+
920
@dataclass
1021
class DaisyMetadata:
1122
"""metadata of a daisy book"""
1223
title: str
1324
author: str
1425
publisher: str
1526
language: str
16-
path: str
27+
path: Path | zipfile.Path
1728

1829
@dataclass
1930
class DaisyNavPoint:
@@ -22,14 +33,15 @@ class DaisyNavPoint:
2233
content: str
2334
label: str
2435

25-
26-
def _parse_opf(path: Path) -> DaisyMetadata:
27-
entries = list(path.glob("*.opf"))
36+
def _parse_opf(path: Path | zipfile.Path) -> DaisyMetadata:
37+
"""Parses the OPF file of a daisy3 book in order to obtain its book metadata"""
38+
# we have to use path.iterdir() instead of path.glob() because we want to be generic over the type of path this is
39+
# ZipFile.Path() does not support glob
40+
entries = [x for x in list(path.iterdir()) if x.name.endswith('.opf')]
2841
if not entries:
2942
raise FileNotFoundError("Could not find daisy OPF file")
3043
opf = entries[0]
31-
with open(opf, 'rb') as f:
32-
tree = etree.fromstring(f.read())
44+
tree = etree.fromstring(opf.read_bytes())
3345
dc_metadata = tree.find('metadata/dc-metadata', tree.nsmap)
3446
nsmap = dc_metadata.nsmap
3547
# We can now obtain the book's information
@@ -44,16 +56,20 @@ def _parse_opf(path: Path) -> DaisyMetadata:
4456

4557
@dataclass
class DaisyBook:
    """In-memory representation of a daisy3 book."""
    # Book-level metadata.
    metadata: DaisyMetadata
    # Navigation points making up the table of contents.
    toc: List[DaisyNavPoint]
    # Keyed by a navigation point's ``content`` value; the document class
    # splits each value on '#' into a text file name and an element id.
    nav_ref: Dict[str, str]
5063

51-
def _parse_ncx(path: Path) -> List[DaisyNavPoint]:
52-
entries = list(path.glob("*.ncx"))
64+
def _parse_ncx(path: Path | zipfile.Path) -> List[DaisyNavPoint]:
65+
"""
66+
Parses a daisy NCX file in order to extract the book's table of content
67+
"""
68+
entries = [x for x in list(path.iterdir()) if x.name.endswith('.ncx')]
5369
if not entries:
70+
# We return an empty list if no NCX file is found
5471
return []
55-
with open(entries[0], 'rb') as f:
56-
tree = etree.fromstring(f.read())
72+
tree = etree.fromstring(entries[0].read_bytes())
5773
# navPoints are all nested inside the navMap
5874
# We are not interested in the navInfo element, which means that findall() will likely suffice
5975
nav_points = tree.findall('navMap/navPoint', tree.nsmap)
@@ -71,15 +87,21 @@ def parse_point(element) -> DaisyNavPoint:
7187

7288

7389
def read_daisy(path: Path) -> DaisyBook:
90+
"""
91+
Reads a daisy book either from an extracted directory, or from a zipfile
92+
"""
93+
# TODO: Is it ok to just read from the zipfile rather than extracting it and be done with it?
94+
if path.is_file() and zipfile.is_zipfile(path):
95+
zip = ZipFile(path)
96+
path = zipfile.Path(zip)
7497
metadata = _parse_opf(path)
7598
toc = _parse_ncx(path)
7699
tree_cache = {}
77100
nav_ref = {}
78101
def get_smil(file: str):
79102
entry = tree_cache.get(file)
80103
if not entry:
81-
with open(path / file, 'rb') as f:
82-
entry = etree.parse(f)
104+
entry = etree.fromstring((path / file).read_bytes())
83105
tree_cache[file] = entry
84106
return entry
85107
for point in toc:
@@ -94,3 +116,71 @@ def get_smil(file: str):
94116
toc=toc,
95117
nav_ref=nav_ref
96118
)
119+
120+
class DaisyDocument(SinglePageDocument):
    """Daisy book presented as a single-page document.

    The book is loaded with ``read_daisy``. The text files referenced by its
    navigation points are concatenated and handed to ``StructuredHtmlParser``,
    and the table of contents is built from the same navigation points.
    """
    format = "daisy"
    name = _("Daisy")
    extensions = ("*.zip",)
    capabilities = (
        DC.TOC_TREE
        | DC.METADATA
        | DC.SINGLE_PAGE
    )

    def read(self) -> None:
        """Load the book, parse its text and build the table of contents."""
        super().read()
        self._book: DaisyBook = read_daisy(self.get_file_system_path())
        self.structure = StructuredHtmlParser.from_string(self._get_xml())
        # Must run after ``self.structure`` is set: the TOC needs the parsed
        # text length and the id -> range mapping.
        self._toc = self._build_toc()

    def get_content(self) -> str:
        """Return the text extracted by the structured parser."""
        return self.structure.get_text()

    @property
    def toc_tree(self) -> Section:
        """Root section of the table of contents built in ``read``."""
        return self._toc

    @property
    def metadata(self) -> BookMetadata:
        """Title, author and publisher taken from the book's metadata."""
        return BookMetadata(
            title=self._book.metadata.title,
            author=self._book.metadata.author,
            publisher=self._book.metadata.publisher,
        )

    def _get_xml(self) -> str:
        """Concatenate the text files referenced by the navigation points.

        Each referenced file is read once, in the order in which it is first
        referenced by the table of contents. Files that do not exist or cannot
        be read are skipped.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would make the concatenation order arbitrary and could shuffle the
        # book's text from one run to the next.
        text_files: dict[str, None] = dict.fromkeys(
            self._book.nav_ref[point.content].split('#')[0]
            for point in self._book.toc
        )
        content: list[str] = []
        for text_file in text_files:
            try:
                text_path = self._book.metadata.path / text_file
                if text_path.exists():
                    log.debug(f"Reading from {text_file}")
                    html_content = text_path.read_text(encoding='utf-8')
                    content.append(html_content)
            except (KeyError, FileNotFoundError):
                # Best effort: a missing file must not abort loading the book.
                continue
        return '\n'.join(content)

    def _build_toc(self) -> Section:
        """Build the section tree from the book's navigation points.

        The root section spans the whole text. Every navigation point whose
        target id has a known range in the parsed text becomes a level-2
        section; the others are left out.
        """
        root = Section(
            title=self._book.metadata.title,
            pager=SINGLE_PAGE_DOCUMENT_PAGER,
            level=1,
            text_range=TextRange(0, len(self.structure.get_text())),
        )
        stack = TreeStackBuilder(root)
        for entry in self._book.toc:
            reference_parts = self._book.nav_ref[entry.content].split('#')
            if len(reference_parts) < 2:
                # No '#id' part, so there is nothing to look up in the text.
                continue
            item_range = self.structure.html_id_ranges.get(reference_parts[1])
            if item_range:
                section = Section(
                    title=entry.label,
                    pager=SINGLE_PAGE_DOCUMENT_PAGER,
                    level=2,
                    text_range=TextRange(*item_range),
                )
                stack.push(section)
        return root
186+

0 commit comments

Comments
 (0)