11"""Daisy 3.0 document format """
2+ from collections import OrderedDict
23from dataclasses import dataclass
4+ import glob
35from pathlib import Path
46from typing import Dict , List
7+ import zipfile
58from zipfile import ZipFile
69
710from lxml import etree
811
12+ from bookworm .document .base import SinglePageDocument , SINGLE_PAGE_DOCUMENT_PAGER , TreeStackBuilder
13+ from bookworm .document import BookMetadata , DocumentCapability as DC , Section
14+ from bookworm .logger import logger
15+ from bookworm .structured_text import TextRange
16+ from bookworm .structured_text .structured_html_parser import StructuredHtmlParser
17+
18+ log = logger .getChild (__name__ )
19+
920@dataclass
1021class DaisyMetadata :
1122 """metadata of a daisy book"""
1223 title : str
1324 author : str
1425 publisher : str
1526 language : str
16- path : str
27+ path : Path | zipfile . Path
1728
1829@dataclass
1930class DaisyNavPoint :
@@ -22,14 +33,15 @@ class DaisyNavPoint:
2233 content : str
2334 label : str
2435
25-
26- def _parse_opf (path : Path ) -> DaisyMetadata :
27- entries = list (path .glob ("*.opf" ))
36+ def _parse_opf (path : Path | zipfile .Path ) -> DaisyMetadata :
37+ """Parses the OPF file of a daisy3 book in order to obtain its book metadata"""
38+ # we have to use path.iterdir() instead of path.glob() because we want to be generic over the type of path this is
39+ # ZipFile.Path() does not support glob
40+ entries = [x for x in list (path .iterdir ()) if x .name .endswith ('.opf' )]
2841 if not entries :
2942 raise FileNotFoundError ("Could not find daisy OPF file" )
3043 opf = entries [0 ]
31- with open (opf , 'rb' ) as f :
32- tree = etree .fromstring (f .read ())
44+ tree = etree .fromstring (opf .read_bytes ())
3345 dc_metadata = tree .find ('metadata/dc-metadata' , tree .nsmap )
3446 nsmap = dc_metadata .nsmap
3547 # We can now obtain the book's information
@@ -44,16 +56,20 @@ def _parse_opf(path: Path) -> DaisyMetadata:
4456
@dataclass
class DaisyBook:
    """A daisy3 book representation.

    Bundles everything ``read_daisy`` extracts from a book.
    """

    # Book-level metadata parsed from the OPF file.
    metadata: DaisyMetadata
    # Navigation points parsed from the NCX file; empty when the book has no
    # NCX file.
    toc: List[DaisyNavPoint]
    # Maps a navigation point's ``content`` value to the reference it resolves
    # to. Consumers split that reference on '#' into a content file name and
    # an element id inside that file.
    nav_ref: Dict[str, str]
5063
51- def _parse_ncx (path : Path ) -> List [DaisyNavPoint ]:
52- entries = list (path .glob ("*.ncx" ))
64+ def _parse_ncx (path : Path | zipfile .Path ) -> List [DaisyNavPoint ]:
65+ """
66+ Parses a daisy NCX file in order to extract the book's table of content
67+ """
68+ entries = [x for x in list (path .iterdir ()) if x .name .endswith ('.ncx' )]
5369 if not entries :
70+ # We return an empty list if no NCX file is found
5471 return []
55- with open (entries [0 ], 'rb' ) as f :
56- tree = etree .fromstring (f .read ())
72+ tree = etree .fromstring (entries [0 ].read_bytes ())
5773 # navPoints are all nested inside the navMap
5874 # We are not interested in the navInfo element, which means that findall() will likely suffice
5975 nav_points = tree .findall ('navMap/navPoint' , tree .nsmap )
@@ -71,15 +87,21 @@ def parse_point(element) -> DaisyNavPoint:
7187
7288
7389def read_daisy (path : Path ) -> DaisyBook :
90+ """
91+ Reads a daisy book either from an extracted directory, or from a zipfile
92+ """
93+ # TODO: Is it ok to just read from the zipfile rather than extracting it and be done with it?
94+ if path .is_file () and zipfile .is_zipfile (path ):
95+ zip = ZipFile (path )
96+ path = zipfile .Path (zip )
7497 metadata = _parse_opf (path )
7598 toc = _parse_ncx (path )
7699 tree_cache = {}
77100 nav_ref = {}
78101 def get_smil (file : str ):
79102 entry = tree_cache .get (file )
80103 if not entry :
81- with open (path / file , 'rb' ) as f :
82- entry = etree .parse (f )
104+ entry = etree .fromstring ((path / file ).read_bytes ())
83105 tree_cache [file ] = entry
84106 return entry
85107 for point in toc :
@@ -94,3 +116,71 @@ def get_smil(file: str):
94116 toc = toc ,
95117 nav_ref = nav_ref
96118 )
119+
class DaisyDocument(SinglePageDocument):
    """Daisy document.

    Loads a daisy3 book through ``read_daisy``, concatenates the content
    files referenced by its table of contents into one markup string, and
    exposes the parsed text, the table of contents and the book metadata.
    Only ``*.zip`` files are registered as an extension for this format.
    """

    format = "daisy"
    name = _("Daisy")
    extensions = ("*.zip",)
    capabilities = DC.TOC_TREE | DC.METADATA | DC.SINGLE_PAGE

    def read(self) -> None:
        """Load the book, parse its text, then build the table of contents.

        The order matters: the TOC is built from the id ranges computed while
        parsing the text.
        """
        super().read()
        self._book: DaisyBook = read_daisy(self.get_file_system_path())
        self.structure = StructuredHtmlParser.from_string(self._get_xml())
        self._toc = self._build_toc()

    def get_content(self) -> str:
        """Return the plain text of the whole book."""
        return self.structure.get_text()

    @property
    def toc_tree(self) -> Section:
        """Root section of the table of contents built by ``read``."""
        return self._toc

    @property
    def metadata(self) -> BookMetadata:
        """Title, author and publisher of the book as a ``BookMetadata``."""
        book_info = self._book.metadata
        return BookMetadata(
            title=book_info.title,
            author=book_info.author,
            publisher=book_info.publisher,
        )

    def _split_ref(self, entry) -> tuple[str, str]:
        """Split the reference of a TOC entry into (file name, element id).

        The element id is an empty string when the reference carries no
        '#' fragment. A TOC entry missing from ``nav_ref`` raises ``KeyError``.
        """
        file_name, _sep, fragment = self._book.nav_ref[entry.content].partition("#")
        return file_name, fragment

    def _get_xml(self) -> str:
        """Return the concatenated markup of every content file in the TOC.

        Each file is read once, in the order the TOC first mentions it, so
        the resulting text follows the reading order and is identical from
        one run to the next.
        """
        # dict.fromkeys de-duplicates while preserving insertion order; a set
        # would yield the files in arbitrary order.
        ordered_files = dict.fromkeys(
            self._split_ref(entry)[0] for entry in self._book.toc
        )
        content: list[str] = []
        for text_file in ordered_files:
            try:
                text_path = self._book.metadata.path / text_file
                if text_path.exists():
                    log.debug(f"Reading from { text_file } ")
                    content.append(text_path.read_text(encoding="utf-8"))
            except (KeyError, FileNotFoundError):
                # A content file that cannot be opened is skipped rather than
                # aborting the whole load.
                continue
        return "\n ".join(content)

    def _build_toc(self) -> Section:
        """Build the table of contents from the book's navigation points.

        The root section spans the whole text. Every navigation point whose
        element id has a known text range becomes a level 2 section; entries
        without an element id, or whose id was not found in the parsed text,
        are left out.
        """
        root = Section(
            title=self._book.metadata.title,
            pager=SINGLE_PAGE_DOCUMENT_PAGER,
            level=1,
            text_range=TextRange(0, len(self.structure.get_text())),
        )
        # NOTE(review): TreeStackBuilder presumably attaches each pushed
        # section under the right parent based on its level -- confirm.
        stack = TreeStackBuilder(root)
        id_ranges = self.structure.html_id_ranges
        for entry in self._book.toc:
            _file_name, item_ref = self._split_ref(entry)
            if not item_ref:
                # The reference has no '#' fragment, so there is no id to look up.
                continue
            item_range = id_ranges.get(item_ref)
            if not item_range:
                continue
            stack.push(
                Section(
                    title=entry.label,
                    pager=SINGLE_PAGE_DOCUMENT_PAGER,
                    level=2,
                    text_range=TextRange(*item_range),
                )
            )
        return root
186+
0 commit comments