This repository was archived by the owner on Dec 16, 2024. It is now read-only.
File tree Expand file tree Collapse file tree 2 files changed +45
-0
lines changed Expand file tree Collapse file tree 2 files changed +45
-0
lines changed Original file line number Diff line number Diff line change 5
5
from gpt_index .readers .base import BaseReader
6
6
from gpt_index .readers .file .base_parser import BaseParser
7
7
from gpt_index .readers .file .docs_parser import DocxParser , PDFParser
8
+ from gpt_index .readers .file .epub_parser import EpubParser
8
9
from gpt_index .readers .file .image_parser import ImageParser
9
10
from gpt_index .readers .file .markdown_parser import MarkdownParser
10
11
from gpt_index .readers .file .slides_parser import PptxParser
22
23
".mp3" : VideoAudioParser (),
23
24
".mp4" : VideoAudioParser (),
24
25
".csv" : CSVParser (),
26
+ ".epub" : EpubParser (),
25
27
".md" : MarkdownParser (),
26
28
}
27
29
Original file line number Diff line number Diff line change
1
+ """Epub parser.
2
+
3
+ Contains parsers for epub files.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from typing import Dict
8
+
9
+ from gpt_index .readers .file .base_parser import BaseParser
10
+
11
+
12
class EpubParser(BaseParser):
    """Epub parser.

    Converts the document items of an Epub book into text by running each
    item's HTML content through ``html2text``.
    """

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            An empty dict; this parser keeps no configuration.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse an Epub file into a single text string.

        Args:
            file: Path of the Epub file to read.
            errors: Error-handling scheme handed to ``bytes.decode`` when
                the content of each document item is decoded as UTF-8.

        Returns:
            The ``html2text`` rendering of every document item, in the
            order ``book.get_items()`` yields them, joined by newlines.

        Raises:
            ValueError: If ``EbookLib`` or ``html2text`` cannot be imported.
        """
        # Both dependencies are optional, so they are imported lazily and a
        # missing one is reported with an explicit message. The original
        # ImportError is chained so the root cause stays in the traceback.
        try:
            import ebooklib
            from ebooklib import epub
        except ImportError as err:
            raise ValueError("`EbookLib` is required to read Epub files.") from err
        try:
            import html2text
        except ImportError as err:
            raise ValueError("`html2text` is required to parse Epub files.") from err

        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Chapters are typically located in epub documents items, so every
        # other item type is skipped. Decoding honours the caller's `errors`
        # scheme instead of always failing on malformed bytes.
        chapters = [
            html2text.html2text(item.get_content().decode("utf-8", errors=errors))
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]

        return "\n".join(chapters)
You can’t perform that action at this time.
0 commit comments