Skip to content
This repository was archived by the owner on Dec 16, 2024. It is now read-only.

Commit 500e40d

Browse files
haowjy and Jerry Liu authored
Add epub file parser (run-llama#355)
Co-authored-by: Jerry Liu <jerry@robustintelligence.com>
1 parent 1fa174f commit 500e40d

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

gpt_index/readers/file/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from gpt_index.readers.base import BaseReader
66
from gpt_index.readers.file.base_parser import BaseParser
77
from gpt_index.readers.file.docs_parser import DocxParser, PDFParser
8+
from gpt_index.readers.file.epub_parser import EpubParser
89
from gpt_index.readers.file.image_parser import ImageParser
910
from gpt_index.readers.file.markdown_parser import MarkdownParser
1011
from gpt_index.readers.file.slides_parser import PptxParser
@@ -22,6 +23,7 @@
2223
".mp3": VideoAudioParser(),
2324
".mp4": VideoAudioParser(),
2425
".csv": CSVParser(),
26+
".epub": EpubParser(),
2527
".md": MarkdownParser(),
2628
}
2729

gpt_index/readers/file/epub_parser.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Epub parser.
2+
3+
Contains parsers for epub files.
4+
"""
5+
6+
from pathlib import Path
7+
from typing import Dict
8+
9+
from gpt_index.readers.file.base_parser import BaseParser
10+
11+
12+
class EpubParser(BaseParser):
    """Epub Parser.

    Reads an epub book with `EbookLib` and converts the HTML of each
    document item to text with `html2text`.
    """

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            Dict: An empty config; nothing is prepared ahead of parsing.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse file.

        Args:
            file (Path): Path to the epub file.
            errors (str): Error handler used when decoding each document
                item's bytes as UTF-8 (e.g. "ignore", "replace", "strict").

        Returns:
            str: The converted text of every document item, joined by
                newlines. Empty string if the book has no document items.

        Raises:
            ValueError: If `EbookLib` or `html2text` is not installed.
        """
        # Optional dependencies are imported lazily so that the module can
        # be imported without them installed.
        try:
            import ebooklib
            from ebooklib import epub
        except ImportError:
            raise ValueError("`EbookLib` is required to read Epub files.")
        try:
            import html2text
        except ImportError:
            raise ValueError("`html2text` is required to parse Epub files.")

        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Chapters are typically located in epub document items; other item
        # types are skipped.
        # The caller's `errors` policy is honoured when decoding so that a
        # stray invalid byte does not abort the whole book under the default
        # "ignore" setting.
        text_list = [
            html2text.html2text(
                item.get_content().decode("utf-8", errors=errors)
            )
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]

        return "\n".join(text_list)

0 commit comments

Comments
 (0)