-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractor.py
More file actions
179 lines (149 loc) · 5.67 KB
/
extractor.py
File metadata and controls
179 lines (149 loc) · 5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""
KnowledgeDigest -- text extraction from several file formats.

Supported formats:
    - PDF  (via pdfplumber, falling back to PyPDF2)
    - DOCX (via python-docx)
    - TXT  (read directly as UTF-8, with fallback encodings)
    - MD   (read directly, treated as plain text)
    - HTML (via BeautifulSoup)

Usage:
    from KnowledgeDigest.extractor import TextExtractor

    ext = TextExtractor()
    result = ext.extract("/path/to/document.pdf")
    print(result.text[:200])
"""
__all__ = ["TextExtractor", "ExtractedText"]
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Set
@dataclass
class ExtractedText:
    """Result of a single text-extraction run.

    Only ``text`` is required; the remaining fields carry metadata about
    how the text was obtained and fall back to neutral defaults.
    """

    # The extracted text (empty string when nothing could be extracted).
    text: str
    # Language of the text, or None when it has not been determined.
    language: Optional[str] = None
    # Label of the extraction backend / strategy that produced the text.
    method: str = "unknown"
    # Number of pages in the source document; 0 when not applicable or unknown.
    page_count: int = 0
# Supported file extensions (lowercase, including the leading dot),
# grouped by the kind of document they denote.
_PDF_EXTS = {".pdf"}
_DOCX_EXTS = {".docx"}
_TEXT_EXTS = {
    ".txt",
    ".md",
    ".markdown",
    ".rst",
    ".csv",
    ".log",
    ".yaml",
    ".yml",
    ".json",
    ".xml",
}
_HTML_EXTS = {".html", ".htm"}

# Union of every group above.
ALL_SUPPORTED = set().union(_PDF_EXTS, _DOCX_EXTS, _TEXT_EXTS, _HTML_EXTS)
class TextExtractor:
    """Extract plain text from PDF, DOCX, HTML and plain-text files.

    The backend is selected from the file extension. Backend libraries
    (pdfplumber, PyPDF2, python-docx, BeautifulSoup) are imported lazily
    inside the method that needs them, so a missing library only affects
    the corresponding file type.
    """

    def supported_types(self) -> Set[str]:
        """Return the supported file extensions (lowercase, with leading dot).

        A copy is returned so that callers mutating the result cannot alter
        the module-level ``ALL_SUPPORTED`` set.
        """
        return set(ALL_SUPPORTED)

    def can_extract(self, path: str | Path) -> bool:
        """Return True if the extension of *path* is a supported type.

        Only the suffix is inspected (case-insensitively); the file itself
        is not accessed.
        """
        return Path(path).suffix.lower() in ALL_SUPPORTED

    def extract(self, path: str | Path) -> ExtractedText:
        """Extract the text of the file at *path*.

        Args:
            path: Location of the file to read.

        Returns:
            An ExtractedText holding the text and metadata. Backend failures
            are reported through an empty ``text`` and a ``method`` label
            describing the failure rather than through an exception.

        Raises:
            FileNotFoundError: *path* does not exist.
            ValueError: The file extension is not supported.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Datei nicht gefunden: {path}")

        ext = path.suffix.lower()
        if ext in _PDF_EXTS:
            return self._extract_pdf(path)
        if ext in _DOCX_EXTS:
            return self._extract_docx(path)
        if ext in _HTML_EXTS:
            return self._extract_html(path)
        if ext in _TEXT_EXTS:
            return self._extract_text(path)
        raise ValueError(
            f"Dateityp '{ext}' nicht unterstuetzt. "
            f"Unterstuetzt: {sorted(ALL_SUPPORTED)}"
        )

    def _extract_pdf(self, path: Path) -> ExtractedText:
        """Extract PDF text with pdfplumber, falling back to PyPDF2.

        Each backend is tried in order. A backend that raises (including an
        ImportError because the library is missing) or that yields no text
        is skipped and the next one is tried. If no backend produces text,
        an empty result labelled ``pdf_failed`` is returned.
        """
        backends = (
            ("pdfplumber", self._pdf_pages_pdfplumber),
            ("pypdf2", self._pdf_pages_pypdf2),
        )
        for label, read_pages in backends:
            try:
                page_count, pages = read_pages(path)
            except Exception:
                # Deliberate best-effort: any backend failure simply moves
                # on to the next backend instead of aborting extraction.
                continue
            if pages:
                return ExtractedText(
                    text="\n\n".join(pages),
                    method=label,
                    page_count=page_count,
                )
        return ExtractedText(text="", method="pdf_failed", page_count=0)

    @staticmethod
    def _pdf_pages_pdfplumber(path: Path) -> tuple[int, list[str]]:
        """Return (page count, non-empty page texts) read via pdfplumber."""
        import pdfplumber

        with pdfplumber.open(str(path)) as pdf:
            page_count = len(pdf.pages)
            texts = [page.extract_text() for page in pdf.pages]
        return page_count, [text for text in texts if text]

    @staticmethod
    def _pdf_pages_pypdf2(path: Path) -> tuple[int, list[str]]:
        """Return (page count, non-empty page texts) read via PyPDF2."""
        from PyPDF2 import PdfReader

        reader = PdfReader(str(path))
        page_count = len(reader.pages)
        texts = [page.extract_text() for page in reader.pages]
        return page_count, [text for text in texts if text]

    def _extract_docx(self, path: Path) -> ExtractedText:
        """Extract DOCX text via python-docx.

        Entries of ``doc.paragraphs`` that contain only whitespace are
        dropped; the rest are joined with blank lines. Any failure yields
        an empty result whose ``method`` carries the error message.
        """
        try:
            from docx import Document

            doc = Document(str(path))
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            return ExtractedText(
                text="\n\n".join(paragraphs),
                method="python-docx",
                page_count=0,
            )
        except Exception as e:
            return ExtractedText(text="", method=f"docx_failed: {e}")

    def _extract_html(self, path: Path) -> ExtractedText:
        """Extract the visible text of an HTML file via BeautifulSoup.

        Returns a result labelled ``html_read_failed`` when the file could
        not be decoded, and one labelled ``html_failed: <error>`` when
        parsing fails (for example because BeautifulSoup is not installed).
        """
        raw = self._read_file(path)
        # Compare against None: an empty file was read successfully and
        # must not be reported as a read failure.
        if raw is None:
            return ExtractedText(text="", method="html_read_failed")
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(raw, 'html.parser')
            # Remove script/style and page-chrome elements before
            # collecting the text.
            for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
                tag.decompose()
            text = soup.get_text(separator='\n', strip=True)
            return ExtractedText(text=text, method="beautifulsoup")
        except Exception as e:
            return ExtractedText(text="", method=f"html_failed: {e}")

    def _extract_text(self, path: Path) -> ExtractedText:
        """Read a plain-text file (TXT, MD, YAML, ...) unchanged.

        The ``method`` label is ``plaintext_<extension>`` (extension without
        the dot), or ``text_read_failed`` when the file could not be decoded.
        """
        text = self._read_file(path)
        if text is None:
            return ExtractedText(text="", method="text_read_failed")
        ext = path.suffix.lower()
        method = f"plaintext_{ext.lstrip('.')}"
        return ExtractedText(text=text, method=method)

    @staticmethod
    def _read_file(path: Path) -> Optional[str]:
        """Read *path* as text, trying UTF-8, then cp1252, then latin-1.

        ``utf-8-sig`` is used for the UTF-8 attempt so that a leading byte
        order mark is stripped instead of ending up in the returned text;
        files without a BOM decode exactly as with plain UTF-8.

        Returns:
            The decoded text, or None if every encoding failed. Because
            latin-1 maps every byte value, None is not expected in practice.
        """
        for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
            try:
                return path.read_text(encoding=encoding)
            except (UnicodeDecodeError, UnicodeError):
                continue
        return None