-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRecursive_Chunking.py
More file actions
74 lines (56 loc) · 2.19 KB
/
Recursive_Chunking.py
File metadata and controls
74 lines (56 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Recursive Chunking - The Recommended Default
Splits intelligently: Paragraphs → Lines → Sentences → Words → Characters
"""
from pypdf import PdfReader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text of every page concatenated in page order, with a newline
        appended after each page. A document with no pages yields ``""``.
    """
    reader = PdfReader(pdf_path)
    # A single join replaces the repeated ``+=`` so the cost stays linear in
    # the number of pages. ``or ""`` guards the concatenation in case a
    # page's extraction yields None instead of a string, which would
    # otherwise raise TypeError and abort the whole document.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
def main():
    """Chunk each configured PDF with a recursive splitter and print a summary.

    For every path in the hard-coded list that exists on disk, the text is
    extracted with ``extract_text_from_pdf``, split with
    ``RecursiveCharacterTextSplitter``, and the character count, chunk count
    and a short single-line preview of the first three chunks are printed.
    Paths that do not exist are reported and skipped.
    """
    # PDF files
    pdfs = [
        "/Volumes/vibecoding/RAG-Complete Cook Book/ITC-August-Q1-2526.pdf",
        "/Volumes/vibecoding/RAG-Complete Cook Book/ITC-October-Q2-2526.pdf",
    ]

    # Configuration
    CHUNK_SIZE = 500
    CHUNK_OVERLAP = 75  # 15% of 500
    PREVIEW_CHARS = 150  # Maximum number of characters echoed per chunk

    # Initialize Recursive Splitter
    # It will try separators in this order: ["\n\n", "\n", ".", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ".", " ", ""],  # Paragraph → Line → Sentence → Word
        length_function=len,
    )

    print(f"📄 Recursive Chunking (Size: {CHUNK_SIZE}, Overlap: {CHUNK_OVERLAP})")
    print(" Strategy: Paragraph → Line → Sentence → Word")
    print("=" * 70)

    for pdf_path in pdfs:
        pdf_file = Path(pdf_path)
        if not pdf_file.exists():
            print(f"❌ File not found: {pdf_file.name}\n")
            continue

        print(f"\n📂 Processing: {pdf_file.name}")

        # Extract text
        text = extract_text_from_pdf(pdf_path)

        # Create chunks using Recursive splitter
        chunks = text_splitter.split_text(text)
        print(f" Total characters: {len(text):,}")
        print(f" Total chunks: {len(chunks)}")

        # Show first 3 chunks
        print("\n First 3 chunks:")
        for i, chunk in enumerate(chunks[:3], 1):
            print(f"\n --- Chunk {i} (Length: {len(chunk)}) ---")
            # Flatten newlines so the preview stays on one line, then cap it.
            # The full chunk length is already reported in the header above,
            # so it is not repeated after the preview text.
            preview = chunk.replace('\n', ' ')
            if len(preview) > PREVIEW_CHARS:
                preview = preview[:PREVIEW_CHARS] + "..."
            print(f" {preview}")

        print("\n" + "-" * 70)
# Run the chunking demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()