Skip to content

Commit 0b90a3c

Browse files
authored
feat(mcp): Add section-based document handling to prevent token limit issues (#18)
How to use: Simply ask Roo Code/Cline to use the Table of Contents and Section Access tools when you ask your query. Example query in Cline: "Go through the files in the MCP server and tell me how to (do XYZ), and use Table of Contents and Section Access to answer." New features: section-based document navigation; document structure caching for improved performance; table of contents generation; section-specific content retrieval; URL-friendly section ID generation. Benefits: prevents token limit issues by allowing partial document loading; improves navigation of large documents; reduces memory usage by loading only requested sections; maintains document structure for better content organization.
1 parent a87586a commit 0b90a3c

File tree

8 files changed

+19531
-31
lines changed

8 files changed

+19531
-31
lines changed

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 0 additions & 30 deletions
This file was deleted.
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import re
2+
from dataclasses import dataclass
3+
from typing import List, Dict, Optional, Tuple
4+
5+
@dataclass
class Section:
    """Represents a section in a markdown document."""
    # Header depth: the number of leading '#' characters (1-6), or 0 for the
    # synthetic section that wraps a document containing no headers.
    level: int
    # Header text with surrounding whitespace removed.
    title: str
    # Text between this header and the next header of ANY level, stripped.
    # Text belonging to subsections is therefore not included here.
    content: str
    # Offset in the source text where the header line starts.
    start_pos: int
    # Offset where this section's own content stops (start of the next header,
    # or the end of the document).
    end_pos: int
    # Sections nested directly beneath this one.
    subsections: List['Section']
    # Key under which DocumentStructure registered this section in its `toc`.
    # Empty for sections that were never registered by a parser.
    section_id: str = ""


class DocumentStructure:
    """Manages markdown document structure and section access.

    After ``parse_document`` runs, ``sections`` holds the top-level sections
    (each with nested ``subsections``) and ``toc`` maps every section's unique
    ID to its ``Section``. Text that appears before the first header is not
    assigned to any section.
    """

    # Matches, line by line, either an ATX header ("## Title") or a code-fence
    # line ("```python", "~~~"). Scanning both with one pattern keeps them in
    # document order so fence state can be tracked while collecting headers.
    # [ \t]+ (not \s+) keeps an empty "# " line from swallowing the next line.
    _LINE_PATTERN = re.compile(
        r'^(?:(?P<hashes>#{1,6})[ \t]+(?P<title>.+)'
        r'|[ ]{0,3}(?P<fence>`{3,}|~{3,})(?P<info>.*))$',
        re.MULTILINE,
    )

    def __init__(self):
        self.sections: List[Section] = []
        self.toc: Dict[str, Section] = {}

    def parse_document(self, content: str) -> None:
        """Parse markdown content into sections.

        Replaces any previously parsed state. Lines starting with '#' inside
        fenced code blocks are treated as code, not headers. A document with
        no headers becomes a single level-0 section titled "Document".

        Args:
            content: Full markdown text of the document.
        """
        self.sections = []
        self.toc = {}

        headers = self._find_headers(content)

        if not headers:
            # No headers: expose the whole document as one addressable section.
            whole_document = Section(
                level=0,
                title="Document",
                content=content,
                start_pos=0,
                end_pos=len(content),
                subsections=[],
            )
            self._register(whole_document)
            self.sections.append(whole_document)
            return

        # Stack of the currently open ancestors, outermost first.
        ancestors: List[Section] = []
        for i, (hashes, title, start, header_end) in enumerate(headers):
            level = len(hashes)

            # Content runs from the end of this header line to the start of
            # the next header (or the end of the document).
            content_end = headers[i + 1][2] if i + 1 < len(headers) else len(content)

            section = Section(
                level=level,
                title=title.strip(),
                content=content[header_end:content_end].strip(),
                start_pos=start,
                end_pos=content_end,
                subsections=[],
            )
            self._register(section)

            # Close every open section that is not shallower than this one;
            # whatever remains on top is the parent.
            while ancestors and ancestors[-1].level >= level:
                ancestors.pop()

            if ancestors:
                ancestors[-1].subsections.append(section)
            else:
                self.sections.append(section)

            ancestors.append(section)

    def get_section_by_id(self, section_id: str) -> Optional[Section]:
        """Get a section by its ID, or None when no section has that ID."""
        return self.toc.get(section_id)

    def get_table_of_contents(self) -> List[Tuple[int, str, str]]:
        """Get table of contents as [(level, title, section_id)].

        Entries are in document order. Each title is prefixed with one space
        per nesting depth below a top-level section. The section_id is the key
        accepted by ``get_section_by_id``.
        """
        toc_entries: List[Tuple[int, str, str]] = []

        def add_section(section: Section, prefix: str = "") -> None:
            # Prefer the ID assigned at parse time (unique); fall back to the
            # title-derived ID for sections that were not registered.
            section_id = section.section_id or self._make_section_id(section.title)
            toc_entries.append((section.level, prefix + section.title, section_id))
            for subsection in section.subsections:
                add_section(subsection, prefix + " ")

        for section in self.sections:
            add_section(section)

        return toc_entries

    def _find_headers(self, content: str) -> List[Tuple[str, str, int, int]]:
        """Return (hashes, title, start, end) for each header outside code fences."""
        headers: List[Tuple[str, str, int, int]] = []
        open_fence: Optional[Tuple[str, int]] = None  # (fence char, run length)

        for match in self._LINE_PATTERN.finditer(content):
            marker = match.group('fence')
            if marker is None:
                # Header line: only counts when we are not inside a fence.
                if open_fence is None:
                    headers.append((
                        match.group('hashes'),
                        match.group('title'),
                        match.start(),
                        match.end(),
                    ))
                continue

            info = match.group('info')
            if open_fence is None:
                # A backtick fence whose info string contains a backtick is
                # inline code at the start of a line, not a fence opener.
                if not (marker[0] == '`' and '`' in info):
                    open_fence = (marker[0], len(marker))
            elif (marker[0] == open_fence[0]
                  and len(marker) >= open_fence[1]
                  and not info.strip()):
                # A closing fence uses the same character, is at least as long
                # as the opener, and carries no info string.
                open_fence = None

        return headers

    def _register(self, section: Section) -> None:
        """Assign ``section`` a unique ID and record it in the table of contents."""
        # Titles with no ASCII letters or digits reduce to "", so use a fallback.
        base_id = self._make_section_id(section.title) or "section"
        candidate = base_id
        suffix = 0
        # Repeated titles get "-1", "-2", ... so earlier sections stay reachable.
        while candidate in self.toc:
            suffix += 1
            candidate = f"{base_id}-{suffix}"
        section.section_id = candidate
        self.toc[candidate] = section

    def _make_section_id(self, title: str) -> str:
        """Generate a URL-friendly section ID from title."""
        # Convert to lowercase and replace spaces with hyphens
        section_id = title.lower().replace(" ", "-")
        # Remove any non-alphanumeric characters (except hyphens)
        section_id = re.sub(r'[^a-z0-9-]', '', section_id)
        # Remove multiple consecutive hyphens
        section_id = re.sub(r'-+', '-', section_id)
        return section_id.strip('-')

fast-markdown-mcp/src/fast_markdown_mcp/server.py

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,67 @@
1313

1414
logger = logging.getLogger(__name__)
1515

16+
from .document_structure import DocumentStructure
17+
1618
class MarkdownStore:
1719
"""Manages markdown content and metadata."""
1820

1921
def __init__(self, storage_path: str):
2022
self.base_path = Path(storage_path)
2123
self.content_cache = {}
2224
self.metadata_cache = {}
25+
self.structure_cache = {} # Cache for parsed document structures
2326

2427
async def get_content(self, file_id: str) -> str:
    """Get markdown content and refresh the cached document structure.

    Args:
        file_id: ID of the file (without the .md extension), relative to
            the store's base path.

    Returns:
        The file's text, or a message starting with "Error reading content:"
        when the file cannot be read or parsed. This method does not raise.
    """
    try:
        # file_id arrives from tool arguments, so refuse anything that could
        # address a file outside the storage directory.
        relative_path = Path(f"{file_id}.md")
        if relative_path.anchor or ".." in relative_path.parts:
            raise ValueError(f"Invalid file_id: {file_id!r}")

        content = (self.base_path / relative_path).read_text(encoding='utf-8')

        # Re-parse on every successful read so the cached structure always
        # matches the content just returned, even if the file changed on disk
        # since the previous read.
        structure = DocumentStructure()
        structure.parse_document(content)
        self.structure_cache[file_id] = structure
        return content
    except Exception as e:
        logger.error(f"Error reading content for {file_id}: {e}")
        return f"Error reading content: {str(e)}"
41+
42+
async def get_section(self, file_id: str, section_id: str) -> str:
    """Get a specific section from a markdown file.

    Args:
        file_id: ID of the file (without the .md extension).
        section_id: ID of the section, as listed in the table of contents.

    Returns:
        "Section: <title>" followed by the section content; a "not found"
        message when the file has no such section; or an error message when
        the file cannot be loaded. This method does not raise.
    """
    try:
        structure = self.structure_cache.get(file_id)
        if structure is None:
            # get_content() parses and caches the structure as a side effect.
            # On failure it returns an error message instead of raising and
            # caches nothing, so pass that message on rather than failing
            # with an opaque KeyError on the cache lookup.
            content = await self.get_content(file_id)
            structure = self.structure_cache.get(file_id)
            if structure is None:
                return content

        section = structure.get_section_by_id(section_id)
        if section is None:
            return f"Section '{section_id}' not found in {file_id}"

        return f"Section: {section.title}\n\n{section.content}"
    except Exception as e:
        logger.error(f"Error getting section {section_id} from {file_id}: {e}")
        return f"Error getting section: {str(e)}"
58+
59+
async def get_table_of_contents(self, file_id: str) -> str:
    """Get table of contents for a markdown file.

    Args:
        file_id: ID of the file (without the .md extension).

    Returns:
        A heading line followed by one "- <title> [<section_id>]" line per
        section, indented by header level; or an error message when the file
        cannot be loaded. This method does not raise.
    """
    try:
        structure = self.structure_cache.get(file_id)
        if structure is None:
            # get_content() parses and caches the structure as a side effect.
            # On failure it returns an error message instead of raising and
            # caches nothing, so pass that message on rather than failing
            # with an opaque KeyError on the cache lookup.
            content = await self.get_content(file_id)
            structure = self.structure_cache.get(file_id)
            if structure is None:
                return content

        result = [f"Table of Contents for {file_id}:"]
        for level, title, section_id in structure.get_table_of_contents():
            indent = " " * level
            result.append(f"{indent}- {title} [{section_id}]")

        return "\n".join(result)
    except Exception as e:
        logger.error(f"Error getting table of contents for {file_id}: {e}")
        return f"Error getting table of contents: {str(e)}"
3277

3378
async def get_metadata(self, file_id: str) -> dict:
3479
"""Get metadata as a dictionary."""
@@ -335,6 +380,38 @@ async def list_tools() -> list[types.Tool]:
335380
"type": "object",
336381
"properties": {}
337382
}
383+
),
384+
types.Tool(
385+
name="get_section",
386+
description="Get a specific section from a markdown file",
387+
inputSchema={
388+
"type": "object",
389+
"properties": {
390+
"file_id": {
391+
"type": "string",
392+
"description": "ID of the file (without .md extension)"
393+
},
394+
"section_id": {
395+
"type": "string",
396+
"description": "ID of the section to retrieve"
397+
}
398+
},
399+
"required": ["file_id", "section_id"]
400+
}
401+
),
402+
types.Tool(
403+
name="get_table_of_contents",
404+
description="Get table of contents for a markdown file",
405+
inputSchema={
406+
"type": "object",
407+
"properties": {
408+
"file_id": {
409+
"type": "string",
410+
"description": "ID of the file (without .md extension)"
411+
}
412+
},
413+
"required": ["file_id"]
414+
}
338415
)
339416
]
340417

@@ -373,6 +450,19 @@ async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
373450
elif name == "get_stats":
374451
result = await self.store.get_stats()
375452
return [types.TextContent(type="text", text=result)]
453+
elif name == "get_section":
454+
file_id = arguments.get("file_id")
455+
section_id = arguments.get("section_id")
456+
if not file_id or not section_id:
457+
raise ValueError("file_id and section_id are required")
458+
result = await self.store.get_section(file_id, section_id)
459+
return [types.TextContent(type="text", text=result)]
460+
elif name == "get_table_of_contents":
461+
file_id = arguments.get("file_id")
462+
if not file_id:
463+
raise ValueError("file_id is required")
464+
result = await self.store.get_table_of_contents(file_id)
465+
return [types.TextContent(type="text", text=result)]
376466
else:
377467
raise ValueError(f"Unknown tool: {name}")
378468

storage/markdown/ai_pydantic_dev_.json

Lines changed: 4 additions & 0 deletions
Large diffs are not rendered by default.

storage/markdown/docs_ag2_ai_docs_home.json

Lines changed: 9 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)