Skip to content

Commit 06c26a7

Browse files
committed
💩(backend) add document content endpoint
Get the content of a document in markdown format. Ex: http://localhost:8071/api/v1.0/documents/<ID>/content/
1 parent 8cf06d6 commit 06c26a7

File tree

3 files changed

+131
-1
lines changed

3 files changed

+131
-1
lines changed

src/backend/core/api/viewsets.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
from rest_framework.permissions import AllowAny
3434
from rest_framework.throttling import UserRateThrottle
3535

36-
from core import authentication, enums, models
36+
from core import authentication, enums, models, utils as core_utils
3737
from core.services.ai_services import AIService
3838
from core.services.collaboration_services import CollaborationService
3939
from core.utils import extract_attachments, filter_descendants
@@ -1353,6 +1353,25 @@ def media_check(self, request, *args, **kwargs):
13531353
}
13541354

13551355
return drf.response.Response(body, status=drf.status.HTTP_200_OK)
1356+
1357+
@drf.decorators.action(detail=True, methods=["get"], url_path="content")
def content(self, request, *args, **kwargs):
    """
    Return the content of a document converted to markdown.

    The document is looked up with `get_object()`, its `content` attribute is
    passed to `core_utils.base64_yjs_to_markdown`, and the result is returned
    under the "content" key of the response body with a 200 status.
    """
    document = self.get_object()
    markdown = core_utils.base64_yjs_to_markdown(document.content)

    return drf.response.Response(
        {"content": markdown},
        status=drf.status.HTTP_200_OK,
    )
13561375

13571376
@drf.decorators.action(
13581377
detail=True,

src/backend/core/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,7 @@ def get_abilities(self, user, ancestors_links=None):
839839
"children_list": can_get,
840840
"children_create": can_update and user.is_authenticated,
841841
"collaboration_auth": can_get,
842+
"content": can_get,
842843
"cors_proxy": can_get,
843844
"descendants": can_get,
844845
"destroy": is_owner,

src/backend/core/utils.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,116 @@ def base64_yjs_to_text(base64_string):
6666
soup = BeautifulSoup(blocknote_structure, "lxml-xml")
6767
return soup.get_text(separator=" ", strip=True)
6868

69+
def base64_yjs_to_markdown(base64_string: str) -> str:
    """
    Convert a base64-encoded Yjs document into a markdown string.

    The input is turned into XML by `base64_yjs_to_xml`, parsed with
    BeautifulSoup ("lxml-xml") and walked depth-first. Wrapper tags
    ("[document]", "blockGroup", "blockContainer") are only traversed; the
    content tags handled in `walk` are rendered to markdown, with inline
    formatting delegated to `process_inline_formatting`. Any other tag is
    ignored, together with everything nested inside it.

    Args:
        base64_string: the document content, passed as-is to
            `base64_yjs_to_xml`.

    Returns:
        The markdown text. Consecutive blank entries are collapsed into one,
        leading blank entries are dropped, and the result always ends with
        exactly one newline.
    """
    xml_content = base64_yjs_to_xml(base64_string)
    soup = BeautifulSoup(xml_content, "lxml-xml")

    md_lines: list[str] = []

    def heading_level(node) -> int:
        """Return the heading level of `node`, clamped to markdown's 1-6 range."""
        try:
            level = int(node.get("level", 1))
        except (TypeError, ValueError):
            # A malformed attribute must not abort the whole conversion.
            return 1
        # ATX headings only exist for one to six "#" characters.
        return min(max(level, 1), 6)

    def walk(node) -> None:
        """Append the markdown for `node` (and wrapped children) to `md_lines`."""
        if not getattr(node, "name", None):
            return

        # Treat the synthetic "[document]" tag exactly like a wrapper
        if node.name in {"[document]", "blockGroup", "blockContainer"}:
            for child in node.find_all(recursive=False):
                walk(child)
            if node.name == "blockContainer":
                md_lines.append("")  # paragraph break
            return

        # ----------- content nodes -------------
        if node.name == "heading":
            hashes = "#" * heading_level(node)
            md_lines.extend([f"{hashes} {process_inline_formatting(node)}", ""])

        elif node.name == "paragraph":
            md_lines.extend([process_inline_formatting(node), ""])

        elif node.name == "bulletListItem":
            md_lines.append("- " + process_inline_formatting(node))

        elif node.name == "numberedListItem":
            idx = node.get("index", "1")
            md_lines.append(f"{idx}. " + process_inline_formatting(node))

        elif node.name == "checkListItem":
            checked = "x" if node.get("checked") == "true" else " "
            md_lines.append(f"- [{checked}] " + process_inline_formatting(node))

        elif node.name == "codeBlock":
            lang = node.get("language", "")
            # Keep the raw text untouched: whitespace is significant in code.
            code = node.get_text("", strip=False)
            md_lines.extend([f"```{lang}", code, "```", ""])

        elif node.name in {"quote", "blockquote"}:
            quote = process_inline_formatting(node)
            # `or [""]` keeps an empty quote visible as a lone "> " line.
            for line in quote.splitlines() or [""]:
                md_lines.append("> " + line)
            md_lines.append("")

        elif node.name == "divider":
            md_lines.extend(["---", ""])

        elif node.name == "callout":
            emoji = node.get("emoji", "💡")
            callout = f"{emoji} {process_inline_formatting(node)}"
            # Prefix every line so a multi-line callout stays inside the quote.
            md_lines.extend("> " + line for line in callout.splitlines())
            md_lines.append("")

        elif node.name == "img":
            # NOTE(review): only an "img" tag with "src"/"alt" is handled here;
            # confirm this matches the tag the generated XML uses for images.
            src = node.get("src", "")
            alt = node.get("alt", "")
            md_lines.extend([f"![{alt}]({src})", ""])

        # unknown tags are ignored

    # kick-off: start at the synthetic root
    walk(soup)

    # collapse accidental multiple blank lines
    cleaned: list[str] = []
    for line in md_lines:
        if line == "" and (not cleaned or cleaned[-1] == ""):
            continue
        cleaned.append(line)

    return "\n".join(cleaned).rstrip() + "\n"
142+
143+
# Inline tags rendered by wrapping their content in the same marker on both sides.
_INLINE_MARKERS = {
    "bold": "**",
    "italic": "*",
    "underline": "__",
    "strike": "~~",
}


def _wrap_inline_code(text):
    """Wrap `text` in a backtick delimiter that cannot collide with its content."""
    longest_run = current_run = 0
    for char in text:
        current_run = current_run + 1 if char == "`" else 0
        longest_run = max(longest_run, current_run)

    # One more backtick than the longest run inside the text keeps the span
    # intact; without any backtick this is the plain single-backtick form.
    delimiter = "`" * (longest_run + 1)
    if text.startswith("`") or text.endswith("`"):
        # Padding prevents the content's own backticks from merging with the
        # delimiter.
        text = f" {text} "
    return f"{delimiter}{text}{delimiter}"


def process_inline_formatting(element):
    """
    Convert the inline content of an element to markdown.

    A plain string is returned unchanged. Otherwise every entry of
    `element.contents` is rendered in order: strings are copied as-is,
    "bold", "italic", "underline" and "strike" children are wrapped in their
    markdown markers, "code" children become inline code, "link" children
    become `[text](href)` using their "href" attribute, and any other child
    exposing a `name` contributes only its own processed content. Entries
    that are neither strings nor named elements are skipped.

    Returns:
        The concatenated markdown string.
    """
    # If it's just a text node, return the text
    if isinstance(element, str):
        return element

    parts = []
    for child in element.contents:
        if isinstance(child, str):
            parts.append(child)
            continue
        if not hasattr(child, "name"):
            continue

        inner = process_inline_formatting(child)
        marker = _INLINE_MARKERS.get(child.name)
        if marker is not None:
            parts.append(f"{marker}{inner}{marker}")
        elif child.name == "code":
            parts.append(_wrap_inline_code(inner))
        elif child.name == "link":
            href = child.get("href", "")
            parts.append(f"[{inner}]({href})")
        else:
            # For other elements, just keep their processed contents
            parts.append(inner)

    return "".join(parts)
178+
69179

70180
def extract_attachments(content):
71181
"""Helper method to extract media paths from a document's content."""

0 commit comments

Comments
 (0)