@@ -66,6 +66,116 @@ def base64_yjs_to_text(base64_string):
6666 soup = BeautifulSoup (blocknote_structure , "lxml-xml" )
6767 return soup .get_text (separator = " " , strip = True )
6868
def base64_yjs_to_markdown(base64_string: str) -> str:
    """Render the document encoded in *base64_string* as Markdown.

    The input is converted to XML by ``base64_yjs_to_xml``, parsed with
    BeautifulSoup's ``lxml-xml`` parser, and walked depth-first. Wrapper
    tags (the synthetic ``[document]`` root, ``blockGroup`` and
    ``blockContainer``) are only descended into; recognised content tags
    are turned into Markdown lines; unknown tags are ignored.

    Args:
        base64_string: Encoded document, passed unchanged to
            ``base64_yjs_to_xml``.

    Returns:
        The Markdown text with consecutive blank lines collapsed, no
        leading blank line, and exactly one trailing newline.
    """
    xml_content = base64_yjs_to_xml(base64_string)
    soup = BeautifulSoup(xml_content, "lxml-xml")

    md_lines: list[str] = []

    def heading_level(node) -> int:
        # A missing or malformed "level" attribute falls back to 1, and the
        # value is clamped because Markdown only defines headings 1-6.
        try:
            level = int(node.get("level", 1))
        except (TypeError, ValueError):
            return 1
        return min(max(level, 1), 6)

    def walk(node) -> None:
        if not getattr(node, "name", None):
            return

        # Treat the synthetic "[document]" tag exactly like a wrapper.
        if node.name in {"[document]", "blockGroup", "blockContainer"}:
            for child in node.find_all(recursive=False):
                walk(child)
            if node.name == "blockContainer":
                md_lines.append("")  # paragraph break
            return

        # ----------- content nodes -------------
        if node.name == "heading":
            prefix = "#" * heading_level(node)
            md_lines.extend([f"{prefix} {process_inline_formatting(node)}", ""])

        elif node.name == "paragraph":
            md_lines.extend([process_inline_formatting(node), ""])

        elif node.name == "bulletListItem":
            md_lines.append("- " + process_inline_formatting(node))

        elif node.name == "numberedListItem":
            idx = node.get("index", "1")
            md_lines.append(f"{idx}. " + process_inline_formatting(node))

        elif node.name == "checkListItem":
            checked = "x" if node.get("checked") == "true" else " "
            md_lines.append(f"- [{checked}] " + process_inline_formatting(node))

        elif node.name == "codeBlock":
            lang = node.get("language", "")
            # The code is kept as ONE list entry so that blank lines inside
            # it survive the blank-line collapsing pass below.
            code = node.get_text("", strip=False)
            md_lines.extend([f"```{lang}", code, "```", ""])

        elif node.name in {"quote", "blockquote"}:
            quote = process_inline_formatting(node)
            # An empty quote still emits a single "> " line.
            for line in quote.splitlines() or [""]:
                md_lines.append("> " + line)
            md_lines.append("")

        elif node.name == "divider":
            md_lines.extend(["---", ""])

        elif node.name == "callout":
            emoji = node.get("emoji", "💡")
            md_lines.extend([f"> {emoji} {process_inline_formatting(node)}", ""])

        elif node.name == "img":
            src = node.get("src", "")
            alt = node.get("alt", "")
            md_lines.extend([f"![{alt}]({src})", ""])

        # unknown tags are ignored

    # kick-off: start at the synthetic root
    walk(soup)

    # Collapse accidental multiple blank lines (and drop leading ones).
    cleaned: list[str] = []
    for line in md_lines:
        if line == "" and (not cleaned or cleaned[-1] == ""):
            continue
        cleaned.append(line)

    return "\n".join(cleaned).rstrip() + "\n"
142+
# Markdown delimiter wrapped around the text of each inline style tag.
# NOTE: Markdown has no underline syntax; "__" is kept for compatibility
# with existing output, but CommonMark renders it as strong emphasis.
_INLINE_MARKERS = {
    "bold": "**",
    "italic": "*",
    "underline": "__",
    "strike": "~~",
    "code": "`",
}


def process_inline_formatting(element):
    """Convert an element's inline content to Markdown text.

    Walks ``element.contents`` recursively. String children are copied
    verbatim; ``bold``, ``italic``, ``underline``, ``strike`` and ``code``
    children are wrapped in their Markdown delimiters; ``link`` children
    become ``[text](href)``; any other named child contributes only its
    own processed contents.

    Args:
        element: Either a plain string (returned unchanged) or an object
            exposing ``contents``, whose non-string children expose
            ``name`` and ``get`` (e.g. a BeautifulSoup tag).

    Returns:
        The Markdown string for the element's inline content.
    """
    # If it's just a text node, return the text.
    if isinstance(element, str):
        return element

    parts = []
    for child in element.contents:
        if isinstance(child, str):
            parts.append(child)
            continue
        if not hasattr(child, "name"):
            continue

        inner = process_inline_formatting(child)
        if child.name == "link":
            href = child.get("href", "")
            parts.append(f"[{inner}]({href})")
            continue

        marker = _INLINE_MARKERS.get(child.name)
        if marker is None:
            # For other elements, just use their processed contents.
            parts.append(inner)
        elif inner:
            # Skip empty styled spans: bare delimiters such as "****" would
            # be read as a thematic break rather than as empty emphasis.
            parts.append(f"{marker}{inner}{marker}")

    return "".join(parts)
178+
69179
70180def extract_attachments (content ):
71181 """Helper method to extract media paths from a document's content."""
0 commit comments