Skip to content

Commit be96173

Browse files
committed
feat: improve parsing performance from O(n^2) to O(n)
The function process_tag was previously concatenating strings inside a loop. Each + operation creates a new string, which repeatedly copies the data accumulated so far. By appending the pieces to a list instead, and joining that list once at the end, we take this from quadratic time to linear time.
1 parent 9c299ed commit be96173

File tree

1 file changed

+73
-21
lines changed

1 file changed

+73
-21
lines changed

markdownify/__init__.py

Lines changed: 73 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def convert_soup(self, soup):
133133
return self.process_tag(soup, convert_as_inline=False)
134134

135135
def process_tag(self, node, convert_as_inline):
136-
text = ''
136+
text_parts = []
137137

138138
# markdown headings or cells can't include
139139
# block elements (elements w/newlines)
@@ -147,43 +147,95 @@ def process_tag(self, node, convert_as_inline):
147147
# Remove whitespace-only textnodes just before, after or
148148
# inside block-level elements.
149149
should_remove_inside = should_remove_whitespace_inside(node)
150-
for el in node.children:
151-
# Only extract (remove) whitespace-only text node if any of the
152-
# conditions is true:
153-
# - el is the first element in its parent (block-level)
154-
# - el is the last element in its parent (block-level)
155-
# - el is adjacent to a block-level node
156-
can_extract = (should_remove_inside and (not el.previous_sibling
157-
or not el.next_sibling)
158-
or should_remove_whitespace_outside(el.previous_sibling)
159-
or should_remove_whitespace_outside(el.next_sibling))
160-
if (isinstance(el, NavigableString)
161-
and six.text_type(el).strip() == ''
162-
and can_extract):
150+
children = list(node.children)
151+
for i, el in enumerate(children):
152+
# Quick type check first to avoid unnecessary function calls
153+
if not isinstance(el, NavigableString):
154+
continue
155+
156+
# Check if the text is entirely whitespace first
157+
text = six.text_type(el)
158+
if text.strip():
159+
continue
160+
161+
# If first or last element
162+
is_at_extreme_position = (
163+
should_remove_inside and (i == 0 or i == len(children) - 1)
164+
)
165+
# True if there is a preceding sibling, and it should have whitespace removed.
166+
has_removal_candidate_to_left = (
167+
i > 0 and should_remove_whitespace_outside(children[i - 1])
168+
)
169+
# True if there is a following sibling, and it should have whitespace removed.
170+
has_removal_candidate_to_right = (
171+
i < len(children) - 1 and should_remove_whitespace_outside(children[i + 1])
172+
)
173+
# Determine if we can extract based on position and adjacency
174+
can_extract = (
175+
is_at_extreme_position
176+
or has_removal_candidate_to_left
177+
or has_removal_candidate_to_right
178+
)
179+
180+
# Extract if conditions are met
181+
if can_extract:
163182
el.extract()
164183

165184
# Convert the children first
166185
for el in node.children:
167186
if isinstance(el, Comment) or isinstance(el, Doctype):
168187
continue
169188
elif isinstance(el, NavigableString):
170-
text += self.process_text(el)
189+
text_parts.append(self.process_text(el))
171190
else:
172-
text_strip = text.rstrip('\n')
173-
newlines_left = len(text) - len(text_strip)
191+
# 1) Pop trailing newlines from whatever's in text_parts so far
192+
text_strip, newlines_left = self.pop_trailing_newlines(text_parts)
193+
# 2) Convert the next tag
174194
next_text = self.process_tag(el, convert_children_as_inline)
195+
# 3) Figure out how many leading newlines in next_text
175196
next_text_strip = next_text.lstrip('\n')
176197
newlines_right = len(next_text) - len(next_text_strip)
198+
# 4) Calculate how many newlines to insert between text_strip and next_text_strip
177199
newlines = '\n' * max(newlines_left, newlines_right)
178-
text = text_strip + newlines + next_text_strip
200+
text_parts.extend([text_strip, newlines, next_text_strip])
179201

180202
# apply this tag's final conversion function
181203
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
182204
convert_fn = getattr(self, convert_fn_name, None)
183205
if convert_fn and self.should_convert_tag(node.name):
184-
text = convert_fn(node, text, convert_as_inline)
185-
186-
return text
206+
# Join the text parts before passing to convert_fn
207+
text_parts_str = ''.join(text_parts)
208+
text_parts = [convert_fn(node, text_parts_str, convert_as_inline)]
209+
210+
return ''.join(text_parts)
211+
212+
def pop_trailing_newlines(self, parts):
    """
    Remove the trailing text from `parts` and split off its newlines.

    Pops every trailing chunk that is empty or made up only of newline
    characters, then pops at most one more chunk (the last one holding
    real text) and strips the newlines from its end. `parts` is mutated
    in place: the popped chunks are gone afterwards, so the caller must
    append the returned text back if it wants to keep it.

    Returns: (text_without_trailing_newlines, count_of_newlines_removed)
    If `parts` is empty, returns ("", 0). If `parts` holds only
    empty/newline chunks, returns ("", total number of newlines popped).
    """
    newlines_total = 0

    # 1) Pop chunks that contain nothing but newlines (or are empty).
    #    Only their combined length is needed, so count as we go instead
    #    of collecting and joining them.
    while parts and parts[-1].rstrip('\n') == '':
        newlines_total += len(parts.pop())

    # 2) The next chunk, if any, ends in real text; it may still carry
    #    trailing newlines that belong to the same run.
    text_without_newline = ''
    if parts:
        last_chunk = parts.pop()
        text_without_newline = last_chunk.rstrip('\n')
        newlines_total += len(last_chunk) - len(text_without_newline)

    return (text_without_newline, newlines_total)
187239

188240
def convert__document_(self, el, text, convert_as_inline):
189241
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""

0 commit comments

Comments
 (0)