Skip to content

Commit 5b79b92

Browse files
committed
use list-based processing (inspired by AlextheYounga)
Signed-off-by: Chris Papademetrious <chrispy@synopsys.com>
Signed-off-by: chrispy <chrispy@synopsys.com>
1 parent 3026602 commit 5b79b92

File tree

1 file changed

+41
-13
lines changed

1 file changed

+41
-13
lines changed

markdownify/__init__.py

Lines changed: 41 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
whitespace_re = re.compile(r'[\t ]+')
1010
all_whitespace_re = re.compile(r'[\t \r\n]+')
1111
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
12+
extract_newlines_re = re.compile(r'^(\n*)(.*?)(\n*)$', flags=re.DOTALL)
1213
html_heading_re = re.compile(r'h[1-6]')
1314

1415

@@ -168,6 +169,12 @@ def convert(self, html):
168169
def convert_soup(self, soup):
169170
return self.process_tag(soup, convert_as_inline=False)
170171

172+
def process_element(self, node, convert_as_inline):
173+
if isinstance(node, NavigableString):
174+
return self.process_text(node)
175+
else:
176+
return self.process_tag(node, convert_as_inline)
177+
171178
def process_tag(self, node, convert_as_inline):
172179
text = ''
173180

@@ -203,23 +210,44 @@ def _can_ignore(el):
203210
return True
204211
else:
205212
return False
213+
elif el is None:
214+
return True
206215
else:
207216
raise ValueError('Unexpected element type: %s' % type(el))
208217

209-
children_to_convert = [child for child in node.children if not _can_ignore(child)]
218+
children_to_convert = [el for el in node.children if not _can_ignore(el)]
210219

211-
# Convert the children first
212-
for el in children_to_convert:
213-
if isinstance(el, NavigableString):
214-
text += self.process_text(el)
215-
else:
216-
text_strip = text.rstrip('\n')
217-
newlines_left = len(text) - len(text_strip)
218-
next_text = self.process_tag(el, convert_children_as_inline)
219-
next_text_strip = next_text.lstrip('\n')
220-
newlines_right = len(next_text) - len(next_text_strip)
221-
newlines = '\n' * max(newlines_left, newlines_right)
222-
text = text_strip + newlines + next_text_strip
220+
# Convert the children elements into a list of result strings.
221+
child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert]
222+
223+
# Remove empty string values.
224+
child_strings = [s for s in child_strings if s]
225+
226+
# Collapse newlines at child element boundaries, if needed.
227+
if node.name == 'pre' or node.find_parent('pre'):
228+
# Inside <pre> blocks, do not collapse newlines.
229+
pass
230+
else:
231+
# Collapse newlines at child element boundaries.
232+
updated_child_strings = [''] # so the first lookback works
233+
for child_string in child_strings:
234+
# Separate the leading/trailing newlines from the content.
235+
leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups()
236+
237+
# If the last child had trailing newlines and this child has leading newlines,
238+
# use the larger newline count, limited to 2.
239+
if updated_child_strings[-1] and leading_nl:
240+
prev_trailing_nl = updated_child_strings.pop() # will be replaced by the collapsed value
241+
num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
242+
leading_nl = '\n' * num_newlines
243+
244+
# Add the results to the updated child string list.
245+
updated_child_strings.extend([leading_nl, content, trailing_nl])
246+
247+
child_strings = updated_child_strings
248+
249+
# Join all child text strings into a single string.
250+
text = ''.join(child_strings)
223251

224252
# apply this tag's final conversion function
225253
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)

0 commit comments

Comments
 (0)