@@ -168,6 +168,12 @@ def convert(self, html):
def convert_soup(self, soup):
    """Convert an already-parsed document tree and return the result.

    The tree is handed to ``process_tag`` as a block-level (non-inline)
    element; whatever ``process_tag`` returns is returned unchanged.

    ``soup`` is presumably a BeautifulSoup object -- confirm with callers.
    """
    converted = self.process_tag(soup, convert_as_inline=False)
    return converted
170170
def process_element(self, node, convert_as_inline):
    """Convert a single child node by dispatching on its type.

    ``NavigableString`` nodes are passed to ``process_text``; every other
    node is passed to ``process_tag`` together with ``convert_as_inline``.
    The result of the chosen handler is returned unchanged.
    """
    # Text nodes take the text path and ignore the inline flag.
    if isinstance(node, NavigableString):
        return self.process_text(node)

    # Anything else is treated as a tag.
    return self.process_tag(node, convert_as_inline)
176+
171177 def process_tag (self , node , convert_as_inline ):
172178 text = ''
173179
@@ -203,23 +209,44 @@ def _can_ignore(el):
203209 return True
204210 else :
205211 return False
212+ elif el is None :
213+ return True
206214 else :
207215 raise ValueError ('Unexpected element type: %s' % type (el ))
208216
209- children_to_convert = [child for child in node .children if not _can_ignore (child )]
217+ children_to_convert = [el for el in node .children if not _can_ignore (el )]
210218
211- # Convert the children first
212- for el in children_to_convert :
213- if isinstance (el , NavigableString ):
214- text += self .process_text (el )
215- else :
216- text_strip = text .rstrip ('\n ' )
217- newlines_left = len (text ) - len (text_strip )
218- next_text = self .process_tag (el , convert_children_as_inline )
219- next_text_strip = next_text .lstrip ('\n ' )
220- newlines_right = len (next_text ) - len (next_text_strip )
221- newlines = '\n ' * max (newlines_left , newlines_right )
222- text = text_strip + newlines + next_text_strip
219+ # Convert the children elements into a list of result strings.
220+ child_strings = [self .process_element (el , convert_children_as_inline ) for el in children_to_convert ]
221+
222+ # Remove empty string values.
223+ child_strings = [s for s in child_strings if s ]
224+
225+ # Collapse newlines at child element boundaries, if needed.
226+ if node .name == 'pre' or node .find_parent ('pre' ):
227+ # Inside <pre> blocks, do not collapse newlines.
228+ pass
229+ else :
230+ # Collapse newlines at child element boundaries.
231+ updated_child_strings = ['' ] # so the first lookback works
232+ for child_string in child_strings :
233+ # Separate the leading/trailing newlines from the content.
234+ leading_nl , content , trailing_nl = re .match (r'^(\n*)(.*?)(\n*)$' , child_string , flags = re .DOTALL ).groups ()
235+
236+ # If the last child had trailing newlines and this child has leading newlines,
237+ # use the larger newline count, limited to 2.
238+ if updated_child_strings [- 1 ] and leading_nl :
239+ prev_trailing_nl = updated_child_strings .pop () # will be replaced by the collapsed value
240+ num_newlines = min (2 , max (len (prev_trailing_nl ), len (leading_nl )))
241+ leading_nl = '\n ' * num_newlines
242+
243+ # Add the results to the updated child string list.
244+ updated_child_strings .extend ([leading_nl , content , trailing_nl ])
245+
246+ child_strings = updated_child_strings
247+
248+ # Join all child text strings into a single string.
249+ text = '' .join (child_strings )
223250
224251 # apply this tag's final conversion function
225252 convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
0 commit comments