@@ -133,7 +133,7 @@ def convert_soup(self, soup):
133133 return self .process_tag (soup , convert_as_inline = False )
134134
135135 def process_tag (self , node , convert_as_inline ):
136- text = ''
136+ text_parts = []
137137
138138 # markdown headings or cells can't include
139139 # block elements (elements w/newlines)
@@ -147,43 +147,95 @@ def process_tag(self, node, convert_as_inline):
147147 # Remove whitespace-only textnodes just before, after or
148148 # inside block-level elements.
149149 should_remove_inside = should_remove_whitespace_inside (node )
150- for el in node .children :
151- # Only extract (remove) whitespace-only text node if any of the
152- # conditions is true:
153- # - el is the first element in its parent (block-level)
154- # - el is the last element in its parent (block-level)
155- # - el is adjacent to a block-level node
156- can_extract = (should_remove_inside and (not el .previous_sibling
157- or not el .next_sibling )
158- or should_remove_whitespace_outside (el .previous_sibling )
159- or should_remove_whitespace_outside (el .next_sibling ))
160- if (isinstance (el , NavigableString )
161- and six .text_type (el ).strip () == ''
162- and can_extract ):
150+ children = list (node .children )
151+ for i , el in enumerate (children ):
152+ # Quick type check first to avoid unnecessary function calls
153+ if not isinstance (el , NavigableString ):
154+ continue
155+
156+ # Check if the text is entirely whitespace first
157+ text = six .text_type (el )
158+ if text .strip ():
159+ continue
160+
161+ # If first or last element
162+ is_at_extreme_position = (
163+ should_remove_inside and (i == 0 or i == len (children ) - 1 )
164+ )
165+ # True if there is a preceding sibling, and it should have whitespace removed.
166+ has_removal_candidate_to_left = (
167+ i > 0 and should_remove_whitespace_outside (children [i - 1 ])
168+ )
169+ # True if there is a following sibling, and it should have whitespace removed.
170+ has_removal_candidate_to_right = (
171+ i < len (children ) - 1 and should_remove_whitespace_outside (children [i + 1 ])
172+ )
173+ # Determine if we can extract based on position and adjacency
174+ can_extract = (
175+ is_at_extreme_position
176+ or has_removal_candidate_to_left
177+ or has_removal_candidate_to_right
178+ )
179+
180+ # Extract if conditions are met
181+ if can_extract :
163182 el .extract ()
164183
165184 # Convert the children first
166185 for el in node .children :
167186 if isinstance (el , Comment ) or isinstance (el , Doctype ):
168187 continue
169188 elif isinstance (el , NavigableString ):
170- text += self .process_text (el )
189+ text_parts . append ( self .process_text (el ) )
171190 else :
172- text_strip = text .rstrip ('\n ' )
173- newlines_left = len (text ) - len (text_strip )
191+ # 1) Pop trailing newlines from whatever's in text_parts so far
192+ text_strip , newlines_left = self .pop_trailing_newlines (text_parts )
193+ # 2) Convert the next tag
174194 next_text = self .process_tag (el , convert_children_as_inline )
195+ # 3) Figure out how many leading newlines in next_text
175196 next_text_strip = next_text .lstrip ('\n ' )
176197 newlines_right = len (next_text ) - len (next_text_strip )
198+ # 4) Calculate how many newlines to insert between text_strip and next_text_strip
177199 newlines = '\n ' * max (newlines_left , newlines_right )
178- text = text_strip + newlines + next_text_strip
200+ text_parts . extend ([ text_strip , newlines , next_text_strip ])
179201
180202 # apply this tag's final conversion function
181203 convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
182204 convert_fn = getattr (self , convert_fn_name , None )
183205 if convert_fn and self .should_convert_tag (node .name ):
184- text = convert_fn (node , text , convert_as_inline )
185-
186- return text
206+ # Join the text parts before passing to convert_fn
207+ text_parts_str = '' .join (text_parts )
208+ text_parts = [convert_fn (node , text_parts_str , convert_as_inline )]
209+
210+ return '' .join (text_parts )
211+
def pop_trailing_newlines(self, parts):
    """Strip trailing newlines off the end of a list of text chunks.

    Chunks are removed from the end of `parts` (the list is mutated in
    place): first every trailing chunk that holds nothing but newlines
    (empty chunks included), then one further chunk, whose own trailing
    newlines are stripped.

    Returns a tuple `(text, newline_count)` where `text` is that last
    popped chunk without its trailing newlines and `newline_count` is the
    total number of newline characters removed. When `parts` holds no
    chunk with real text, the result is `("", newline_count)`; for an
    empty list that is `("", 0)`.
    """
    removed = 0

    # Phase 1: drop chunks that consist solely of newlines, counting
    # their characters as we go.
    while parts:
        if parts[-1].rstrip('\n'):
            # Found a chunk carrying real text; stop discarding.
            break
        removed += len(parts.pop())

    # Nothing but newline chunks (or nothing at all) was present.
    if not parts:
        return ('', removed)

    # Phase 2: take the chunk with real text and peel off its newline tail.
    tail = parts.pop()
    body = tail.rstrip('\n')
    removed += len(tail) - len(body)

    return (body, removed)
187239
188240 def convert__document_ (self , el , text , convert_as_inline ):
189241 """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
0 commit comments