|
| 1 | +from lxml import html, etree |
| 2 | +from itertools import zip_longest |
| 3 | + |
def merge_elements(
    a: "etree._Element | None",
    b: "etree._Element | None",
) -> "Iterator[etree._Element]":
    """Lazily merge two element trees and yield the resulting element(s).

    What is yielded depends on the inputs:

    * both ``None``: nothing is yielded;
    * exactly one is ``None``: the other input is yielded as-is;
    * tags differ, or the tag is not a string (comments and processing
      instructions expose a callable ``tag`` that cannot be used to build
      a regular element): ``a`` then ``b`` are yielded as-is;
    * otherwise: a single new element with the shared tag, where
      attributes of ``a`` are applied first and then overridden by ``b``,
      ``text`` and ``tail`` are the left value followed by the right value,
      and children are paired by position and merged recursively.

    The input trees are not modified: children that are carried over
    without being merged are deep-copied before being attached to the
    new parent.

    Args:
        a: Left element, or ``None`` when there is no left counterpart.
        b: Right element, or ``None`` when there is no right counterpart.

    Yields:
        Zero, one or two elements as described above.
    """
    # Function-scope import keeps this block self-contained.
    from copy import deepcopy

    if a is None and b is None:
        return  # nothing on either side
    if a is None:
        yield b  # right side only
        return
    if b is None:
        yield a  # left side only
        return

    # Only same-tag, string-tagged nodes are merged. Anything else is
    # reported as a difference by emitting both nodes untouched.
    if a.tag != b.tag or not isinstance(a.tag, str):
        yield a
        yield b
        return

    # 1) Fresh node carrying the shared tag.
    merged = etree.Element(a.tag)

    # 2) Attributes: left first, then right (right wins on conflicts).
    merged.attrib.update(a.attrib)
    merged.attrib.update(b.attrib)

    # 3) Text inside the element, and tail text that follows it.
    merged.text = (a.text or '') + (b.text or '')
    # Keep a missing tail missing instead of turning it into ''.
    merged.tail = ((a.tail or '') + (b.tail or '')) or None

    # 4) Pair children by position and merge each pair recursively.
    for left_child, right_child in zip_longest(a, b):
        for sub in merge_elements(left_child, right_child):
            if sub is left_child or sub is right_child:
                # lxml's append() moves an element out of its current
                # parent; attach a copy so the inputs stay intact.
                sub = deepcopy(sub)
            merged.append(sub)

    # 5) Hand the merged node to the caller.
    yield merged
| 41 | + |
| 42 | + |
# --------- Usage example ---------
# Two small lists that overlap by position; built line by line so the
# markup of each row is easy to compare.
html1 = (
    "<ul>\n"
    ' <li id="a">apple</li>\n'
    ' <li id="b">banana</li>\n'
    "</ul>"
)

html2 = (
    "<ul>\n"
    ' <li id="b">BANANA</li>\n'
    ' <li id="c">cherry</li>\n'
    "</ul>"
)

dom1 = html.fromstring(html1)
dom2 = html.fromstring(html2)

# Both roots are <ul>, so the generator produces exactly one element;
# tuple-unpacking it also asserts that there is only one.
(merged_ul,) = merge_elements(dom1, dom2)

print(html.tostring(merged_ul, encoding="unicode", pretty_print=True))
0 commit comments