extraction: review link and structure checks #653

Merged: 10 commits, Jul 26, 2024

Changes from all commits
4 changes: 1 addition & 3 deletions tests/comparison_small.py
@@ -34,12 +34,10 @@
 #from trafilatura.utils import load_html, sanitize
 #from trafilatura.xml import xmltotxt
 
-logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
+logging.basicConfig(stream=sys.stdout, level=logging.ERROR) # logging.WARNING
 
 TEST_DIR = os.path.abspath(os.path.dirname(__file__))
 
-## logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
-
 #JT_STOPLIST = jt_stoplist_init()
20 changes: 19 additions & 1 deletion tests/unit_tests.py
@@ -516,6 +516,11 @@ def test_links():
     mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
     assert 'license="CC BY-SA license"' in extract(mydoc, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG, with_metadata=True)
 
+    # link in p, length threshold
+    mydoc = html.fromstring(f'<html><body><article><p><a>f{"abcd"*20}</a></p></article></body></html>')
+    assert "abc" in extract(copy(mydoc), no_fallback=True, config=ZERO_CONFIG, favor_precision=False)
+    assert extract(mydoc, no_fallback=True, config=ZERO_CONFIG, favor_precision=True) == ""
+
 
 def test_tei():
     '''test TEI-related functions'''

@@ -752,7 +757,7 @@ def test_htmlprocessing():
     assert b'<p>A B tail C</p>' in etree.tostring(mydoc)
 
     # paywalls
-    my_html = '<html><body><main><p>1</p><p id="paywall">2</p><p>3</p></main></body></html>'
+    my_html = '<html><body><main><p>1</p><p id="premium">2</p><p>3</p></main></body></html>'
     assert extract(my_html, config=ZERO_CONFIG, no_fallback=True) == '1\n3'
     assert extract(my_html, config=ZERO_CONFIG, no_fallback=False) == '1\n3'
     # test tail of node deleted if set as text

@@ -1122,6 +1127,19 @@ def test_table_processing():
     result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
     assert "a | b | |" in result
 
+    # links: this gets through (for now)
+    htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == "a |"
+
+    # link: this is filtered out
+    htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == ""
+    htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{" "*100}</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == ""
+
 
 def test_list_processing():
     options = DEFAULT_OPTIONS
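
Taken together, these tests pin down the new thresholds: a paragraph consisting of one long link survives by default but is pruned under favor_precision, and link-only table cells are filtered once the link text grows long. A minimal sketch for trying this outside the suite — note the tests run with a zero-minimum-length config (ZERO_CONFIG), so with default settings extract() may return None for documents this small:

# Sketch only: the tests above use ZERO_CONFIG; default settings enforce
# minimum extraction sizes, so these tiny documents may yield None instead.
from trafilatura import extract

# paragraph consisting of a single long link: kept by default,
# pruned as link-heavy boilerplate under favor_precision
doc = f'<html><body><article><p><a>{"abcd" * 20}</a></p></article></body></html>'
print(extract(doc, no_fallback=True, favor_precision=False))
print(extract(doc, no_fallback=True, favor_precision=True))

# table cell made of one very long link: now filtered out
table = f'<html><body><article><table><tr><td><a href="link.html">{"abc" * 100}</a></td></tr></table></article></body></html>'
print(extract(table, no_fallback=True, output_format='txt', include_tables=True))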
4 changes: 1 addition & 3 deletions trafilatura/external.py
@@ -18,7 +18,7 @@
 from .settings import JUSTEXT_LANGUAGES
 from .utils import fromstring_bytes, trim
 from .xml import TEI_VALID_TAGS
-from .xpaths import OVERALL_DISCARD_XPATH, PAYWALL_DISCARD_XPATH
+from .xpaths import OVERALL_DISCARD_XPATH
 
 LOGGER = logging.getLogger(__name__)
 
@@ -48,7 +48,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
 
     use_readability, jt_result = False, False
     # prior cleaning
-    backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
     if options.focus == "precision":
         backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
 
@@ -92,7 +91,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     # override faulty extraction: try with justext
     if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
         LOGGER.debug('unclean document triggering justext examination: %s', options.source)
-        # tree = prune_unwanted_sections(tree, {}, options)
         body2, text2, len_text2 = justext_rescue(tree, options)
         jt_result = bool(text2)
         # prevent too short documents from replacing the main text
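
The separate paywall pass is dropped here because those patterns now sit in OVERALL_DISCARD_XPATH (see trafilatura/xpaths.py below), which the main extraction already applies. A minimal sketch of what this kind of XPath pruning does, using a simplified stand-in for prune_unwanted_nodes and one illustrative pattern:

# simplified stand-in for prune_unwanted_nodes, with one illustrative pattern
from lxml import html

tree = html.fromstring('<html><body><p>Free text.</p>'
                       '<div id="premium-box">Subscribe to continue</div></body></html>')
for node in tree.xpath('.//*[self::div or self::p][contains(@id, "premium")]'):
    node.getparent().remove(node)  # drop the flagged subtree entirely
print(html.tostring(tree))  # b'<html><body><p>Free text.</p></body></html>'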
117 changes: 53 additions & 64 deletions trafilatura/htmlprocessing.py
@@ -34,7 +34,6 @@
 
 HTML_TAG_MAPPING = {v: k for k, v in REND_TAG_MAPPING.items()}
 
-
 PRESERVE_IMG_CLEANING = {'figure', 'picture', 'source'}
 
 
@@ -110,91 +109,81 @@
     return tree
 
 
-def collect_link_info(links_xpath, favor_precision=False):
+def collect_link_info(links_xpath):
     '''Collect heuristics on link text'''
-    # init
-    mylist = []
-    # longer strings impact recall in favor of precision
-    threshold = 50 if favor_precision else 10
-    # examine the elements
-    for subelem in links_xpath:
-        subelemtext = trim(subelem.text_content())
-        if subelemtext:
-            mylist.append(subelemtext)
-    shortelems = sum(1 for text in mylist if len(text) < threshold)
-    lengths = sum(len(text) for text in mylist)
-    return lengths, len(mylist), shortelems, mylist
+    mylist = [e for e in (trim(elem.text_content()) for elem in links_xpath) if e]
+    lengths = list(map(len, mylist))
+    shortelems = sum(1 for l in lengths if l < 10)
+    return sum(lengths), len(mylist), shortelems, mylist
 
 
 def link_density_test(element, text, favor_precision=False):
     '''Remove sections which are rich in links (probably boilerplate)'''
-    links_xpath, mylist = element.findall('.//ref'), []
-    if links_xpath:
-        if element.tag == 'p': # and not element.getparent().tag == 'item'
-            if not favor_precision:
-                if element.getnext() is None:
-                    limitlen, threshold = 60, 0.8
-                else:
-                    limitlen, threshold = 30, 0.8
-            else:
-                limitlen, threshold = 200, 0.8
-            #if 'hi' in list(element):
-            #    limitlen, threshold = 100, 0.8
-        #elif element.tag == 'head':
-        #    limitlen, threshold = 50, 0.8
-        else:
-            if element.getnext() is None:
-                limitlen, threshold = 300, 0.8
-            #elif re.search(r'[.?!:]', elemtext):
-            #    limitlen, threshold = 150, 0.66
-            else:
-                limitlen, threshold = 100, 0.8
-        elemlen = len(text)
-        if elemlen < limitlen:
-            linklen, elemnum, shortelems, mylist = collect_link_info(links_xpath, favor_precision)
-            if elemnum == 0:
-                return True, mylist
-            LOGGER.debug('list link text/total: %s/%s – short elems/total: %s/%s', linklen, elemlen, shortelems, elemnum)
-            # (elemnum > 1 and shortelems/elemnum > 0.8):
-            if linklen > threshold*elemlen or (elemnum > 1 and shortelems/elemnum > 0.8):
-                return True, mylist
+    links_xpath = element.findall('.//ref')
+    if not links_xpath:
+        return False, []
+    mylist = []
+    # shortcut
+    if len(links_xpath) == 1:
+        len_threshold = 10 if favor_precision else 100
+        link_text = trim(links_xpath[0].text_content())
+        if len(link_text) > len_threshold and len(link_text) > len(text)*0.9:
+            return True, []
+    if element.tag == 'p':
+        limitlen = 60 if element.getnext() is None else 30
+    else:
+        if element.getnext() is None:
+            limitlen = 300
+        #elif re.search(r'[.?!:]', element.text_content()):
+        #    limitlen, threshold = 150, 0.66
+        else:
+            limitlen = 100
+    elemlen = len(text)
+    if elemlen < limitlen:
+        linklen, elemnum, shortelems, mylist = collect_link_info(links_xpath)
+        if elemnum == 0:
+            return True, mylist
+        LOGGER.debug('list link text/total: %s/%s – short elems/total: %s/%s', linklen, elemlen, shortelems, elemnum)
+        if linklen > elemlen*0.8 or (elemnum > 1 and shortelems/elemnum > 0.8):
+            return True, mylist
     return False, mylist
 
 
 def link_density_test_tables(element):
     '''Remove tables which are rich in links (probably boilerplate)'''
-    # if element.getnext() is not None:
-    #     return False
     links_xpath = element.findall('.//ref')
-    if links_xpath:
-        elemlen = len(trim(element.text_content()))
-        if elemlen > 250:
-            linklen, elemnum, _, _ = collect_link_info(links_xpath)
-            if elemnum == 0:
-                return True
-            LOGGER.debug('table link text: %s / total: %s', linklen, elemlen)
-            return linklen > 0.8*elemlen if elemlen < 1000 else linklen > 0.5*elemlen
-            # does more harm than good (issue #76)
-            #if shortelems > len(links_xpath) * 0.66:
-            #    return True
-    return False
+    if not links_xpath:
+        return False
+
+    elemlen = len(trim(element.text_content()))
+    if elemlen < 200:
+        return False
+
+    linklen, elemnum, _, _ = collect_link_info(links_xpath)
+    if elemnum == 0:
+        return True
+
+    LOGGER.debug('table link text: %s / total: %s', linklen, elemlen)
+    return linklen > 0.8*elemlen if elemlen < 1000 else linklen > 0.5*elemlen
 
 
 def delete_by_link_density(subtree, tagname, backtracking=False, favor_precision=False):
     '''Determine the link density of elements with respect to their length,
     and remove the elements identified as boilerplate.'''
     deletions = []
-    threshold = 200 if favor_precision else 100
+    len_threshold = 200 if favor_precision else 100
+    depth_threshold = 1 if favor_precision else 3
+
     for elem in subtree.iter(tagname):
         elemtext = trim(elem.text_content())
         result, templist = link_density_test(elem, elemtext, favor_precision)
-        if result:
+        if result or (
+            backtracking and templist and
+            0 < len(elemtext) < len_threshold and len(elem) >= depth_threshold
+        ):
            deletions.append(elem)
-        elif backtracking and templist: # if?
-            if 0 < len(elemtext) < threshold and len(elem) >= 3:
-                deletions.extend(elem)
-                # print('backtrack:', text)
-            # else: # and not re.search(r'[?!.]', text):
-            #     print(elem.tag, templist)
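
In plain terms, the refactored heuristic flags short elements whose text is mostly link text, or mostly very short links. A self-contained sketch of the same logic on plain strings — an illustration, not the library API (the real functions operate on lxml elements and use trim()):

def link_info(link_texts):
    # mirrors collect_link_info: total link length, count, short-link count
    texts = [t.strip() for t in link_texts if t.strip()]
    lengths = list(map(len, texts))
    return sum(lengths), len(texts), sum(1 for l in lengths if l < 10)

def is_link_heavy(element_text, link_texts, limitlen=100):
    # mirrors link_density_test for a generic element
    elemlen = len(element_text.strip())
    if elemlen >= limitlen:
        return False  # long elements are assumed to carry real content
    linklen, elemnum, shortelems = link_info(link_texts)
    if elemnum == 0:
        return True
    # boilerplate if links dominate the text or most links are very short
    return linklen > elemlen * 0.8 or (elemnum > 1 and shortelems / elemnum > 0.8)

print(is_link_heavy("Home News Sport About", ["Home", "News", "Sport", "About"]))  # True
print(is_link_heavy("A sentence with one link to a source.", ["source"]))  # False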

@@ -334,7 +323,7 @@
     "Simplify markup and convert relevant HTML tags to an XML standard."
     # delete links for faster processing
     if not options.links:
-        xpath_expr = './/div//a|.//ul//a' # .//p//a ?
+        xpath_expr = ".//*[self::div or self::li or self::p]//a"
         if options.tables:
             xpath_expr += '|.//table//a'
         # necessary for further detection
 
@@ -348,7 +337,7 @@
     for elem in tree.iter('a', 'ref'):
         elem.tag = 'ref'
         # replace href attribute and delete the rest
-        target = elem.get('href') # defaults to None
+        target = elem.get('href')  # defaults to None
         elem.attrib.clear()
         if target:
             # convert relative URLs
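
The widened expression also reaches links in paragraphs and in list items outside ul (for instance ordered lists), which the old './/div//a|.//ul//a' missed. A quick check of the new selector, assuming lxml:

from lxml import etree, html

doc = html.fromstring(
    '<html><body><div><a href="x">in div</a></div>'
    '<ol><li><a href="y">in ordered list</a></li></ol>'
    '<p>text <a href="z">in paragraph</a></p></body></html>'
)
selected = doc.xpath(".//*[self::div or self::li or self::p]//a")
print([a.text for a in selected])  # ['in div', 'in ordered list', 'in paragraph']
etree.strip_tags(doc, 'a')  # flatten the links but keep their text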
35 changes: 17 additions & 18 deletions trafilatura/main_extractor.py
@@ -18,8 +18,7 @@
 from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
                      DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
-                     PAYWALL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
-                     TEASER_DISCARD_XPATH)
+                     PRECISION_DISCARD_XPATH, TEASER_DISCARD_XPATH)
 
 
 LOGGER = logging.getLogger(__name__)
 
@@ -335,7 +334,7 @@ def handle_paragraphs(element, potential_tags, options):
     return None
 
 
-def define_cell_type(element, is_header):
+def define_cell_type(is_header):
     "Determine cell element type and mint new element."
     # define tag
     cell_element = Element("cell")
 
@@ -371,7 +370,7 @@ def handle_table(table_elem, potential_tags, options):
         elif subelement.tag in TABLE_ELEMS:
             is_header = subelement.tag == "th" and not seen_header_row
             seen_header = seen_header or is_header
-            new_child_elem = define_cell_type(subelement, is_header)
+            new_child_elem = define_cell_type(is_header)
             # process
             if len(subelement) == 0:
                 processed_cell = process_node(subelement, options)
 
@@ -385,7 +384,7 @@
                 if child.tag in TABLE_ALL:
                     # todo: define attributes properly
                     if child.tag in TABLE_ELEMS:
-                        # subcell_elem = define_cell_type(subelement)
+                        # subcell_elem = define_cell_type(is_header)
                         child.tag = "cell"
                     processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
                 # todo: lists in table cells
 
@@ -509,7 +508,6 @@ def prune_unwanted_sections(tree, potential_tags, options):
     favor_precision = options.focus == "precision"
     # prune the rest
     tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
-    tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
     # decide if images are preserved
     if 'graphic' not in potential_tags:
         tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
 
@@ -518,17 +516,24 @@
         tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
         if favor_precision:
             tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
-    # remove elements by link density
-    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
-    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
-    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
+    # remove elements by link density, several passes
+    for _ in range(2):
+        tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
+        tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
+        tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
+    # tables
+    if 'table' in potential_tags or favor_precision:
+        # tree = delete_by_link_density(tree, 'table', backtracking=False, favor_precision=favor_precision)
+        for elem in tree.iter('table'):
+            if link_density_test_tables(elem) is True:
+                elem.getparent().remove(elem)
     # also filter fw/head, table and quote elements?
     if favor_precision:
         # delete trailing titles
         while len(tree) > 0 and (tree[-1].tag == 'head'):
             tree[-1].getparent().remove(tree[-1])
-    tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
-    tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
+    tree = delete_by_link_density(tree, 'head', backtracking=False, favor_precision=True)
+    tree = delete_by_link_density(tree, 'quote', backtracking=False, favor_precision=True)
     return tree
 
@@ -550,12 +555,6 @@ def _extract(tree, options):
             continue
         # prune the subtree
         subtree = prune_unwanted_sections(subtree, potential_tags, options)
-        # second pass?
-        # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
-        if 'table' in potential_tags or options.focus == "precision":
-            for elem in subtree.iter('table'):
-                if link_density_test_tables(elem) is True:
-                    elem.getparent().remove(elem)
         # skip if empty tree
         if len(subtree) == 0:
            continue
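
Link-density pruning now runs twice: a first sweep can delete link-heavy children, which shrinks their parents below the length gate so the second sweep re-tests them; the table filter also moves here from _extract so all density checks happen in one place. A small runnable sketch of the pass order, with a stub standing in for the real delete_by_link_density:

def prune_by_link_density(tree, delete, favor_precision=False):
    # mirrors the two-pass loop now used in prune_unwanted_sections
    for _ in range(2):
        for tag, backtracking in (("div", True), ("list", False), ("p", False)):
            tree = delete(tree, tag, backtracking=backtracking,
                          favor_precision=favor_precision)
    return tree

calls = []
def fake_delete(tree, tag, backtracking=False, favor_precision=False):
    calls.append(tag)  # stub standing in for one real deletion sweep
    return tree

prune_by_link_density(object(), fake_delete)
print(calls)  # ['div', 'list', 'p', 'div', 'list', 'p']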
23 changes: 8 additions & 15 deletions trafilatura/xpaths.py
@@ -92,18 +92,9 @@
 # or contains(@class, 'comment') or contains(@id, 'comment')
 
 
-PAYWALL_DISCARD_XPATH = [XPath(
-    '''.//*[self::div or self::p][
-        contains(@id, "paywall") or contains(@id, "premium") or
-        contains(@class, "paid-content") or contains(@class, "paidcontent") or
-        contains(@class, "obfuscated") or contains(@class, "blurred") or
-        contains(@class, "restricted") or contains(@class, "overlay")
-    ]'''
-)]
-
-
 OVERALL_DISCARD_XPATH = [XPath(x) for x in (
     # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
+    # paywalls
     '''.//*[self::div or self::item or self::list
         or self::p or self::section or self::span][
         contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
 
@@ -142,14 +133,16 @@
         or contains(@class, "criteo") or contains(@class, "options") or contains(@class, "expand")
         or contains(@class, "consent") or contains(@class, "modal-content")
         or contains(@class, " ad ") or contains(@class, "permission")
-        or contains(@class, "next-") or contains(@class, "side-stories")
-        or contains(@class, "related-stories") or contains(@class, "most-popular")
-        or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm")
-        or contains(@class, "message-container") or contains(@id, "message_container")
+        or contains(@class, "next-") or contains(@class, "-stories")
+        or contains(@class, "most-popular") or contains(@class, "mol-factbox")
+        or starts-with(@class, "ZendeskForm") or contains(@id|@class, "message-container")
         or contains(@class, "yin") or contains(@class, "zlylin")
         or contains(@class, "xg1") or contains(@id, "bmdh")
         or contains(@class, "slide") or contains(@class, "viewport")
-        or @data-lp-replacement-content]''',
+        or @data-lp-replacement-content
+        or contains(@id, "premium") or contains(@class, "overlay")
+        or contains(@class, "paid-content") or contains(@class, "paidcontent")
+        or contains(@class, "obfuscated") or contains(@class, "blurred")]''',
 
     # comment debris + hidden parts
     '''.//*[@class="comments-title" or contains(@class, "comments-title") or
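
With PAYWALL_DISCARD_XPATH folded into OVERALL_DISCARD_XPATH, the "paywall" and "restricted" markers are dropped while "premium", "overlay", "paid-content", "paidcontent", "obfuscated" and "blurred" survive — which is why the unit test above switched from id="paywall" to id="premium". A quick check against an illustrative subset of the merged rule:

from lxml import html
from lxml.etree import XPath

# illustrative subset of the merged paywall clause in OVERALL_DISCARD_XPATH
subset = XPath('''.//*[self::div or self::p][
    contains(@id, "premium") or contains(@class, "overlay")
    or contains(@class, "paid-content") or contains(@class, "paidcontent")
    or contains(@class, "obfuscated") or contains(@class, "blurred")]''')

doc = html.fromstring('<html><body><p id="premium">Subscribe to read</p>'
                      '<p id="paywall">No longer matched</p></body></html>')
print([e.text for e in subset(doc)])  # ['Subscribe to read']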