extraction: review link and structure checks #653

Merged: 10 commits, Jul 26, 2024

Changes from all commits
4 changes: 1 addition & 3 deletions tests/comparison_small.py
@@ -34,12 +34,10 @@
 #from trafilatura.utils import load_html, sanitize
 #from trafilatura.xml import xmltotxt
 
-logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
+logging.basicConfig(stream=sys.stdout, level=logging.ERROR) # logging.WARNING
 
 TEST_DIR = os.path.abspath(os.path.dirname(__file__))
 
-## logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
-
 #JT_STOPLIST = jt_stoplist_init()
20 changes: 19 additions & 1 deletion tests/unit_tests.py
@@ -516,6 +516,11 @@ def test_links():
     mydoc = html.fromstring('<html><body><p>Test text under <a rel="license" href="">CC BY-SA license</a>.</p></body></html>')
     assert 'license="CC BY-SA license"' in extract(mydoc, include_links=True, no_fallback=True, output_format='xml', config=ZERO_CONFIG, with_metadata=True)
 
+    # link in p, length threshold
+    mydoc = html.fromstring(f'<html><body><article><p><a>f{"abcd"*20}</a></p></article></body></html>')
+    assert "abc" in extract(copy(mydoc), no_fallback=True, config=ZERO_CONFIG, favor_precision=False)
+    assert extract(mydoc, no_fallback=True, config=ZERO_CONFIG, favor_precision=True) == ""
+
 
 def test_tei():
     '''test TEI-related functions'''

@@ -752,7 +757,7 @@ def test_htmlprocessing():
     assert b'<p>A B tail C</p>' in etree.tostring(mydoc)
 
     # paywalls
-    my_html = '<html><body><main><p>1</p><p id="paywall">2</p><p>3</p></main></body></html>'
+    my_html = '<html><body><main><p>1</p><p id="premium">2</p><p>3</p></main></body></html>'
     assert extract(my_html, config=ZERO_CONFIG, no_fallback=True) == '1\n3'
     assert extract(my_html, config=ZERO_CONFIG, no_fallback=False) == '1\n3'
     # test tail of node deleted if set as text

@@ -1122,6 +1127,19 @@ def test_table_processing():
     result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
     assert "a | b | |" in result
 
+    # links: this gets through (for now)
+    htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == "a |"
+
+    # link: this is filtered out
+    htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == ""
+    htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{" "*100}</a></td></tr></table></article></body></html>'
+    result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert result == ""
+
 
 def test_list_processing():
     options = DEFAULT_OPTIONS
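
Taken together, these tests pin down the new thresholds: a paragraph consisting of one long link survives by default but is pruned under favor_precision, and link-only table cells are filtered once the link text grows long. A minimal sketch for trying this outside the suite — note the tests run with a zero-minimum-length config (ZERO_CONFIG), so with default settings extract() may return None for documents this small:

# Sketch only: the tests above use ZERO_CONFIG; default settings enforce
# minimum extraction sizes, so these tiny documents may yield None instead.
from trafilatura import extract

# paragraph consisting of a single long link: kept by default,
# pruned as link-heavy boilerplate under favor_precision
doc = f'<html><body><article><p><a>{"abcd" * 20}</a></p></article></body></html>'
print(extract(doc, no_fallback=True, favor_precision=False))
print(extract(doc, no_fallback=True, favor_precision=True))

# table cell made of one very long link: now filtered out
table = f'<html><body><article><table><tr><td><a href="link.html">{"abc" * 100}</a></td></tr></table></article></body></html>'
print(extract(table, no_fallback=True, output_format='txt', include_tables=True))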
4 changes: 1 addition & 3 deletions trafilatura/external.py
@@ -18,7 +18,7 @@
 from .settings import JUSTEXT_LANGUAGES
 from .utils import fromstring_bytes, trim
 from .xml import TEI_VALID_TAGS
-from .xpaths import OVERALL_DISCARD_XPATH, PAYWALL_DISCARD_XPATH
+from .xpaths import OVERALL_DISCARD_XPATH
 
 LOGGER = logging.getLogger(__name__)
 
@@ -48,7 +48,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
 
     use_readability, jt_result = False, False
     # prior cleaning
-    backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
     if options.focus == "precision":
         backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
 
@@ -92,7 +91,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     # override faulty extraction: try with justext
     if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
         LOGGER.debug('unclean document triggering justext examination: %s', options.source)
-        # tree = prune_unwanted_sections(tree, {}, options)
         body2, text2, len_text2 = justext_rescue(tree, options)
         jt_result = bool(text2)
         # prevent too short documents from replacing the main text
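
The separate paywall pass is dropped here because those patterns now sit in OVERALL_DISCARD_XPATH (see trafilatura/xpaths.py below), which the main extraction already applies. A minimal sketch of what this kind of XPath pruning does, using a simplified stand-in for prune_unwanted_nodes and one illustrative pattern:

# simplified stand-in for prune_unwanted_nodes, with one illustrative pattern
from lxml import html

tree = html.fromstring('<html><body><p>Free text.</p>'
                       '<div id="premium-box">Subscribe to continue</div></body></html>')
for node in tree.xpath('.//*[self::div or self::p][contains(@id, "premium")]'):
    node.getparent().remove(node)  # drop the flagged subtree entirely
print(html.tostring(tree))  # b'<html><body><p>Free text.</p></body></html>'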
117 changes: 53 additions & 64 deletions trafilatura/htmlprocessing.py
@@ -34,7 +34,6 @@
 
 HTML_TAG_MAPPING = {v: k for k, v in REND_TAG_MAPPING.items()}
 
-
 PRESERVE_IMG_CLEANING = {'figure', 'picture', 'source'}
 
 
@@ -110,91 +109,81 @@
     return tree
 
 
-def collect_link_info(links_xpath, favor_precision=False):
+def collect_link_info(links_xpath):
     '''Collect heuristics on link text'''
-    # init
-    mylist = []
-    # longer strings impact recall in favor of precision
-    threshold = 50 if favor_precision else 10
-    # examine the elements
-    for subelem in links_xpath:
-        subelemtext = trim(subelem.text_content())
-        if subelemtext:
-            mylist.append(subelemtext)
-    shortelems = sum(1 for text in mylist if len(text) < threshold)
-    lengths = sum(len(text) for text in mylist)
-    return lengths, len(mylist), shortelems, mylist
+    mylist = [e for e in (trim(elem.text_content()) for elem in links_xpath) if e]
+    lengths = list(map(len, mylist))
+    shortelems = sum(1 for l in lengths if l < 10)
+    return sum(lengths), len(mylist), shortelems, mylist
 
 
 def link_density_test(element, text, favor_precision=False):
     '''Remove sections which are rich in links (probably boilerplate)'''
-    links_xpath, mylist = element.findall('.//ref'), []
-    if links_xpath:
-        if element.tag == 'p': # and not element.getparent().tag == 'item'
-            if not favor_precision:
-                if element.getnext() is None:
-                    limitlen, threshold = 60, 0.8
-                else:
-                    limitlen, threshold = 30, 0.8
-            else:
-                limitlen, threshold = 200, 0.8
-            #if 'hi' in list(element):
-            #    limitlen, threshold = 100, 0.8
-        #elif element.tag == 'head':
-        #    limitlen, threshold = 50, 0.8
-        else:
-            if element.getnext() is None:
-                limitlen, threshold = 300, 0.8
-            #elif re.search(r'[.?!:]', elemtext):
-            #    limitlen, threshold = 150, 0.66
-            else:
-                limitlen, threshold = 100, 0.8
-        elemlen = len(text)
-        if elemlen < limitlen:
-            linklen, elemnum, shortelems, mylist = collect_link_info(links_xpath, favor_precision)
-            if elemnum == 0:
-                return True, mylist
-            LOGGER.debug('list link text/total: %s/%s – short elems/total: %s/%s', linklen, elemlen, shortelems, elemnum)
-            # (elemnum > 1 and shortelems/elemnum > 0.8):
-            if linklen > threshold*elemlen or (elemnum > 1 and shortelems/elemnum > 0.8):
-                return True, mylist
+    links_xpath = element.findall('.//ref')
+    if not links_xpath:
+        return False, []
+    mylist = []
+    # shortcut
+    if len(links_xpath) == 1:
+        len_threshold = 10 if favor_precision else 100
+        link_text = trim(links_xpath[0].text_content())
+        if len(link_text) > len_threshold and len(link_text) > len(text)*0.9:
+            return True, []
+    if element.tag == 'p':
+        limitlen = 60 if element.getnext() is None else 30
+    else:
+        if element.getnext() is None:
+            limitlen = 300
+        #elif re.search(r'[.?!:]', element.text_content()):
+        #    limitlen, threshold = 150, 0.66
+        else:
+            limitlen = 100
+    elemlen = len(text)
+    if elemlen < limitlen:
+        linklen, elemnum, shortelems, mylist = collect_link_info(links_xpath)
+        if elemnum == 0:
+            return True, mylist
+        LOGGER.debug('list link text/total: %s/%s – short elems/total: %s/%s', linklen, elemlen, shortelems, elemnum)
+        if linklen > elemlen*0.8 or (elemnum > 1 and shortelems/elemnum > 0.8):
+            return True, mylist
     return False, mylist
 
 
 def link_density_test_tables(element):
     '''Remove tables which are rich in links (probably boilerplate)'''
-    # if element.getnext() is not None:
-    #     return False
     links_xpath = element.findall('.//ref')
-    if links_xpath:
-        elemlen = len(trim(element.text_content()))
-        if elemlen > 250:
-            linklen, elemnum, _, _ = collect_link_info(links_xpath)
-            if elemnum == 0:
-                return True
-            LOGGER.debug('table link text: %s / total: %s', linklen, elemlen)
-            return linklen > 0.8*elemlen if elemlen < 1000 else linklen > 0.5*elemlen
-            # does more harm than good (issue #76)
-            #if shortelems > len(links_xpath) * 0.66:
-            #    return True
-    return False
+    if not links_xpath:
+        return False
+
+    elemlen = len(trim(element.text_content()))
+    if elemlen < 200:
+        return False
+
+    linklen, elemnum, _, _ = collect_link_info(links_xpath)
+    if elemnum == 0:
+        return True
+
+    LOGGER.debug('table link text: %s / total: %s', linklen, elemlen)
+    return linklen > 0.8*elemlen if elemlen < 1000 else linklen > 0.5*elemlen
 
 
 def delete_by_link_density(subtree, tagname, backtracking=False, favor_precision=False):
     '''Determine the link density of elements with respect to their length,
     and remove the elements identified as boilerplate.'''
     deletions = []
-    threshold = 200 if favor_precision else 100
+    len_threshold = 200 if favor_precision else 100
+    depth_threshold = 1 if favor_precision else 3
+
     for elem in subtree.iter(tagname):
         elemtext = trim(elem.text_content())
         result, templist = link_density_test(elem, elemtext, favor_precision)
-        if result:
+        if result or (
+            backtracking and templist and
+            0 < len(elemtext) < len_threshold and len(elem) >= depth_threshold
+        ):
            deletions.append(elem)
-        elif backtracking and templist: # if?
-            if 0 < len(elemtext) < threshold and len(elem) >= 3:
-                deletions.extend(elem)
-                # print('backtrack:', text)
-            # else: # and not re.search(r'[?!.]', text):
-            #     print(elem.tag, templist)
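
In plain terms, the refactored heuristic flags short elements whose text is mostly link text, or mostly very short links. A self-contained sketch of the same logic on plain strings — an illustration, not the library API (the real functions operate on lxml elements and use trim()):

def link_info(link_texts):
    # mirrors collect_link_info: total link length, count, short-link count
    texts = [t.strip() for t in link_texts if t.strip()]
    lengths = list(map(len, texts))
    return sum(lengths), len(texts), sum(1 for l in lengths if l < 10)

def is_link_heavy(element_text, link_texts, limitlen=100):
    # mirrors link_density_test for a generic element
    elemlen = len(element_text.strip())
    if elemlen >= limitlen:
        return False  # long elements are assumed to carry real content
    linklen, elemnum, shortelems = link_info(link_texts)
    if elemnum == 0:
        return True
    # boilerplate if links dominate the text or most links are very short
    return linklen > elemlen * 0.8 or (elemnum > 1 and shortelems / elemnum > 0.8)

print(is_link_heavy("Home News Sport About", ["Home", "News", "Sport", "About"]))  # True
print(is_link_heavy("A sentence with one link to a source.", ["source"]))  # False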

@@ -334,7 +323,7 @@
     "Simplify markup and convert relevant HTML tags to an XML standard."
     # delete links for faster processing
     if not options.links:
-        xpath_expr = './/div//a|.//ul//a' # .//p//a ?
+        xpath_expr = ".//*[self::div or self::li or self::p]//a"
         if options.tables:
             xpath_expr += '|.//table//a'
         # necessary for further detection
 
@@ -348,7 +337,7 @@
     for elem in tree.iter('a', 'ref'):
         elem.tag = 'ref'
         # replace href attribute and delete the rest
-        target = elem.get('href') # defaults to None
+        target = elem.get('href')  # defaults to None
         elem.attrib.clear()
         if target:
             # convert relative URLs
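
The widened expression also reaches links in paragraphs and in list items outside ul (for instance ordered lists), which the old './/div//a|.//ul//a' missed. A quick check of the new selector, assuming lxml:

from lxml import etree, html

doc = html.fromstring(
    '<html><body><div><a href="x">in div</a></div>'
    '<ol><li><a href="y">in ordered list</a></li></ol>'
    '<p>text <a href="z">in paragraph</a></p></body></html>'
)
selected = doc.xpath(".//*[self::div or self::li or self::p]//a")
print([a.text for a in selected])  # ['in div', 'in ordered list', 'in paragraph']
etree.strip_tags(doc, 'a')  # flatten the links but keep their text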
35 changes: 17 additions & 18 deletions trafilatura/main_extractor.py
@@ -18,8 +18,7 @@
 from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
                      DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
-                     PAYWALL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
-                     TEASER_DISCARD_XPATH)
+                     PRECISION_DISCARD_XPATH, TEASER_DISCARD_XPATH)
 
 
 LOGGER = logging.getLogger(__name__)
 
@@ -335,7 +334,7 @@ def handle_paragraphs(element, potential_tags, options):
     return None
 
 
-def define_cell_type(element, is_header):
+def define_cell_type(is_header):
     "Determine cell element type and mint new element."
     # define tag
     cell_element = Element("cell")
 
@@ -371,7 +370,7 @@ def handle_table(table_elem, potential_tags, options):
         elif subelement.tag in TABLE_ELEMS:
             is_header = subelement.tag == "th" and not seen_header_row
             seen_header = seen_header or is_header
-            new_child_elem = define_cell_type(subelement, is_header)
+            new_child_elem = define_cell_type(is_header)
             # process
             if len(subelement) == 0:
                 processed_cell = process_node(subelement, options)
 
@@ -385,7 +384,7 @@
                 if child.tag in TABLE_ALL:
                     # todo: define attributes properly
                     if child.tag in TABLE_ELEMS:
-                        # subcell_elem = define_cell_type(subelement)
+                        # subcell_elem = define_cell_type(is_header)
                         child.tag = "cell"
                     processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
                 # todo: lists in table cells
 
@@ -509,7 +508,6 @@ def prune_unwanted_sections(tree, potential_tags, options):
     favor_precision = options.focus == "precision"
     # prune the rest
     tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
-    tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
     # decide if images are preserved
     if 'graphic' not in potential_tags:
         tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
 
@@ -518,17 +516,24 @@
         tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
         if favor_precision:
             tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
-    # remove elements by link density
-    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
-    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
-    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
+    # remove elements by link density, several passes
+    for _ in range(2):
+        tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
+        tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
+        tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
+    # tables
+    if 'table' in potential_tags or favor_precision:
+        # tree = delete_by_link_density(tree, 'table', backtracking=False, favor_precision=favor_precision)
+        for elem in tree.iter('table'):
+            if link_density_test_tables(elem) is True:
+                elem.getparent().remove(elem)
     # also filter fw/head, table and quote elements?
     if favor_precision:
         # delete trailing titles
         while len(tree) > 0 and (tree[-1].tag == 'head'):
             tree[-1].getparent().remove(tree[-1])
-    tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
-    tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
+    tree = delete_by_link_density(tree, 'head', backtracking=False, favor_precision=True)
+    tree = delete_by_link_density(tree, 'quote', backtracking=False, favor_precision=True)
     return tree
 
@@ -550,12 +555,6 @@ def _extract(tree, options):
             continue
         # prune the subtree
         subtree = prune_unwanted_sections(subtree, potential_tags, options)
-        # second pass?
-        # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
-        if 'table' in potential_tags or options.focus == "precision":
-            for elem in subtree.iter('table'):
-                if link_density_test_tables(elem) is True:
-                    elem.getparent().remove(elem)
         # skip if empty tree
         if len(subtree) == 0:
            continue
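
Link-density pruning now runs twice: a first sweep can delete link-heavy children, which shrinks their parents below the length gate so the second sweep re-tests them; the table filter also moves here from _extract so all density checks happen in one place. A small runnable sketch of the pass order, with a stub standing in for the real delete_by_link_density:

def prune_by_link_density(tree, delete, favor_precision=False):
    # mirrors the two-pass loop now used in prune_unwanted_sections
    for _ in range(2):
        for tag, backtracking in (("div", True), ("list", False), ("p", False)):
            tree = delete(tree, tag, backtracking=backtracking,
                          favor_precision=favor_precision)
    return tree

calls = []
def fake_delete(tree, tag, backtracking=False, favor_precision=False):
    calls.append(tag)  # stub standing in for one real deletion sweep
    return tree

prune_by_link_density(object(), fake_delete)
print(calls)  # ['div', 'list', 'p', 'div', 'list', 'p']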
23 changes: 8 additions & 15 deletions trafilatura/xpaths.py
@@ -92,18 +92,9 @@
 # or contains(@class, 'comment') or contains(@id, 'comment')
 
 
-PAYWALL_DISCARD_XPATH = [XPath(
-    '''.//*[self::div or self::p][
-        contains(@id, "paywall") or contains(@id, "premium") or
-        contains(@class, "paid-content") or contains(@class, "paidcontent") or
-        contains(@class, "obfuscated") or contains(@class, "blurred") or
-        contains(@class, "restricted") or contains(@class, "overlay")
-    ]'''
-)]
-
-
 OVERALL_DISCARD_XPATH = [XPath(x) for x in (
     # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
+    # paywalls
     '''.//*[self::div or self::item or self::list
         or self::p or self::section or self::span][
         contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
 
@@ -142,14 +133,16 @@
         or contains(@class, "criteo") or contains(@class, "options") or contains(@class, "expand")
         or contains(@class, "consent") or contains(@class, "modal-content")
         or contains(@class, " ad ") or contains(@class, "permission")
-        or contains(@class, "next-") or contains(@class, "side-stories")
-        or contains(@class, "related-stories") or contains(@class, "most-popular")
-        or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm")
-        or contains(@class, "message-container") or contains(@id, "message_container")
+        or contains(@class, "next-") or contains(@class, "-stories")
+        or contains(@class, "most-popular") or contains(@class, "mol-factbox")
+        or starts-with(@class, "ZendeskForm") or contains(@id|@class, "message-container")
         or contains(@class, "yin") or contains(@class, "zlylin")
         or contains(@class, "xg1") or contains(@id, "bmdh")
         or contains(@class, "slide") or contains(@class, "viewport")
-        or @data-lp-replacement-content]''',
+        or @data-lp-replacement-content
+        or contains(@id, "premium") or contains(@class, "overlay")
+        or contains(@class, "paid-content") or contains(@class, "paidcontent")
+        or contains(@class, "obfuscated") or contains(@class, "blurred")]''',
 
     # comment debris + hidden parts
     '''.//*[@class="comments-title" or contains(@class, "comments-title") or
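
With PAYWALL_DISCARD_XPATH folded into OVERALL_DISCARD_XPATH, the "paywall" and "restricted" markers are dropped while "premium", "overlay", "paid-content", "paidcontent", "obfuscated" and "blurred" survive — which is why the unit test above switched from id="paywall" to id="premium". A quick check against an illustrative subset of the merged rule:

from lxml import html
from lxml.etree import XPath

# illustrative subset of the merged paywall clause in OVERALL_DISCARD_XPATH
subset = XPath('''.//*[self::div or self::p][
    contains(@id, "premium") or contains(@class, "overlay")
    or contains(@class, "paid-content") or contains(@class, "paidcontent")
    or contains(@class, "obfuscated") or contains(@class, "blurred")]''')

doc = html.fromstring('<html><body><p id="premium">Subscribe to read</p>'
                      '<p id="paywall">No longer matched</p></body></html>')
print([e.text for e in subset(doc)])  # ['Subscribe to read']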