Skip to content

Commit

Permalink
LXML: compile most XPath expressions (#504)
Browse files: browse the repository at this point in the history
* LXML: compile XPath expressions

* compile all XPath expressions
  • Loading branch information
adbar authored Feb 13, 2024
1 parent ca32cab commit 27bb013
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 42 deletions.
10 changes: 5 additions & 5 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import warnings
from copy import deepcopy

from lxml.etree import Element, SubElement, strip_elements, strip_tags
from lxml.etree import Element, SubElement, XPath, strip_elements, strip_tags
from lxml.html import tostring

# own
Expand Down Expand Up @@ -545,7 +545,7 @@ def extract_content(tree, options):
for expr in BODY_XPATH:
# select tree if the expression has been found
try:
subtree = tree.xpath(expr)[0]
subtree = expr(tree)[0]
except IndexError:
continue
# prune the subtree
Expand Down Expand Up @@ -624,7 +624,7 @@ def extract_comments(tree, options):
# potential_tags.add('div') trouble with <div class="comment-author meta">
for expr in COMMENTS_XPATH:
# select tree if the expression has been found
subtree = tree.xpath(expr)
subtree = expr(tree)
if not subtree:
continue
subtree = subtree[0]
Expand Down Expand Up @@ -714,7 +714,7 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):

def basic_cleaning(tree):
"Remove a few section types from the document."
for elem in tree.xpath(BASIC_CLEAN_XPATH):
for elem in BASIC_CLEAN_XPATH(tree):
elem.getparent().remove(elem)
return tree

Expand Down Expand Up @@ -952,7 +952,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
if prune_xpath is not None:
if isinstance(prune_xpath, str):
prune_xpath = [prune_xpath]
tree = prune_unwanted_nodes(tree, prune_xpath)
tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

# backup (or not) for further processing
tree_backup_1 = deepcopy(tree) if no_fallback is False else None
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from copy import deepcopy

from courlan.urlutils import fix_relative_urls, get_base_url
from lxml.etree import strip_tags
from lxml.etree import XPath, strip_tags

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED
Expand Down Expand Up @@ -84,8 +84,8 @@ def prune_unwanted_nodes(tree, nodelist, with_backup=False):
if with_backup is True:
old_len = len(tree.text_content()) # ' '.join(tree.itertext())
backup = deepcopy(tree)
for expr in nodelist:
for subtree in tree.xpath(expr):
for expression in nodelist:
for subtree in expression(tree):
# preserve tail text from deletion
if subtree.tail is not None:
previous = subtree.getprevious()
Expand Down
4 changes: 2 additions & 2 deletions trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def extract_metainfo(tree, expressions, len_limit=200):
for expression in expressions:
# examine all results
i = 0
for elem in tree.xpath(expression):
for elem in expression(tree):
content = trim(' '.join(elem.itertext()))
if content and 2 < len(content) < len_limit:
return content
Expand Down Expand Up @@ -405,7 +405,7 @@ def extract_catstags(metatype, tree):
for catexpr in xpath_expression:
results.extend(
elem.text_content()
for elem in tree.xpath(catexpr)
for elem in catexpr(tree)
if re.search(regexpr, elem.attrib['href'])
)
if results:
Expand Down
22 changes: 12 additions & 10 deletions trafilatura/metaxpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,29 @@
# code available from https://github.com/adbar/trafilatura/
# under GNU GPLv3+ license

from lxml.etree import XPath


# the order or depth of XPaths could be changed after exhaustive testing
author_xpaths = [
author_xpaths = [XPath(x) for x in (
'//*[(self::a or self::address or self::div or self::link or self::p or self::span or self::strong)][@rel="author" or @id="author" or @class="author" or @itemprop="author name" or rel="me" or contains(@class, "author-name") or contains(@class, "AuthorName") or contains(@class, "authorName") or contains(@class, "author name")]|//author', # specific and almost specific
'//*[(self::a or self::div or self::h3 or self::h4 or self::p or self::span)][contains(@class, "author") or contains(@id, "author") or contains(@itemprop, "author") or @class="byline" or contains(@id, "zuozhe") or contains(@class, "zuozhe") or contains(@id, "bianji") or contains(@class, "bianji") or contains(@id, "xiaobian") or contains(@class, "xiaobian") or contains(@class, "submitted-by") or contains(@class, "posted-by") or @class="username" or @class="BBL" or contains(@class, "journalist-name")]', # almost generic and generic, last ones not common
'//*[contains(translate(@id, "A", "a"), "author") or contains(translate(@class, "A", "a"), "author") or contains(@class, "screenname") or contains(@data-component, "Byline") or contains(@itemprop, "author") or contains(@class, "writer") or contains(translate(@class, "B", "b"), "byline")]', # last resort: any element
]
)]


author_discard_xpaths = [
author_discard_xpaths = [XPath(x) for x in (
""".//*[(self::a or self::div or self::section or self::span)][@id='comments' or @class='comments' or @class='title' or @class='date' or
contains(@id, 'commentlist') or contains(@class, 'commentlist') or contains(@class, 'sidebar') or contains(@class, 'is-hidden') or contains(@class, 'quote')
or contains(@id, 'comment-list') or contains(@class, 'comments-list') or contains(@class, 'embedly-instagram') or contains(@id, 'ProductReviews') or
starts-with(@id, 'comments') or contains(@data-component, "Figure") or contains(@class, "article-share") or contains(@class, "article-support") or contains(@class, "print") or contains(@class, "category") or contains(@class, "meta-date") or contains(@class, "meta-reviewer")
or starts-with(@class, 'comments') or starts-with(@class, 'Comments')
]""",
'//time|//figure',
]
)]


categories_xpaths = [
categories_xpaths = [XPath(x) for x in (
"""//div[starts-with(@class, 'post-info') or starts-with(@class, 'postinfo') or
starts-with(@class, 'post-meta') or starts-with(@class, 'postmeta') or
starts-with(@class, 'meta') or starts-with(@class, 'entry-meta') or starts-with(@class, 'entry-info') or
Expand All @@ -35,26 +37,26 @@
'//*[(self::li or self::span)][@class="post-category" or @class="postcategory" or @class="entry-category" or contains(@class, "cat-links")]//a[@href]',
'//header[@class="entry-header"]//a[@href]',
'//div[@class="row" or @class="tags"]//a[@href]',
]
)]
# "//*[(self::div or self::p)][contains(@class, 'byline')]",


tags_xpaths = [
tags_xpaths = [XPath(x) for x in (
'//div[@class="tags"]//a[@href]',
"//p[starts-with(@class, 'entry-tags')]//a[@href]",
'''//div[@class="row" or @class="jp-relatedposts" or
@class="entry-utility" or starts-with(@class, 'tag') or
starts-with(@class, 'postmeta') or starts-with(@class, 'meta')]//a[@href]''',
'//*[@class="entry-meta" or contains(@class, "topics") or contains(@class, "tags-links")]//a[@href]',
]
)]
# "related-topics"
# https://github.com/grangier/python-goose/blob/develop/goose/extractors/tags.py


title_xpaths = [
title_xpaths = [XPath(x) for x in (
'//*[(self::h1 or self::h2)][contains(@class, "post-title") or contains(@class, "entry-title") or contains(@class, "headline") or contains(@id, "headline") or contains(@itemprop, "headline") or contains(@class, "post__title") or contains(@class, "article-title")]',
'//*[@class="entry-title" or @class="post-title"]',
'//*[(self::h1 or self::h2 or self::h3)][contains(@class, "title") or contains(@id, "title")]',
]
)]
# json-ld headline
# '//header/h1',
4 changes: 3 additions & 1 deletion trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from os import cpu_count
from pathlib import Path

from lxml.etree import XPath



def use_config(filename=None, config=None):
Expand Down Expand Up @@ -72,7 +74,7 @@ def use_config(filename=None, config=None):
]
# 'center', 'rb', 'wbr'

BASIC_CLEAN_XPATH = ".//aside|.//footer|.//script|.//style"
BASIC_CLEAN_XPATH = XPath(".//aside|.//footer|.//script|.//style")

TAG_CATALOG = frozenset(['blockquote', 'code', 'del', 'head', 'hi', 'lb', 'list', 'p', 'pre', 'quote'])
# + list(CUT_EMPTY_ELEMS)
Expand Down
45 changes: 24 additions & 21 deletions trafilatura/xpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
## under GNU GPL v3 license


BODY_XPATH = [
from lxml.etree import XPath


BODY_XPATH = [XPath(x) for x in (
'''.//*[(self::article or self::div or self::main or self::section)][
@class="post" or @class="entry" or
contains(@class, "post-text") or contains(@class, "post_text") or
Expand Down Expand Up @@ -46,7 +49,7 @@
or contains(translate(@class, "CP","cp"), "page-content") or
@id="content" or @class="content"])[1]''',
'(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
]
)]
# starts-with(@id, "article") or
# or starts-with(@id, "story") or contains(@class, "story")
# starts-with(@class, "content ") or contains(@class, " content")
Expand All @@ -58,7 +61,7 @@
# './/span[@class=""]', # instagram?


COMMENTS_XPATH = [
COMMENTS_XPATH = [XPath(x) for x in (
""".//*[(self::div or self::list or self::section)][contains(@id, 'commentlist')
or contains(@class, 'commentlist') or contains(@class, 'comment-page') or
contains(@id, 'comment-list') or contains(@class, 'comments-list') or
Expand All @@ -70,34 +73,34 @@
""".//*[(self::div or self::section or self::list)][starts-with(@id, 'comol') or
starts-with(@id, 'disqus_thread') or starts-with(@id, 'dsq-comments')]""",
".//*[(self::div or self::section)][starts-with(@id, 'social') or contains(@class, 'comment')]",
]
)]
# or contains(@class, 'Comments')


REMOVE_COMMENTS_XPATH = [
REMOVE_COMMENTS_XPATH = [XPath(
""".//*[(self::div or self::list or self::section)][
starts-with(translate(@id, "C","c"), 'comment') or
starts-with(translate(@class, "C","c"), 'comment') or
contains(@class, 'article-comments') or contains(@class, 'post-comments')
or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
or starts-with(@id, 'dsq-comments')
]""",
]
]"""
)]
# or self::span
# or contains(@class, 'comment') or contains(@id, 'comment')


PAYWALL_DISCARD_XPATH = [
PAYWALL_DISCARD_XPATH = [XPath(
'''.//*[(self::div or self::p)][
contains(@id, "paywall") or contains(@id, "premium") or
contains(@class, "paid-content") or contains(@class, "paidcontent") or
contains(@class, "obfuscated") or contains(@class, "blurred") or
contains(@class, "restricted") or contains(@class, "overlay")
]''',
]
]'''
)]


OVERALL_DISCARD_XPATH = [
OVERALL_DISCARD_XPATH = [XPath(x) for x in (
# navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
'''.//*[(self::div or self::item or self::list
or self::p or self::section or self::span)][
Expand Down Expand Up @@ -155,7 +158,7 @@
or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
or contains(@class, "notloaded")]''',
]
)]
# conflicts:
# contains(@id, "header") or contains(@class, "header") or
# class contains "cats" (categories, also tags?)
Expand All @@ -166,40 +169,40 @@


# the following conditions focus on extraction precision
TEASER_DISCARD_XPATH = [
TEASER_DISCARD_XPATH = [XPath(
'''.//*[(self::div or self::item or self::list
or self::p or self::section or self::span)][
contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
]''',
]
]'''
)]


PRECISION_DISCARD_XPATH = [
PRECISION_DISCARD_XPATH = [XPath(x) for x in (
'.//header',
'''.//*[(self::div or self::item or self::list
or self::p or self::section or self::span)][
contains(@id, "bottom") or contains(@class, "bottom") or
contains(@id, "link") or contains(@class, "link")
or contains(@style, "border")
]''',
]
)]


DISCARD_IMAGE_ELEMENTS = [
DISCARD_IMAGE_ELEMENTS = [XPath(
'''.//*[(self::div or self::item or self::list
or self::p or self::section or self::span)][
contains(@id, "caption") or contains(@class, "caption")
]
'''
]
)]


COMMENTS_DISCARD_XPATH = [
COMMENTS_DISCARD_XPATH = [XPath(x) for x in (
'.//*[(self::div or self::section)][starts-with(@id, "respond")]',
'.//cite|.//quote',
'''.//*[@class="comments-title" or contains(@class, "comments-title") or
contains(@class, "nocomments") or starts-with(@id, "reply-") or
starts-with(@class, "reply-") or contains(@class, "-reply-") or contains(@class, "message")
or contains(@class, "signin") or
contains(@id, "akismet") or contains(@class, "akismet") or contains(@style, "display:none")]''',
]
)]

0 comments on commit 27bb013

Please sign in to comment.