Skip to content
This repository was archived by the owner on Jan 4, 2022. It is now read-only.

add audit trail #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,10 @@ def score_paragraphs(self):

# WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]["content_score"] += content_score
candidates[parent_node]["audit_trail"].append(f"+{content_score} (child)")
if grand_parent_node is not None:
candidates[grand_parent_node]["content_score"] += content_score / 2.0
candidates[grand_parent_node]["audit_trail"].append(f"+{content_score / 2.0} (grandchild)")

# Scale the final candidates score based on link density. Good content
# should have a relatively small link density (5% or less) and be
Expand All @@ -399,39 +401,55 @@ def score_paragraphs(self):

return candidates

def class_weight_and_audit_trail(self, e):
    """Compute the class/id/tag weight of ``e`` and record how it was reached.

    The ``class`` and ``id`` attributes of ``e`` are each searched with
    ``REGEXES["negativeRe"]`` and ``REGEXES["positiveRe"]``, and with the
    optional ``self.positive_keywords`` / ``self.negative_keywords``
    patterns. The string ``"tag-" + e.tag`` is then matched against the two
    keyword patterns as well. Every hit moves the weight by 25 and appends
    one entry to the audit trail.

    Args:
        e: element exposing ``get(name, default)`` and a ``tag`` attribute.

    Returns:
        A ``(weight, audit_trail)`` tuple: the summed weight and a list with
        one string per adjustment, in the order the adjustments were applied.
    """
    audit_trail = []
    weight = 0

    for feature in (e.get("class", None), e.get("id", None)):
        # Missing or empty attributes contribute nothing.
        if not feature:
            continue

        if REGEXES["negativeRe"].search(feature):
            weight -= 25
            audit_trail.append("-25: negativeRe")

        if REGEXES["positiveRe"].search(feature):
            weight += 25
            audit_trail.append("+25: positiveRe")

        if self.positive_keywords and self.positive_keywords.search(feature):
            weight += 25
            audit_trail.append("+25: positive_keywords")

        if self.negative_keywords and self.negative_keywords.search(feature):
            weight -= 25
            audit_trail.append("-25: negative_keywords")

    # The keyword patterns may also target the tag itself via a "tag-<name>"
    # pseudo-feature; this uses match() (anchored at the start), not search().
    tag_feature = "tag-" + e.tag

    if self.positive_keywords and self.positive_keywords.match(tag_feature):
        weight += 25
        audit_trail.append("+25: positive_keywords")

    if self.negative_keywords and self.negative_keywords.match(tag_feature):
        weight -= 25
        audit_trail.append("-25: negative_keywords")

    # Single exit point: callers index/unpack the result, so it must always
    # be the (weight, audit_trail) pair and never the bare weight.
    return weight, audit_trail

def class_weight(self, elem):
    """Return only the numeric class/id/tag weight of ``elem``.

    Delegates to ``class_weight_and_audit_trail`` and discards the audit
    trail, so callers that only need the number keep working.
    """
    weight, _audit_trail = self.class_weight_and_audit_trail(elem)
    return weight

def score_node(self, elem):
content_score = self.class_weight(elem)
content_score, audit_trail = self.class_weight_and_audit_trail(elem)
name = elem.tag.lower()
audit_trail.append(f"total class_weight = {content_score}")
if name in ["div", "article"]:
content_score += 5
audit_trail.append("+5 div/article")
elif name in ["pre", "td", "blockquote"]:
content_score += 3
audit_trail.append("+3 pre/td/blockquote")

elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
content_score -= 3
audit_trail.append("-3 address/ol/ul/dl/dd/dt/li/form/aside")

elif name in [
"h1",
"h2",
Expand All @@ -445,7 +463,8 @@ def score_node(self, elem):
"nav",
]:
content_score -= 5
return {"content_score": content_score, "elem": elem}
audit_trail.append("-5 h")
return {"content_score": content_score, "elem": elem, "audit_trail": audit_trail}

def remove_unlikely_candidates(self):
for elem in self.html.findall(".//*"):
Expand Down
24 changes: 23 additions & 1 deletion tests/test_article_only.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os
import unittest

from readability import Document
import timeout_decorator

from readability import Document

SAMPLES = os.path.join(os.path.dirname(__file__), "samples")

Expand Down Expand Up @@ -124,3 +124,25 @@ def test_utf8_kanji(self):
sample = load_sample("utf-8-kanji.sample.html")
doc = Document(sample)
res = doc.summary()

def test_audit_trail(self):
    # Score a known sample and pick the winning candidate.
    html = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
    document = Document(html)
    scored_candidates = document.score_paragraphs()
    winner = document.select_best_candidate(scored_candidates)

    # The trail starts with the class-weight entries and the tag bonus
    # recorded by score_node, followed by one "(child)" entry per
    # contribution added in score_paragraphs, in the order applied.
    expected_trail = [
        "+25: positiveRe",
        "total class_weight = 25",
        "+5 div/article",
        "+2.83 (child)",
        "+9 (child)",
        "+13 (child)",
        "+10 (child)",
        "+10 (child)",
        "+9 (child)",
        "+15 (child)",
        "+2.65 (child)",
    ]
    self.assertListEqual(expected_trail, winner["audit_trail"])