Skip to content
This repository was archived by the owner on Jan 4, 2022. It is now read-only.

small content score refactoring #3

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,11 +352,6 @@ def score_paragraphs(self):
candidates = {}
ordered = []
for elem in self.tags(self._html(), "p", "pre", "td"):
parent_node = elem.getparent()
if parent_node is None:
continue
grand_parent_node = parent_node.getparent()

inner_text = clean(elem.text_content() or "")
inner_text_len = len(inner_text)

Expand All @@ -365,24 +360,23 @@ def score_paragraphs(self):
if inner_text_len < MIN_LEN:
continue

if parent_node not in candidates:
candidates[parent_node] = self.score_node(parent_node)
ordered.append(parent_node)

if grand_parent_node is not None and grand_parent_node not in candidates:
candidates[grand_parent_node] = self.score_node(grand_parent_node)
ordered.append(grand_parent_node)

content_score = 1
content_score += len(inner_text.split(","))
content_score += min((inner_text_len / 100), 3)
# if elem not in candidates:
# candidates[elem] = self.score_node(elem)

# WTF? candidates[elem]['content_score'] += content_score
candidates[parent_node]["content_score"] += content_score
if grand_parent_node is not None:
candidates[grand_parent_node]["content_score"] += content_score / 2.0
p = elem.getparent()
depth = 1
while p is not None and depth <= 2:
if p not in candidates:
candidates[p] = self.score_node(p)
ordered.append(p)

candidates[p]["content_score"] += content_score / depth
p = p.getparent()
depth += 1

# Scale each candidate's final score based on link density. Good content
# should have a relatively small link density (5% or less) and be
Expand Down