Skip to content

Commit b4a5cd1

Browse files
author
Sam
committed
Extract clean conditionally reason
1 parent 6104b46 commit b4a5cd1

File tree

1 file changed

+23
-26
lines changed

1 file changed

+23
-26
lines changed

lib/readability.rb

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -417,38 +417,35 @@ def clean_conditionally(node, candidates, selector)
417417

418418
content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
419419
link_density = get_link_density(el)
420-
to_remove = false
421-
reason = ""
422-
423-
if counts["img"] > counts["p"]
424-
reason = "too many images"
425-
to_remove = true
426-
elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
427-
reason = "more <li>s than <p>s"
428-
to_remove = true
429-
elsif counts["input"] > (counts["p"] / 3).to_i
430-
reason = "less than 3x <p>s than <input>s"
431-
to_remove = true
432-
elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
433-
reason = "too short a content length without a single image"
434-
to_remove = true
435-
elsif weight < 25 && link_density > 0.2
436-
reason = "too many links for its weight (#{weight})"
437-
to_remove = true
438-
elsif weight >= 25 && link_density > 0.5
439-
reason = "too many links for its weight (#{weight})"
440-
to_remove = true
441-
elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
442-
reason = "<embed>s with too short a content length, or too many <embed>s"
443-
to_remove = true
444-
end
445420

446-
if to_remove
421+
reason = clean_conditionally_reason?(counts, content_length, options, weight, link_density)
422+
if reason
447423
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
448424
el.remove
449425
end
450426
end
451427
end
452428
end
429+
430+
# Picks the reason (if any) for which an element should be conditionally
# removed, based on its tag counts, text length, weight and link density.
#
# Despite the trailing "?", this is not a strict predicate: it returns a
# reason String (truthy) when a rule matches and nil when none does.
# Rules are checked in order and the first match wins.
#
# @param counts [Hash] per-tag counts; read for "img", "p", "li", "input", "embed"
# @param content_length [Integer] length of the element's stripped text
# @param options [Hash] :min_text_length overrides TEXT_LENGTH_THRESHOLD when set
# @param weight [Numeric] element weight; selects the link-density limit
# @param link_density [Numeric] link density of the element
# @param name [String, nil] tag name of the element being evaluated. It used
#   to be read from an undefined local, which raised NameError whenever the
#   <li> rule was reached. It is optional so existing five-argument callers
#   keep working, but when omitted the <ul>/<ol> exemption cannot apply, so
#   callers should pass it.
# @return [String, nil] human-readable removal reason, or nil to keep the element
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && !%w[ul ol].include?(name)
    # Lists themselves are expected to hold more <li>s than <p>s.
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif link_density > (weight < 25 ? 0.2 : 0.5)
    # Low-weight elements tolerate fewer links than high-weight ones.
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  end
end
449+
453450
end
454451
end

0 commit comments

Comments
 (0)