@@ -417,38 +417,35 @@ def clean_conditionally(node, candidates, selector)
417
417
418
418
content_length = el . text . strip . length # Count the text length excluding any surrounding whitespace
419
419
link_density = get_link_density ( el )
420
- to_remove = false
421
- reason = ""
422
-
423
- if counts [ "img" ] > counts [ "p" ]
424
- reason = "too many images"
425
- to_remove = true
426
- elsif counts [ "li" ] > counts [ "p" ] && name != "ul" && name != "ol"
427
- reason = "more <li>s than <p>s"
428
- to_remove = true
429
- elsif counts [ "input" ] > ( counts [ "p" ] / 3 ) . to_i
430
- reason = "less than 3x <p>s than <input>s"
431
- to_remove = true
432
- elsif content_length < ( options [ :min_text_length ] || TEXT_LENGTH_THRESHOLD ) && ( counts [ "img" ] == 0 || counts [ "img" ] > 2 )
433
- reason = "too short a content length without a single image"
434
- to_remove = true
435
- elsif weight < 25 && link_density > 0.2
436
- reason = "too many links for its weight (#{ weight } )"
437
- to_remove = true
438
- elsif weight >= 25 && link_density > 0.5
439
- reason = "too many links for its weight (#{ weight } )"
440
- to_remove = true
441
- elsif ( counts [ "embed" ] == 1 && content_length < 75 ) || counts [ "embed" ] > 1
442
- reason = "<embed>s with too short a content length, or too many <embed>s"
443
- to_remove = true
444
- end
445
420
446
- if to_remove
421
+ reason = clean_conditionally_reason? ( counts , content_length , options , weight , link_density )
422
+ if reason
447
423
debug ( "Conditionally cleaned #{ name } ##{ el [ :id ] } .#{ el [ :class ] } with weight #{ weight } and content score #{ content_score } because it has #{ reason } ." )
448
424
el . remove
449
425
end
450
426
end
451
427
end
452
428
end
429
+
430
# Picks the reason, if any, for which an element should be conditionally
# removed. The checks run in the order written and the first match wins.
#
# Despite the trailing "?", this returns a message String or nil rather than
# a strict boolean; the name is kept so existing callers continue to work.
#
# @param counts [Hash{String => Integer}] per-tag counts for the element; the
#   keys "img", "p", "li", "input" and "embed" are read
# @param content_length [Integer] length of the element's text
# @param options [Hash] :min_text_length, when set, is used instead of
#   TEXT_LENGTH_THRESHOLD
# @param weight [Numeric] weight of the element; selects which link-density
#   limit applies (0.2 below 25, 0.5 otherwise)
# @param link_density [Numeric] link density of the element
# @param name [String, nil] tag name of the element; "ul" and "ol" are exempt
#   from the <li>-vs-<p> check. Callers should pass it: when omitted, the
#   element is treated as not being a list, so the exemption cannot apply.
# @return [String, nil] human-readable reason, or nil to keep the element
def clean_conditionally_reason?(counts, content_length, options, weight, link_density, name = nil)
  # `name` used to be read from the caller's scope, which is not visible from
  # here; it is now an explicit (optional) argument.
  list_element = name == "ul" || name == "ol"

  if counts["img"] > counts["p"]
    "too many images"
  elsif counts["li"] > counts["p"] && !list_element
    "more <li>s than <p>s"
  elsif counts["input"] > (counts["p"] / 3).to_i
    "less than 3x <p>s than <input>s"
  elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
    "too short a content length without a single image"
  elsif weight < 25 && link_density > 0.2
    "too many links for its weight (#{weight})"
  elsif weight >= 25 && link_density > 0.5
    "too many links for its weight (#{weight})"
  elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
    "<embed>s with too short a content length, or too many <embed>s"
  end
end
449
+
453
450
end
454
451
end
0 commit comments