Skip to content

Commit 0e50b53

Browse files
committed
Release version 0.7. Better HTML5 support and an important bugfix.
1 parent 537de2b commit 0e50b53

File tree

6 files changed

+56
-24
lines changed

6 files changed

+56
-24
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111

1212
before_install:
1313
# work around https://github.com/travis-ci/travis-ci/issues/8363
14-
- pyenv global system 3.5
14+
- pyenv global system 3.6
1515

1616
install:
1717
- travis_retry pip install -U pip wheel tox

Makefile

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
# Makefile to help automate tasks
22
WD := $(shell pwd)
3-
PY := .env/bin/python
4-
PIP := .env/bin/pip
5-
PEP8 := .env/bin/pep8
6-
NOSE := .env/bin/nosetests
7-
3+
PY := .venv/bin/python
4+
PIP := .venv/bin/pip
5+
PEP8 := .venv/bin/pep8
6+
NOSE := .venv/bin/nosetests
87

98
# ###########
109
# Tests rule!
@@ -22,16 +21,17 @@ $(NOSE):
2221
.PHONY: all
2322
all: venv develop
2423

25-
venv: bin/python
26-
bin/python:
27-
virtualenv .env
24+
venv: .venv/bin/python
25+
26+
.venv/bin/python:
27+
virtualenv .venv
2828

2929
.PHONY: clean_venv
3030
clean_venv:
31-
rm -rf .env
31+
rm -rf .venv
3232

33-
develop: .env/lib/python*/site-packages/readability-lxml.egg-link
34-
.env/lib/python*/site-packages/readability-lxml.egg-link:
33+
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
34+
.venv/lib/python*/site-packages/readability-lxml.egg-link:
3535
$(PY) setup.py develop
3636

3737

README.rst

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,15 @@ Usage
3535
Change Log
3636
----------
3737

38-
- 0.3 Added Document.encoding, positive\_keywords and
39-
negative\_keywords
40-
- 0.4 Added Videos loading and allowed more images per paragraph
41-
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
42-
3.4
38+
- 0.7 Improved HTML5 tags handling. Heuristics were changed for a lot of sites. Fixed an important
39+
bug in stripping unwanted HTML nodes (previously only the first matching node was removed).
4340
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3
4441
and 3.4
42+
- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
43+
3.4
44+
- 0.4 Added Videos loading and allowed more images per paragraph
45+
- 0.3 Added Document.encoding, positive\_keywords and
46+
negative\_keywords
4547

4648
Licensing
4749
=========

readability/readability.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -381,13 +381,13 @@ def class_weight(self, e):
381381
def score_node(self, elem):
382382
content_score = self.class_weight(elem)
383383
name = elem.tag.lower()
384-
if name == "div":
384+
if name in ["div", "article"]:
385385
content_score += 5
386386
elif name in ["pre", "td", "blockquote"]:
387387
content_score += 3
388-
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
388+
elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
389389
content_score -= 3
390-
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
390+
elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
391391
content_score -= 5
392392
return {
393393
'content_score': content_score,
@@ -463,7 +463,7 @@ def sanitize(self, node, candidates):
463463

464464
allowed = {}
465465
# Conditionally clean <table>s, <ul>s, and <div>s
466-
for el in self.reverse_tags(node, "table", "ul", "div"):
466+
for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
467467
if el in allowed:
468468
continue
469469
weight = self.class_weight(el)
@@ -577,7 +577,7 @@ def sanitize(self, node, candidates):
577577
if siblings and sum(siblings) > 1000:
578578
to_remove = False
579579
log.debug("Allowing %s" % describe(el))
580-
for desnode in self.tags(el, "table", "ul", "div"):
580+
for desnode in self.tags(el, "table", "ul", "div", "section"):
581581
allowed[desnode] = True
582582

583583
if to_remove:

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
setup(
1616
name="readability-lxml",
17-
version="0.6.2",
17+
version="0.7",
1818
author="Yuri Baburov",
1919
author_email="burchik@gmail.com",
2020
description="fast html to text parser (article readability tool) with python3 support",
@@ -43,6 +43,5 @@
4343
"Programming Language :: Python :: 3.4",
4444
"Programming Language :: Python :: 3.5",
4545
"Programming Language :: Python :: 3.6",
46-
4746
],
4847
)

tests/test_article_only.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,34 @@ def test_best_elem_is_root_and_passing(self):
6161
)
6262
doc = Document(sample)
6363
doc.summary()
64+
65+
def test_correct_cleanup(self):
66+
sample = """
67+
<html>
68+
<body>
69+
<section>test section</section>
70+
<article class="">
71+
<p>Lot of text here.</p>
72+
<div id="advertisement"><a href="link">Ad</a></div>
73+
<p>More text is written here, and contains punctuation and dots.</p>
74+
</article>
75+
<aside id="comment1"/>
76+
<div id="comment2">
77+
<a href="asd">spam</a>
78+
<a href="asd">spam</a>
79+
<a href="asd">spam</a>
80+
</div>
81+
<div id="comment3"/>
82+
<aside id="comment4">A small comment.</aside>
83+
<div id="comment5"><p>The comment is also helpful, but it's
84+
still not the correct item to be extracted.</p>
85+
<p>It's even longer than the article itself!"</p></div>
86+
</body>
87+
</html>
88+
"""
89+
doc = Document(sample)
90+
s = doc.summary()
91+
#print(s)
92+
assert('punctuation' in s)
93+
assert(not 'comment' in s)
94+
assert(not 'aside' in s)

0 commit comments

Comments
 (0)