Release version 0.7. Better HTML5 support and an important bugfix.

buriy · buriy · commit 0e50b53d0563 · 2018-05-07T17:53:53.000+07:00
diff --git a/.travis.yml b/.travis.yml
@@ -11,7 +11,7 @@ env:
 
 before_install:
   # work around https://github.com/travis-ci/travis-ci/issues/8363
-  - pyenv global system 3.5
+  - pyenv global system 3.6
 
 install:
   - travis_retry pip install -U pip wheel tox
diff --git a/Makefile b/Makefile
@@ -1,10 +1,9 @@
 # Makefile to help automate tasks
 WD := $(shell pwd)
-PY := .env/bin/python
-PIP := .env/bin/pip
-PEP8 := .env/bin/pep8
-NOSE := .env/bin/nosetests
-
+PY := .venv/bin/python
+PIP := .venv/bin/pip
+PEP8 := .venv/bin/pep8
+NOSE := .venv/bin/nosetests
 
 # ###########
 # Tests rule!
@@ -22,16 +21,17 @@ $(NOSE):
 .PHONY: all
 all: venv develop
 
-venv: bin/python
-bin/python:
-	virtualenv .env
+venv: .venv/bin/python
+
+.venv/bin/python:
+	virtualenv .venv
 
 .PHONY: clean_venv
 clean_venv:
-	rm -rf .env
+	rm -rf .venv
 
-develop: .env/lib/python*/site-packages/readability-lxml.egg-link
-.env/lib/python*/site-packages/readability-lxml.egg-link:
+develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
+.venv/lib/python*/site-packages/readability-lxml.egg-link:
 	$(PY) setup.py develop
 
 
diff --git a/README.rst b/README.rst
@@ -35,13 +35,15 @@ Usage
 Change Log
 ----------
 
--  0.3 Added Document.encoding, positive\_keywords and
-   negative\_keywords
--  0.4 Added Videos loading and allowed more images per paragraph
--  0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
-   3.4
+-  0.7 Improved handling of HTML5 tags. Heuristics were changed for a lot of sites: fixed an important
+   bug with stripping unwanted HTML nodes (previously only the first matching node was removed).
 -  0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3
    and 3.4
+-  0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and
+   3.4
+-  0.4 Added Videos loading and allowed more images per paragraph
+-  0.3 Added Document.encoding, positive\_keywords and
+   negative\_keywords
 
 Licensing
 =========
diff --git a/readability/readability.py b/readability/readability.py
@@ -381,13 +381,13 @@ def class_weight(self, e):
     def score_node(self, elem):
         content_score = self.class_weight(elem)
         name = elem.tag.lower()
-        if name == "div":
+        if name in ["div", "article"]:
             content_score += 5
         elif name in ["pre", "td", "blockquote"]:
             content_score += 3
-        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
+        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
             content_score -= 3
-        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
+        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]:
             content_score -= 5
         return {
             'content_score': content_score,
@@ -463,7 +463,7 @@ def sanitize(self, node, candidates):
 
         allowed = {}
         # Conditionally clean <table>s, <ul>s, and <div>s
-        for el in self.reverse_tags(node, "table", "ul", "div"):
+        for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"):
             if el in allowed:
                 continue
             weight = self.class_weight(el)
@@ -577,7 +577,7 @@ def sanitize(self, node, candidates):
                     if siblings and sum(siblings) > 1000:
                         to_remove = False
                         log.debug("Allowing %s" % describe(el))
-                        for desnode in self.tags(el, "table", "ul", "div"):
+                        for desnode in self.tags(el, "table", "ul", "div", "section"):
                             allowed[desnode] = True
 
                 if to_remove:
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
 
 setup(
     name="readability-lxml",
-    version="0.6.2",
+    version="0.7",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python3 support",
@@ -43,6 +43,5 @@
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
-
     ],
 )
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
@@ -61,3 +61,34 @@ def test_best_elem_is_root_and_passing(self):
         )
         doc = Document(sample)
         doc.summary()
+
+    def test_correct_cleanup(self):
+        sample = """
+        <html>
+            <body>
+                <section>test section</section>
+                <article class="">
+<p>Lot of text here.</p>
+                <div id="advertisement"><a href="link">Ad</a></div>
+<p>More text is written here, and contains punctuation and dots.</p>
+</article>
+                <aside id="comment1"/>
+                <div id="comment2">
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                    <a href="asd">spam</a>
+                </div>
+                <div id="comment3"/>
+                <aside id="comment4">A small comment.</aside>
+                <div id="comment5"><p>The comment is also helpful, but it's
+                    still not the correct item to be extracted.</p>
+                    <p>It's even longer than the article itself!"</p></div>
+            </body>
+        </html>
+        """
+        doc = Document(sample)
+        s = doc.summary()
+        #print(s)
+        assert('punctuation' in s)
+        assert(not 'comment' in s)
+        assert(not 'aside' in s)