
fix KeepEverythingWithMinKWordsExtractor #2

Open · wants to merge 11 commits into base: master

38 changes: 38 additions & 0 deletions .gitignore
@@ -0,0 +1,38 @@
# Distribution / packaging
__pycache__
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
PKG-INFO

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDEs
.idea
.vscode
*.iml


# data downloaded
boilerpipe/data
8 changes: 0 additions & 8 deletions MANIFEST

This file was deleted.

18 changes: 0 additions & 18 deletions PKG-INFO

This file was deleted.

39 changes: 22 additions & 17 deletions README.md
@@ -5,19 +5,8 @@ Installation
============
You can install this lib directly from the GitHub repository by executing this command:

pip install git+ssh://git@github.com/slaveofcode/boilerpipe3@master
pip install git+ssh://git@github.com/derlin/boilerpipe3@master

Or from official pypi

pip install boilerpipe3

Configuration
=============

Dependencies:
jpype, charade

The boilerpipe jar files will get fetched and included automatically when building the package.

Usage
=====
@@ -35,14 +24,30 @@ The constructor takes a keyword argument ``extractor``, being one of the available
- NumWordsRulesExtractor
- CanolaExtractor

If no extractor is passed the DefaultExtractor will be used by default. Additional keyword arguments are either ``html`` for HTML text or ``url``.
If no extractor is passed, the DefaultExtractor will be used.

from boilerpipe.extract import Extractor
extractor = Extractor(extractor='ArticleExtractor', url=your_url)
extractor = Extractor(extractor='ArticleExtractor')

Once you get an extractor instance, extract relevant content using one of `getText`, `getHTML`, `getTextBlocks`, `getImages`. Each one accepts one of the following arguments:

Then, to extract relevant content:
- `url`: the url of the page
- `html`: an html string to parse
- `processed`: the `(source, data)` returned by the method `get`.

extracted_text = extractor.getText()

Example:

extracted_text = extractor.getText(url=your_url)

extracted_html = extractor.getHTML()
extracted_html = extractor.getHTML(url=your_url)

If you need multiple pieces of information, you can save some computation time by processing the page only once:

processed = extractor.get(url=url) # download and process once

text = extractor.getText(processed=processed)
text_blocks = extractor.getTextBlocks(processed=processed)
html = extractor.getHTML(processed=processed)
images = extractor.getImages(processed=processed)
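
The extractor this PR fixes, `KeepEverythingWithMinKWordsExtractor`, is the only one that is constructed with an argument rather than used through its `INSTANCE` singleton. A minimal sketch of how it could be called, assuming the `kMin` keyword handled in `boilerpipe/extractor.py` below and using `your_url` as a placeholder:

    from boilerpipe import Extractor  # re-exported by boilerpipe/__init__.py

    # kMin is forwarded to the Java constructor; it defaults to 1 if omitted.
    extractor = Extractor(extractor='KeepEverythingWithMinKWordsExtractor', kMin=5)
    extracted_text = extractor.getText(url=your_url)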

Binary file added boilerpipe-1.2.1-bin.tar.gz
7 changes: 5 additions & 2 deletions src/boilerpipe/__init__.py → boilerpipe/__init__.py
@@ -1,10 +1,13 @@
import os
import imp
import jpype
from os import path

if jpype.isJVMStarted() != True:
    jars = []
    for top, dirs, files in os.walk(imp.find_module('boilerpipe')[1]+'/data'):
    data_dir = path.join(path.dirname(path.realpath(__file__)), 'data')
    for top, dirs, files in os.walk(data_dir):
        for nm in files:
            jars.append(os.path.join(top, nm))
    jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=%s" % os.pathsep.join(jars))

from .extractor import Extractor, EXTRACTORS
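
A quick sanity check for the JVM bootstrap above (a sketch, assuming the jars have already been unpacked into `boilerpipe/data`, as `setup.py` does at build time):

    import jpype
    import boilerpipe  # importing the package runs the startup code above

    assert jpype.isJVMStarted()
    # If the classpath was assembled correctly, the boilerpipe classes resolve:
    ArticleExtractor = jpype.JClass('de.l3s.boilerpipe.extractors.ArticleExtractor')
    print(ArticleExtractor.INSTANCE)
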
156 changes: 156 additions & 0 deletions boilerpipe/extractor.py
@@ -0,0 +1,156 @@
import jpype
import socket
import threading

from bs4 import UnicodeDammit
import requests

socket.setdefaulttimeout(15)
lock = threading.Lock()

InputSource = jpype.JClass('org.xml.sax.InputSource')
StringReader = jpype.JClass('java.io.StringReader')
HTMLHighlighter = jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter')
BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput')

from functools import wraps

# suppress warning for invalid SSL certificates
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

#: Headers passed with each request
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'}

EXTRACTORS = [
    'DefaultExtractor',
    'ArticleExtractor',
    'ArticleSentencesExtractor',
    'KeepEverythingWithMinKWordsExtractor',  # if used, don't forget to pass the kMin argument to its constructor
    'KeepEverythingExtractor',
    'LargestContentExtractor',
    'NumWordsRulesExtractor',
    'CanolaExtractor'
]


def thread_safe(method):
    @wraps(method)
    def _impl(self, *args, **kwargs):
        try:
            # make it thread safe, see jpype documentation for more info
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() is False:
                    jpype.attachThreadToJVM()
            # lock.acquire()
            return method(self, *args, **kwargs)
        finally:
            # lock.release()
            pass

    return _impl


class Extractor(object):
    """
    Extract text. Constructor takes 'extractor' as a keyword argument,
    being one of the boilerpipe extractors:
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
    """
    extractor = None
    source = None
    data = None
    headers = {'User-Agent': 'Mozilla/5.0'}

    @thread_safe
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if extractor == "KeepEverythingWithMinKWordsExtractor":
            kMin = kwargs.get("kMin", 1)  # set default to 1
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors." + extractor)(kMin)
        else:
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors." + extractor).INSTANCE

    @thread_safe
    def get(self, url=None, html=None):
        return self._process(url=url, html=html)

    @thread_safe
    def getText(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        return source.getContent()

    @thread_safe
    def getTextBlocks(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        blocks = source.getTextBlocks()
        results = []
        for i in range(blocks.size()):
            if blocks[i].isContent():
                results.append(blocks[i].getText())
        return results

    @thread_safe
    def getHTML(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        highlighter = HTMLHighlighter.newExtractingInstance()
        return highlighter.process(source, data)

    def getImages(self, url=None, html=None, processed=None):
        if processed is not None:
            source, data = processed
        else:
            source, data = self.get(url=url, html=html)

        extractor = jpype.JClass("de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
        images = extractor.process(source, data)
        jpype.java.util.Collections.sort(images)
        # list comprehension returns
        # TypeError: iter() returned non-iterator of type 'java.util.ArrayList$Itr'
        # so do it the old way:
        results = []
        for i in range(images.size()):
            img = images[i]
            results.append({
                'src': img.getSrc(),
                'width': img.getWidth(),
                'height': img.getHeight(),
                'alt': img.getAlt(),
                'area': img.getArea()
            })
        return results

    def _process(self, **kwargs):
        if kwargs.get('processed'):
            source, data = kwargs.get('processed')
            return source, data
        if kwargs.get('url'):
            resp = requests.get(kwargs['url'], verify=False, stream=True, headers=_DEFAULT_HEADERS)
            data = self._convert(resp.content)
        elif kwargs.get('html'):
            data = kwargs['html']
            if not isinstance(data, str):
                data = self._convert(data)
        else:
            raise Exception('No text or url provided')

        reader = StringReader(data)
        source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(source)
        return source, data

    def _convert(self, content):
        converted = UnicodeDammit(content)
        if not converted.unicode_markup:
            raise UnicodeDecodeError(
                "Failed to detect encoding, tried [%s]", ', '.join(converted.tried_encodings))
        # print converted.original_encoding
        return converted.unicode_markup
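
To exercise the `html=` path of `_process` without any network access, a short sketch (the HTML snippet is made up for illustration, and the extractor may of course classify its blocks differently):

    from boilerpipe import Extractor

    html = '''
    <html><body>
      <div>Navigation, footer and other boilerplate.</div>
      <p>Some longer article text that the extractor may score as content.</p>
      <img src="/img/example.png" width="640" height="480" alt="example"/>
    </body></html>
    '''

    extractor = Extractor(extractor='ArticleExtractor')

    # Parse once, then reuse the (source, data) pair for several getters.
    processed = extractor.get(html=html)
    print(extractor.getText(processed=processed))
    print(extractor.getTextBlocks(processed=processed))
    print(extractor.getImages(processed=processed))  # list of dicts: src, width, height, alt, area
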
Binary file removed dist/boilerpipe3-1.1.tar.gz
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
charade==1.0.3
JPype1-py3==0.5.5.2
numpy==1.14.2
scikit-learn==0.19.1
scipy==1.0.1
54 changes: 29 additions & 25 deletions setup.py
@@ -1,26 +1,27 @@
import tarfile
from fnmatch import fnmatch
from os.path import basename, exists, dirname, abspath, join
from distutils.core import setup
import setuptools

try:
    from urllib import urlretrieve
except:
    from urllib.request import urlretrieve

import sys

if sys.version_info[0] < 3:
    print("This module can only be used with Python 3.")
    print("For a Python 2 version, see:\nhttps://github.com/misja/python-boilerpipe")
    sys.exit(1)

__version__ = '1.1'
boilerpipe_version = '1.2.0'
DATAPATH = join(abspath(dirname((__file__))), 'src/boilerpipe/data')
__version__ = '1.2'
boilerpipe_version = '1.2.1'
DATAPATH = join(abspath(dirname((__file__))), 'boilerpipe/data')


def download_jars(datapath, version=boilerpipe_version):
    tgz_url = 'https://github.com/slaveofcode/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version)
    tgz_url = 'https://github.com/derlin/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version)
    tgz_name = basename(tgz_url)

    if not exists(tgz_name):
@@ -33,13 +34,17 @@ def download_jars(datapath, version=boilerpipe_version):
            continue
        tar.extract(tarinfo, datapath)


download_jars(datapath=DATAPATH)

setup(
setuptools.setup(
name='boilerpipe3',
version=__version__,
packages=['boilerpipe', 'boilerpipe.extract'],
package_dir={'': 'src'},
author='Lucy Linder',
author_email='lucy.derlin@gmail.com',
url='https://github.com/derlin/boilerpipe3',

packages=['boilerpipe'],
package_data={
'boilerpipe': [
'data/boilerpipe-{version}/boilerpipe-{version}.jar'.format(version=boilerpipe_version),
@@ -48,24 +53,23 @@ def download_jars(datapath, version=boilerpipe_version):
},
install_requires=[
'JPype1-py3',
'charade',
'requests',
'beautifulsoup4',
],
author='Aditya Kresna Permana',
author_email='zeandcode@gmail.com',
maintainer = 'Aditya Kresna Permana',
maintainer_email = 'zeandcode@gmail.com',
url = 'https://github.com/slaveofcode/boilerpipe3',

classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.4',
'Natural Language :: English',
],
keywords='boilerpipe',
license='Apache 2.0',
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.4',
'Natural Language :: English',
],

keywords='boilerpipe',
license='Apache 2.0',

description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support'
description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support. '
'Forked and improved from https://github.com/slaveofcode/boilerpipe3.'
)
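
After installation, a sketch for checking that the jars declared in `package_data` actually shipped with the package (the `data` directory layout assumed here is the one created by `download_jars` and read by `boilerpipe/__init__.py`):

    import os
    import boilerpipe  # note: importing the package also starts the JVM

    data_dir = os.path.join(os.path.dirname(boilerpipe.__file__), 'data')
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            if name.endswith('.jar'):
                print(os.path.join(root, name))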