
fix KeepEverythingWithMinKWordsExtractor #2

Open · wants to merge 11 commits into base: master

38 changes: 38 additions & 0 deletions .gitignore
@@ -0,0 +1,38 @@
# Distribution / packaging
__pycache__
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
PKG-INFO

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# IDEs
.idea
.vscode
*.iml


# data downloaded
boilerpipe/data
8 changes: 0 additions & 8 deletions MANIFEST

This file was deleted.

18 changes: 0 additions & 18 deletions PKG-INFO

This file was deleted.

39 changes: 22 additions & 17 deletions README.md
@@ -5,19 +5,8 @@ Installation
============
You can install this lib directly from the GitHub repository by executing this command:

pip install git+ssh://git@github.com/slaveofcode/boilerpipe3@master
pip install git+ssh://git@github.com/derlin/boilerpipe3@master

Or from official pypi

pip install boilerpipe3

Configuration
=============

Dependencies:
jpype, charade

The boilerpipe jar files will get fetched and included automatically when building the package.

Usage
=====
@@ -35,14 +24,30 @@ The constructor takes a keyword argument ``extractor``, being one of the available
- NumWordsRulesExtractor
- CanolaExtractor

If no extractor is passed the DefaultExtractor will be used by default. Additional keyword arguments are either ``html`` for HTML text or ``url``.
If no extractor is passed, the DefaultExtractor will be used.

from boilerpipe.extract import Extractor
extractor = Extractor(extractor='ArticleExtractor', url=your_url)
extractor = Extractor(extractor='ArticleExtractor')

Once you get an extractor instance, extract relevant content using one of `getText`, `getHTML`, `getTextBlocks`, `getImages`. Each one accepts one of the following arguments:

Then, to extract relevant content:
- `url`: the url of the page
- `html`: an html string to parse
- `processed`: the `(source, data)` returned by the method `get`.

extracted_text = extractor.getText()

Example:

extracted_text = extractor.getText(url=your_url)

extracted_html = extractor.getHTML()
extracted_html = extractor.getHTML(url=your_url)

If you need multiple pieces of information, you can save some computation time by processing the page only once:

processed = extractor.get(url=url) # download and process once

text = extractor.getText(processed=processed)
text_blocks = extractor.getTextBlocks(processed=processed)
html = extractor.getHTML(processed=processed)
images = extractor.getImages(processed=processed)
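
The extractor this PR fixes, `KeepEverythingWithMinKWordsExtractor`, is the only one that is constructed with an argument rather than used through its `INSTANCE` singleton. A minimal sketch of how it could be called, assuming the `kMin` keyword handled in `boilerpipe/extractor.py` below and using `your_url` as a placeholder:

    from boilerpipe import Extractor  # re-exported by boilerpipe/__init__.py

    # kMin is forwarded to the Java constructor; it defaults to 1 if omitted.
    extractor = Extractor(extractor='KeepEverythingWithMinKWordsExtractor', kMin=5)
    extracted_text = extractor.getText(url=your_url)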

Binary file added boilerpipe-1.2.1-bin.tar.gz
7 changes: 5 additions & 2 deletions src/boilerpipe/__init__.py → boilerpipe/__init__.py
@@ -1,10 +1,13 @@
import os
import imp
import jpype
from os import path

if jpype.isJVMStarted() != True:
    jars = []
    for top, dirs, files in os.walk(imp.find_module('boilerpipe')[1]+'/data'):
    data_dir = path.join(path.dirname(path.realpath(__file__)), 'data')
    for top, dirs, files in os.walk(data_dir):
        for nm in files:
            jars.append(os.path.join(top, nm))
    jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.class.path=%s" % os.pathsep.join(jars))

from .extractor import Extractor, EXTRACTORS
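
A quick sanity check for the JVM bootstrap above (a sketch, assuming the jars have already been unpacked into `boilerpipe/data`, as `setup.py` does at build time):

    import jpype
    import boilerpipe  # importing the package runs the startup code above

    assert jpype.isJVMStarted()
    # If the classpath was assembled correctly, the boilerpipe classes resolve:
    ArticleExtractor = jpype.JClass('de.l3s.boilerpipe.extractors.ArticleExtractor')
    print(ArticleExtractor.INSTANCE)
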
156 changes: 156 additions & 0 deletions boilerpipe/extractor.py
@@ -0,0 +1,156 @@
import jpype
import socket
import threading

from bs4 import UnicodeDammit
import requests

socket.setdefaulttimeout(15)
lock = threading.Lock()

InputSource = jpype.JClass('org.xml.sax.InputSource')
StringReader = jpype.JClass('java.io.StringReader')
HTMLHighlighter = jpype.JClass('de.l3s.boilerpipe.sax.HTMLHighlighter')
BoilerpipeSAXInput = jpype.JClass('de.l3s.boilerpipe.sax.BoilerpipeSAXInput')

from functools import wraps

# suppress warning for invalid SSL certificates
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

#: Headers passed with each request
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'}

EXTRACTORS = [
    'DefaultExtractor',
    'ArticleExtractor',
    'ArticleSentencesExtractor',
    'KeepEverythingWithMinKWordsExtractor',  # if used, don't forget to pass the kMin argument to its constructor
    'KeepEverythingExtractor',
    'LargestContentExtractor',
    'NumWordsRulesExtractor',
    'CanolaExtractor'
]


def thread_safe(method):
    @wraps(method)
    def _impl(self, *args, **kwargs):
        try:
            # make it thread safe, see jpype documentation for more info
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() is False:
                    jpype.attachThreadToJVM()
            # lock.acquire()
            return method(self, *args, **kwargs)
        finally:
            # lock.release()
            pass

    return _impl


class Extractor(object):
    """
    Extract text. Constructor takes 'extractor' as a keyword argument,
    being one of the boilerpipe extractors:
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
    """
    extractor = None
    source = None
    data = None
    headers = {'User-Agent': 'Mozilla/5.0'}

    @thread_safe
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if extractor == "KeepEverythingWithMinKWordsExtractor":
            kMin = kwargs.get("kMin", 1)  # set default to 1
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors." + extractor)(kMin)
        else:
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors." + extractor).INSTANCE

    @thread_safe
    def get(self, url=None, html=None):
        return self._process(url=url, html=html)

    @thread_safe
    def getText(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        return source.getContent()

    @thread_safe
    def getTextBlocks(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        blocks = source.getTextBlocks()
        results = []
        for i in range(blocks.size()):
            if blocks[i].isContent():
                results.append(blocks[i].getText())
        return results

    @thread_safe
    def getHTML(self, url=None, html=None, processed=None):
        source, data = self._process(html=html, url=url, processed=processed)
        highlighter = HTMLHighlighter.newExtractingInstance()
        return highlighter.process(source, data)

    def getImages(self, url=None, html=None, processed=None):
        if processed is not None:
            source, data = processed
        else:
            source, data = self.get(url=url, html=html)

        extractor = jpype.JClass("de.l3s.boilerpipe.sax.ImageExtractor").INSTANCE
        images = extractor.process(source, data)
        jpype.java.util.Collections.sort(images)
        # list comprehension returns
        # TypeError: iter() returned non-iterator of type 'java.util.ArrayList$Itr'
        # so do it the old way:
        results = []
        for i in range(images.size()):
            img = images[i]
            results.append({
                'src': img.getSrc(),
                'width': img.getWidth(),
                'height': img.getHeight(),
                'alt': img.getAlt(),
                'area': img.getArea()
            })
        return results

    def _process(self, **kwargs):
        if kwargs.get('processed'):
            source, data = kwargs.get('processed')
            return source, data
        if kwargs.get('url'):
            resp = requests.get(kwargs['url'], verify=False, stream=True, headers=_DEFAULT_HEADERS)
            data = self._convert(resp.content)
        elif kwargs.get('html'):
            data = kwargs['html']
            if not isinstance(data, str):
                data = self._convert(data)
        else:
            raise Exception('No text or url provided')

        reader = StringReader(data)
        source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(source)
        return source, data

    def _convert(self, content):
        converted = UnicodeDammit(content)
        if not converted.unicode_markup:
            raise UnicodeDecodeError(
                "Failed to detect encoding, tried [%s]", ', '.join(converted.tried_encodings))
        # print converted.original_encoding
        return converted.unicode_markup
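
To exercise the `html=` path of `_process` without any network access, a short sketch (the HTML snippet is made up for illustration, and the extractor may of course classify its blocks differently):

    from boilerpipe import Extractor

    html = '''
    <html><body>
      <div>Navigation, footer and other boilerplate.</div>
      <p>Some longer article text that the extractor may score as content.</p>
      <img src="/img/example.png" width="640" height="480" alt="example"/>
    </body></html>
    '''

    extractor = Extractor(extractor='ArticleExtractor')

    # Parse once, then reuse the (source, data) pair for several getters.
    processed = extractor.get(html=html)
    print(extractor.getText(processed=processed))
    print(extractor.getTextBlocks(processed=processed))
    print(extractor.getImages(processed=processed))  # list of dicts: src, width, height, alt, area
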
Binary file removed dist/boilerpipe3-1.1.tar.gz
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
charade==1.0.3
JPype1-py3==0.5.5.2
numpy==1.14.2
scikit-learn==0.19.1
scipy==1.0.1
54 changes: 29 additions & 25 deletions setup.py
@@ -1,26 +1,27 @@
import tarfile
from fnmatch import fnmatch
from os.path import basename, exists, dirname, abspath, join
from distutils.core import setup
import setuptools

try:
    from urllib import urlretrieve
except:
    from urllib.request import urlretrieve

import sys

if sys.version_info[0] < 3:
    print("This module can only be used with Python 3.")
    print("For a Python 2 version, see:\nhttps://github.com/misja/python-boilerpipe")
    sys.exit(1)

__version__ = '1.1'
boilerpipe_version = '1.2.0'
DATAPATH = join(abspath(dirname((__file__))), 'src/boilerpipe/data')
__version__ = '1.2'
boilerpipe_version = '1.2.1'
DATAPATH = join(abspath(dirname((__file__))), 'boilerpipe/data')


def download_jars(datapath, version=boilerpipe_version):
    tgz_url = 'https://github.com/slaveofcode/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version)
    tgz_url = 'https://github.com/derlin/boilerpipe3/raw/master/boilerpipe-{0}-bin.tar.gz'.format(version)
    tgz_name = basename(tgz_url)

    if not exists(tgz_name):
@@ -33,13 +34,17 @@ def download_jars(datapath, version=boilerpipe_version):
            continue
        tar.extract(tarinfo, datapath)


download_jars(datapath=DATAPATH)

setup(
setuptools.setup(
name='boilerpipe3',
version=__version__,
packages=['boilerpipe', 'boilerpipe.extract'],
package_dir={'': 'src'},
author='Lucy Linder',
author_email='lucy.derlin@gmail.com',
url='https://github.com/derlin/boilerpipe3',

packages=['boilerpipe'],
package_data={
'boilerpipe': [
'data/boilerpipe-{version}/boilerpipe-{version}.jar'.format(version=boilerpipe_version),
@@ -48,24 +53,23 @@ def download_jars(datapath, version=boilerpipe_version):
},
install_requires=[
'JPype1-py3',
'charade',
'requests',
'beautifulsoup4',
],
author='Aditya Kresna Permana',
author_email='zeandcode@gmail.com',
maintainer = 'Aditya Kresna Permana',
maintainer_email = 'zeandcode@gmail.com',
url = 'https://github.com/slaveofcode/boilerpipe3',

classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.4',
'Natural Language :: English',
],
keywords='boilerpipe',
license='Apache 2.0',
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'Intended Audience :: Developers',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3.4',
'Natural Language :: English',
],

keywords='boilerpipe',
license='Apache 2.0',

description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support'
description='Python interface to Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages with Python 3 support. '
'Forked and improved from https://github.com/slaveofcode/boilerpipe3.'
)
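
After installation, a sketch for checking that the jars declared in `package_data` actually shipped with the package (the `data` directory layout assumed here is the one created by `download_jars` and read by `boilerpipe/__init__.py`):

    import os
    import boilerpipe  # note: importing the package also starts the JVM

    data_dir = os.path.join(os.path.dirname(boilerpipe.__file__), 'data')
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            if name.endswith('.jar'):
                print(os.path.join(root, name))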