From bcfc8e1cb69de58ffba729f87735cabbba5b4a5e Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:28:55 +0800
Subject: [PATCH 1/6] custom finder / extender pipeline support
---
haul/models.py | 20 ++++++++++++++++----
haul/settings.py | 1 +
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/haul/models.py b/haul/models.py
index f445ae5..c802f6b 100644
--- a/haul/models.py
+++ b/haul/models.py
@@ -89,9 +89,15 @@ def start_finder_pipeline(self, *args, **kwargs):
pipeline_output['pipeline_index'] = idx
pipeline_output['pipeline_break'] = False
- finder_func = utils.module_member(name)
+ if hasattr(name, '__call__'):
+ finder_func = name
+ else:
+ finder_func = utils.module_member(name)
+
output = finder_func(*args, **pipeline_output)
- pipeline_output.update(output)
+
+ if isinstance(output, dict):
+ pipeline_output.update(output)
if pipeline_output['pipeline_break']:
break
@@ -115,9 +121,15 @@ def start_extender_pipeline(self, *args, **kwargs):
pipeline_output['pipeline_index'] = idx
pipeline_output['pipeline_break'] = False
- extender_func = utils.module_member(name)
+ if hasattr(name, '__call__'):
+ extender_func = name
+ else:
+ extender_func = utils.module_member(name)
+
output = extender_func(*args, **pipeline_output)
- pipeline_output.update(output)
+
+ if isinstance(output, dict):
+ pipeline_output.update(output)
if pipeline_output['pipeline_break']:
break
diff --git a/haul/settings.py b/haul/settings.py
index ef63ffc..f339cd3 100644
--- a/haul/settings.py
+++ b/haul/settings.py
@@ -1,5 +1,6 @@
# coding: utf-8
+# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
DEFAULT_PARSER = 'lxml'
FINDER_PIPELINE = (
From 873f099efaf7f8b08e215a94089e6a49c92ea986 Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:29:41 +0800
Subject: [PATCH 2/6] test for custom finder pipeline
---
tests/fixtures/page.html | 6 +++++
tests/pipeline/test.py | 0
tests/test.py | 52 ++++++++++++++++++++++++++++++++++++++--
3 files changed, 56 insertions(+), 2 deletions(-)
delete mode 100644 tests/pipeline/test.py
diff --git a/tests/fixtures/page.html b/tests/fixtures/page.html
index f2ab7a4..3a02c51 100644
--- a/tests/fixtures/page.html
+++ b/tests/fixtures/page.html
@@ -15,6 +15,12 @@
some image
+
+
+ <img data-src="http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg" alt="img with data-src" />
+
+
+
-
diff --git a/tests/pipeline/test.py b/tests/pipeline/test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/test.py b/tests/test.py
index 454f640..d3ce6d2 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,10 +95,10 @@ def test_find_image_url(self):
self.assertIsInstance(hr, HaulResult)
self.assertIn('image/', hr.content_type)
-class PropagatorPipelineTestCase(HaulBaseTestCase):
+class ExtenderPipelineTestCase(HaulBaseTestCase):
def setUp(self):
- super(PropagatorPipelineTestCase, self).setUp()
+ super(ExtenderPipelineTestCase, self).setUp()
def test_blogspot(self):
h = Haul()
@@ -143,6 +143,54 @@ def test_wordpress(self):
self.assertIsInstance(hr, HaulResult)
self.assertIn('text/html', hr.content_type)
+class CustomFinderPipelineTestCase(HaulBaseTestCase):
+
+ def setUp(self):
+ super(CustomFinderPipelineTestCase, self).setUp()
+
+ def test_find_html_document(self):
+ from haul.utils import in_ignorecase
+
+ def img_data_src_finder(pipeline_index,
+ soup,
+ finder_image_urls=[],
+ *args, **kwargs):
+ """
+ Find image URL in <img>'s data-src attribute
+ """
+
+ now_finder_image_urls = []
+
+ for img in soup.find_all('img'):
+ src = img.get('data-src', None)
+ if src:
+ if (not in_ignorecase(src, finder_image_urls)) and \
+ (not in_ignorecase(src, now_finder_image_urls)):
+ now_finder_image_urls.append(src)
+
+ output = {}
+ output['finder_image_urls'] = finder_image_urls + now_finder_image_urls
+
+ return output
+
+ FINDER_PIPELINE = (
+ 'haul.finders.pipeline.html.img_src_finder',
+ 'haul.finders.pipeline.html.a_href_finder',
+ 'haul.finders.pipeline.css.background_image_finder',
+ img_data_src_finder,
+ )
+
+ h = Haul(finder_pipeline=FINDER_PIPELINE)
+ hr = h.find_images(self.complete_html)
+
+ self.assertIsInstance(hr, HaulResult)
+
+ test_image_url = 'http://files.heelsfetishism.com/media/heels/2013/10/03/18099_307a62430fa045cc9b2124d16de63f33.jpg'
+ self.assertIn(test_image_url, hr.finder_image_urls)
+
+ image_urls = hr.image_urls
+ image_urls_count = len(image_urls)
+ self.assertEqual(image_urls_count, 6)
class ExceptionsTestCase(HaulBaseTestCase):
From a498898c4daf99f0b4e90a9d7d91b347a843b812 Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:30:27 +0800
Subject: [PATCH 3/6] pipeline refactoring
---
haul/extenders/pipeline/google.py | 24 +++++++++++++++---------
haul/extenders/pipeline/pinterest.py | 8 +++++---
haul/extenders/pipeline/tumblr.py | 18 ++++++++++--------
haul/extenders/pipeline/wordpress.py | 8 +++++---
haul/finders/pipeline/css.py | 13 ++++++++-----
haul/finders/pipeline/html.py | 24 ++++++++++++++----------
6 files changed, 57 insertions(+), 38 deletions(-)
diff --git a/haul/extenders/pipeline/google.py b/haul/extenders/pipeline/google.py
index 70d412e..226f701 100644
--- a/haul/extenders/pipeline/google.py
+++ b/haul/extenders/pipeline/google.py
@@ -3,7 +3,10 @@
import re
-def blogspot_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def blogspot_s1600_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://1.bp.blogspot.com/-S97wTYQKbrY/UkWukhKhTKI/AAAAAAAAJ0g/fcRDiqVC8Us/s898/aaPOP+001.jpg
@@ -11,7 +14,6 @@ def blogspot_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://1.bp.blogspot.com/-S97wTYQKbrY/UkWukhKhTKI/AAAAAAAAJ0g/fcRDiqVC8Us/s1600/aaPOP+001.jpg
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'/s\d+/', re.IGNORECASE)
@@ -23,12 +25,15 @@ def blogspot_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
-def ggpht_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def ggpht_s1600_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://lh4.ggpht.com/-fFi-qJRuxeY/UjwHSOTHGOI/AAAAAAAArgE/SWTMT-hXzB4/s640/Celeber-ru-Emma-Watson-Net-A-Porter-The-Edit-Magazine-Photoshoot-2013-01.jpg
@@ -36,7 +41,6 @@ def ggpht_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://lh4.ggpht.com/-fFi-qJRuxeY/UjwHSOTHGOI/AAAAAAAArgE/SWTMT-hXzB4/s1600/Celeber-ru-Emma-Watson-Net-A-Porter-The-Edit-Magazine-Photoshoot-2013-01.jpg
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'/s\d+/', re.IGNORECASE)
@@ -48,12 +52,15 @@ def ggpht_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
-def googleusercontent_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def googleusercontent_s1600_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
https://lh6.googleusercontent.com/-T6V-utZHzbE/Ukjn-1MDOSI/AAAAAAAAA3g/H6Qcw1zt4n0/w555-h399-no/2101_aa2cac09d1c6431b8a635d61cd9c4471.jpg
@@ -61,7 +68,6 @@ def googleusercontent_s1600_extender(pipeline_index, finder_image_urls, *args, *
https://lh6.googleusercontent.com/-T6V-utZHzbE/Ukjn-1MDOSI/AAAAAAAAA3g/H6Qcw1zt4n0/s1600/2101_aa2cac09d1c6431b8a635d61cd9c4471.jpg
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'/w\d+\-h\d+\-no/', re.IGNORECASE)
@@ -73,6 +79,6 @@ def googleusercontent_s1600_extender(pipeline_index, finder_image_urls, *args, *
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
diff --git a/haul/extenders/pipeline/pinterest.py b/haul/extenders/pipeline/pinterest.py
index 70244d7..24e30a8 100644
--- a/haul/extenders/pipeline/pinterest.py
+++ b/haul/extenders/pipeline/pinterest.py
@@ -3,7 +3,10 @@
import re
-def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def original_image_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://media-cache-ec0.pinimg.com/70x/50/9b/bd/509bbd5c6543d473bc2b49befe75f4c6.jpg
@@ -13,7 +16,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://media-cache-ec0.pinimg.com/originals/50/9b/bd/509bbd5c6543d473bc2b49befe75f4c6.jpg
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'.com/\d+x/', re.IGNORECASE)
@@ -25,6 +27,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
diff --git a/haul/extenders/pipeline/tumblr.py b/haul/extenders/pipeline/tumblr.py
index c421e6e..745d770 100644
--- a/haul/extenders/pipeline/tumblr.py
+++ b/haul/extenders/pipeline/tumblr.py
@@ -3,7 +3,10 @@
import re
-def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def media_1280_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://25.media.tumblr.com/3f5f10d7216f1dd5eacb5eb3e302286a/tumblr_mtpcwdzKBT1qh9n5lo1_250.png
@@ -12,7 +15,6 @@ def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://25.media.tumblr.com/3f5f10d7216f1dd5eacb5eb3e302286a/tumblr_mtpcwdzKBT1qh9n5lo1_1280.png
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'(tumblr_[a-zA-Z0-9_]+)_(\d+).', re.IGNORECASE)
@@ -24,12 +26,15 @@ def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
-def avatar_128_extender(pipeline_index, *args, **kwargs):
+def avatar_128_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://25.media.tumblr.com/avatar_2909d6610c26_16.png
@@ -37,9 +42,6 @@ def avatar_128_extender(pipeline_index, *args, **kwargs):
http://25.media.tumblr.com/avatar_2909d6610c26_128.png
"""
- finder_image_urls = kwargs.get('finder_image_urls', [])
-
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
search_re = re.compile(r'(avatar_[a-zA-Z0-9_]+)_(\d+).', re.IGNORECASE)
@@ -51,6 +53,6 @@ def avatar_128_extender(pipeline_index, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
diff --git a/haul/extenders/pipeline/wordpress.py b/haul/extenders/pipeline/wordpress.py
index 68bda2e..c227f3b 100644
--- a/haul/extenders/pipeline/wordpress.py
+++ b/haul/extenders/pipeline/wordpress.py
@@ -3,7 +3,10 @@
import re
-def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
+def original_image_extender(pipeline_index,
+ finder_image_urls,
+ extender_image_urls=[],
+ *args, **kwargs):
"""
Example:
http://fashion-fever.nl/wp-content/upload/2013/09/DSC_0058-110x110.jpg
@@ -13,7 +16,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://www.wendyslookbook.com/wp-content/uploads/2013/09/Morning-Coffee-Run-7.jpg
"""
- pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []
check_re = re.compile(r'wp-content/uploads?/', re.IGNORECASE)
@@ -26,6 +28,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)
output = {}
- output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
+ output['extender_image_urls'] = extender_image_urls + now_extender_image_urls
return output
diff --git a/haul/finders/pipeline/css.py b/haul/finders/pipeline/css.py
index e7155f6..a6a5c60 100644
--- a/haul/finders/pipeline/css.py
+++ b/haul/finders/pipeline/css.py
@@ -5,16 +5,19 @@
from haul.utils import in_ignorecase
-def background_image_finder(pipeline_index, soup, *args, **kwargs):
+def background_image_finder(pipeline_index,
+ soup,
+ finder_image_urls=[],
+ *args, **kwargs):
"""
+ Find image URL in background-image
+
Example:
<i style="background-image: url('http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg');"></i>
to
http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg
"""
- pre_finder_image_urls = kwargs.get('finder_image_urls', [])
-
now_finder_image_urls = []
for tag in soup.find_all(style=True):
@@ -26,11 +29,11 @@ def background_image_finder(pipeline_index, soup, *args, **kwargs):
for property_value in background_image.propertyValue:
background_image_url = str(property_value.value)
if background_image_url:
- if (not in_ignorecase(background_image_url, pre_finder_image_urls)) and \
+ if (not in_ignorecase(background_image_url, finder_image_urls)) and \
(not in_ignorecase(background_image_url, now_finder_image_urls)):
now_finder_image_urls.append(background_image_url)
output = {}
- output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
+ output['finder_image_urls'] = finder_image_urls + now_finder_image_urls
return output
diff --git a/haul/finders/pipeline/html.py b/haul/finders/pipeline/html.py
index c6371d8..6058209 100644
--- a/haul/finders/pipeline/html.py
+++ b/haul/finders/pipeline/html.py
@@ -3,44 +3,48 @@
from haul.utils import in_ignorecase
-def img_src_finder(pipeline_index, soup, *args, **kwargs):
+def img_src_finder(pipeline_index,
+ soup,
+ finder_image_urls=[],
+ *args, **kwargs):
"""
+ Find image URL in <img>'s src attribute
"""
- pre_finder_image_urls = kwargs.get('finder_image_urls', [])
-
now_finder_image_urls = []
for img in soup.find_all('img'):
src = img.get('src', None)
if src:
- if (not in_ignorecase(src, pre_finder_image_urls)) and \
+ if (not in_ignorecase(src, finder_image_urls)) and \
(not in_ignorecase(src, now_finder_image_urls)):
now_finder_image_urls.append(src)
output = {}
- output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
+ output['finder_image_urls'] = finder_image_urls + now_finder_image_urls
return output
-def a_href_finder(pipeline_index, soup, *args, **kwargs):
+def a_href_finder(pipeline_index,
+ soup,
+ finder_image_urls=[],
+ *args, **kwargs):
"""
+ Find image URL in <a>'s href attribute
"""
- pre_finder_image_urls = kwargs.get('finder_image_urls', [])
-
now_finder_image_urls = []
for a in soup.find_all('a'):
href = a.get('href', None)
if href:
if filter(href.lower().endswith, ('.jpg', '.jpeg', '.gif', '.png')):
- if (not in_ignorecase(href, pre_finder_image_urls)) and \
+ if (not in_ignorecase(href, finder_image_urls)) and \
(not in_ignorecase(href, now_finder_image_urls)):
now_finder_image_urls.append(href)
output = {}
- output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
+ output['finder_image_urls'] = finder_image_urls + now_finder_image_urls
return output
From 2e497874286d3133e1634835f4525b13d73c0757 Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:36:36 +0800
Subject: [PATCH 4/6] remove unnecessary code
---
haul/models.py | 14 --------------
tests/test.py | 5 ++++-
2 files changed, 4 insertions(+), 15 deletions(-)
diff --git a/haul/models.py b/haul/models.py
index c802f6b..e7ad797 100644
--- a/haul/models.py
+++ b/haul/models.py
@@ -10,7 +10,6 @@
simple_url_re = re.compile(r'^https?://\[?\w', re.IGNORECASE)
-simple_url_2_re = re.compile(r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)$', re.IGNORECASE)
class Haul(object):
@@ -189,8 +188,6 @@ def __init__(self):
self.title = None
self.finder_image_urls = []
self.extender_image_urls = []
- # self.finder_image_file = None
- # self.extender_image_file = None
def __repr__(self):
return '<HaulResult [Content-Type: %s]>' % (self.content_type)
@@ -209,16 +206,5 @@ def image_urls(self):
return all_image_urls
- # @property
- # def image_file(self):
- # if self.extender_image_file:
- # which = self.extender_image_file
- # elif self.finder_image_file:
- # which = self.finder_image_file
- # else:
- # which = None
-
- # return which
-
def to_dict(self):
return self.__dict__
diff --git a/tests/test.py b/tests/test.py
index d3ce6d2..d62781c 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,6 +95,7 @@ def test_find_image_url(self):
self.assertIsInstance(hr, HaulResult)
self.assertIn('image/', hr.content_type)
+
class ExtenderPipelineTestCase(HaulBaseTestCase):
def setUp(self):
@@ -143,6 +144,7 @@ def test_wordpress(self):
self.assertIsInstance(hr, HaulResult)
self.assertIn('text/html', hr.content_type)
+
class CustomFinderPipelineTestCase(HaulBaseTestCase):
def setUp(self):
@@ -192,6 +194,7 @@ def img_data_src_finder(pipeline_index,
image_urls_count = len(image_urls)
self.assertEqual(image_urls_count, 6)
+
class ExceptionsTestCase(HaulBaseTestCase):
def setUp(self):
@@ -211,5 +214,5 @@ def test_content_type_not_supported(self):
if __name__ == '__main__':
- print 'testing Haul'
+ print('testing Haul')
unittest.main()
From dcb60ada76c2416a8a97a9e30e0c63e9a8b9aa4b Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:47:13 +0800
Subject: [PATCH 5/6] update version
---
haul/__init__.py | 2 +-
setup.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/haul/__init__.py b/haul/__init__.py
index 8656e66..0f22849 100644
--- a/haul/__init__.py
+++ b/haul/__init__.py
@@ -1,6 +1,6 @@
# coding: utf-8
-__version__ = '1.0.0'
+__version__ = '1.1.0'
from .api import find_images
from .models import Haul, HaulResult
diff --git a/setup.py b/setup.py
index 0d4aef2..61d38bd 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
setup(
name='haul',
- version='1.0.0',
+ version='1.1.0',
description='An Extensible Image Crawler',
long_description=long_description,
keywords='haul web image content scraper parser crawler',
From 3def58bd08adf8e975686c5132998c77c62fc6e2 Mon Sep 17 00:00:00 2001
From: Vinta
Date: Fri, 4 Oct 2013 10:49:00 +0800
Subject: [PATCH 6/6] update HISTORY
---
HISTORY.rst | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/HISTORY.rst b/HISTORY.rst
index 7b18c1e..a31056f 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -1,9 +1,13 @@
-.. :changelog:
-
History
-------
+1.1.0 (2013-10-04)
+++++++++++++++++++
+
+- Custom finder / extender pipeline support
+
+
1.0.0 (2013-10-03)
++++++++++++++++++
-- Initial Release
+- Initial release