Skip to content

Commit

Permalink
Merge branch 'release/v1.1.0'
Browse files (browse the repository at this point in the history)
  • Loading branch information
vinta committed Oct 4, 2013
2 parents (2083567 + 3def58b), commit 83454ea
Show file tree
Hide file tree
Showing 14 changed files with 143 additions and 64 deletions.
10 changes: 7 additions & 3 deletions HISTORY.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
.. :changelog:
History
-------

1.1.0 (2013-10-04)
++++++++++++++++++

- Custom finder / extender pipeline support


1.0.0 (2013-10-03)
++++++++++++++++++

- Initial Release
- Initial release
2 changes: 1 addition & 1 deletion haul/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# coding: utf-8

__version__ = '1.0.0'
__version__ = '1.1.0'

from .api import find_images
from .models import Haul, HaulResult
24 changes: 15 additions & 9 deletions haul/extenders/pipeline/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
import re


def blogspot_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def blogspot_s1600_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://1.bp.blogspot.com/-S97wTYQKbrY/UkWukhKhTKI/AAAAAAAAJ0g/fcRDiqVC8Us/s898/aaPOP+001.jpg
to
http://1.bp.blogspot.com/-S97wTYQKbrY/UkWukhKhTKI/AAAAAAAAJ0g/fcRDiqVC8Us/s1600/aaPOP+001.jpg
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'/s\d+/', re.IGNORECASE)
Expand All @@ -23,20 +25,22 @@ def blogspot_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output


def ggpht_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def ggpht_s1600_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://lh4.ggpht.com/-fFi-qJRuxeY/UjwHSOTHGOI/AAAAAAAArgE/SWTMT-hXzB4/s640/Celeber-ru-Emma-Watson-Net-A-Porter-The-Edit-Magazine-Photoshoot-2013-01.jpg
to
http://lh4.ggpht.com/-fFi-qJRuxeY/UjwHSOTHGOI/AAAAAAAArgE/SWTMT-hXzB4/s1600/Celeber-ru-Emma-Watson-Net-A-Porter-The-Edit-Magazine-Photoshoot-2013-01.jpg
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'/s\d+/', re.IGNORECASE)
Expand All @@ -48,20 +52,22 @@ def ggpht_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output


def googleusercontent_s1600_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def googleusercontent_s1600_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
https://lh6.googleusercontent.com/-T6V-utZHzbE/Ukjn-1MDOSI/AAAAAAAAA3g/H6Qcw1zt4n0/w555-h399-no/2101_aa2cac09d1c6431b8a635d61cd9c4471.jpg
to
https://lh6.googleusercontent.com/-T6V-utZHzbE/Ukjn-1MDOSI/AAAAAAAAA3g/H6Qcw1zt4n0/s1600/2101_aa2cac09d1c6431b8a635d61cd9c4471.jpg
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'/w\d+\-h\d+\-no/', re.IGNORECASE)
Expand All @@ -73,6 +79,6 @@ def googleusercontent_s1600_extender(pipeline_index, finder_image_urls, *args, *
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output
8 changes: 5 additions & 3 deletions haul/extenders/pipeline/pinterest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import re


def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def original_image_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://media-cache-ec0.pinimg.com/70x/50/9b/bd/509bbd5c6543d473bc2b49befe75f4c6.jpg
Expand All @@ -13,7 +16,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://media-cache-ec0.pinimg.com/originals/50/9b/bd/509bbd5c6543d473bc2b49befe75f4c6.jpg
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'.com/\d+x/', re.IGNORECASE)
Expand All @@ -25,6 +27,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output
18 changes: 10 additions & 8 deletions haul/extenders/pipeline/tumblr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import re


def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def media_1280_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://25.media.tumblr.com/3f5f10d7216f1dd5eacb5eb3e302286a/tumblr_mtpcwdzKBT1qh9n5lo1_250.png
Expand All @@ -12,7 +15,6 @@ def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://25.media.tumblr.com/3f5f10d7216f1dd5eacb5eb3e302286a/tumblr_mtpcwdzKBT1qh9n5lo1_1280.png
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'(tumblr_[a-zA-Z0-9_]+)_(\d+).', re.IGNORECASE)
Expand All @@ -24,22 +26,22 @@ def media_1280_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output


def avatar_128_extender(pipeline_index, *args, **kwargs):
def avatar_128_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://25.media.tumblr.com/avatar_2909d6610c26_16.png
to
http://25.media.tumblr.com/avatar_2909d6610c26_128.png
"""

finder_image_urls = kwargs.get('finder_image_urls', [])

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

search_re = re.compile(r'(avatar_[a-zA-Z0-9_]+)_(\d+).', re.IGNORECASE)
Expand All @@ -51,6 +53,6 @@ def avatar_128_extender(pipeline_index, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output
8 changes: 5 additions & 3 deletions haul/extenders/pipeline/wordpress.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import re


def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
def original_image_extender(pipeline_index,
finder_image_urls,
extender_image_urls=[],
*args, **kwargs):
"""
Example:
http://fashion-fever.nl/wp-content/upload/2013/09/DSC_0058-110x110.jpg
Expand All @@ -13,7 +16,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
http://www.wendyslookbook.com/wp-content/uploads/2013/09/Morning-Coffee-Run-7.jpg
"""

pre_extender_image_urls = kwargs.get('extender_image_urls', [])
now_extender_image_urls = []

check_re = re.compile(r'wp-content/uploads?/', re.IGNORECASE)
Expand All @@ -26,6 +28,6 @@ def original_image_extender(pipeline_index, finder_image_urls, *args, **kwargs):
now_extender_image_urls.append(extender_image_url)

output = {}
output['extender_image_urls'] = pre_extender_image_urls + now_extender_image_urls
output['extender_image_urls'] = extender_image_urls + now_extender_image_urls

return output
13 changes: 8 additions & 5 deletions haul/finders/pipeline/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@
from haul.utils import in_ignorecase


def background_image_finder(pipeline_index, soup, *args, **kwargs):
def background_image_finder(pipeline_index,
soup,
finder_image_urls=[],
*args, **kwargs):
"""
Find image URL in background-image
Example:
<div style="width: 100%; height: 100%; background-image: url(http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg);" class="Image iLoaded iWithTransition Frame" src="http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg"></div>
to
http://distilleryimage10.ak.instagram.com/bde04558a43b11e28e5d22000a1f979a_7.jpg
"""

pre_finder_image_urls = kwargs.get('finder_image_urls', [])

now_finder_image_urls = []

for tag in soup.find_all(style=True):
Expand All @@ -26,11 +29,11 @@ def background_image_finder(pipeline_index, soup, *args, **kwargs):
for property_value in background_image.propertyValue:
background_image_url = str(property_value.value)
if background_image_url:
if (not in_ignorecase(background_image_url, pre_finder_image_urls)) and \
if (not in_ignorecase(background_image_url, finder_image_urls)) and \
(not in_ignorecase(background_image_url, now_finder_image_urls)):
now_finder_image_urls.append(background_image_url)

output = {}
output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
output['finder_image_urls'] = finder_image_urls + now_finder_image_urls

return output
24 changes: 14 additions & 10 deletions haul/finders/pipeline/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,48 @@
from haul.utils import in_ignorecase


def img_src_finder(pipeline_index, soup, *args, **kwargs):
def img_src_finder(pipeline_index,
soup,
finder_image_urls=[],
*args, **kwargs):
"""
Find image URL in <img>'s src attribute
"""

pre_finder_image_urls = kwargs.get('finder_image_urls', [])

now_finder_image_urls = []

for img in soup.find_all('img'):
src = img.get('src', None)
if src:
if (not in_ignorecase(src, pre_finder_image_urls)) and \
if (not in_ignorecase(src, finder_image_urls)) and \
(not in_ignorecase(src, now_finder_image_urls)):
now_finder_image_urls.append(src)

output = {}
output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
output['finder_image_urls'] = finder_image_urls + now_finder_image_urls

return output


def a_href_finder(pipeline_index, soup, *args, **kwargs):
def a_href_finder(pipeline_index,
soup,
finder_image_urls=[],
*args, **kwargs):
"""
Find image URL in <a>'s href attribute
"""

pre_finder_image_urls = kwargs.get('finder_image_urls', [])

now_finder_image_urls = []

for a in soup.find_all('a'):
href = a.get('href', None)
if href:
if filter(href.lower().endswith, ('.jpg', '.jpeg', '.gif', '.png')):
if (not in_ignorecase(href, pre_finder_image_urls)) and \
if (not in_ignorecase(href, finder_image_urls)) and \
(not in_ignorecase(href, now_finder_image_urls)):
now_finder_image_urls.append(href)

output = {}
output['finder_image_urls'] = pre_finder_image_urls + now_finder_image_urls
output['finder_image_urls'] = finder_image_urls + now_finder_image_urls

return output
34 changes: 16 additions & 18 deletions haul/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@


simple_url_re = re.compile(r'^https?://\[?\w', re.IGNORECASE)
simple_url_2_re = re.compile(r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)$', re.IGNORECASE)


class Haul(object):
Expand Down Expand Up @@ -89,9 +88,15 @@ def start_finder_pipeline(self, *args, **kwargs):
pipeline_output['pipeline_index'] = idx
pipeline_output['pipeline_break'] = False

finder_func = utils.module_member(name)
if hasattr(name, '__call__'):
finder_func = name
else:
finder_func = utils.module_member(name)

output = finder_func(*args, **pipeline_output)
pipeline_output.update(output)

if isinstance(output, dict):
pipeline_output.update(output)

if pipeline_output['pipeline_break']:
break
Expand All @@ -115,9 +120,15 @@ def start_extender_pipeline(self, *args, **kwargs):
pipeline_output['pipeline_index'] = idx
pipeline_output['pipeline_break'] = False

extender_func = utils.module_member(name)
if hasattr(name, '__call__'):
extender_func = name
else:
extender_func = utils.module_member(name)

output = extender_func(*args, **pipeline_output)
pipeline_output.update(output)

if isinstance(output, dict):
pipeline_output.update(output)

if pipeline_output['pipeline_break']:
break
Expand Down Expand Up @@ -177,8 +188,6 @@ def __init__(self):
self.title = None
self.finder_image_urls = []
self.extender_image_urls = []
# self.finder_image_file = None
# self.extender_image_file = None

def __repr__(self):
return '<HaulResult [Content-Type: %s]>' % (self.content_type)
Expand All @@ -197,16 +206,5 @@ def image_urls(self):

return all_image_urls

# @property
# def image_file(self):
# if self.extender_image_file:
# which = self.extender_image_file
# elif self.finder_image_file:
# which = self.finder_image_file
# else:
# which = None

# return which

def to_dict(self):
return self.__dict__
1 change: 1 addition & 0 deletions haul/settings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# coding: utf-8

# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
DEFAULT_PARSER = 'lxml'

FINDER_PIPELINE = (
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

setup(
name='haul',
version='1.0.0',
version='1.1.0',
description='An Extensible Image Crawler',
long_description=long_description,
keywords='haul web image content scraper parser crawler',
Expand Down
Loading

0 comments on commit 83454ea

Please sign in to comment.