From bcfc8e1cb69de58ffba729f87735cabbba5b4a5e Mon Sep 17 00:00:00 2001 From: Vinta Date: Fri, 4 Oct 2013 10:28:55 +0800 Subject: [PATCH 1/6] custom finder / extender pipeline support --- haul/models.py | 20 ++++++++++++++++---- haul/settings.py | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/haul/models.py b/haul/models.py index f445ae5..c802f6b 100644 --- a/haul/models.py +++ b/haul/models.py @@ -89,9 +89,15 @@ def start_finder_pipeline(self, *args, **kwargs): pipeline_output['pipeline_index'] = idx pipeline_output['pipeline_break'] = False - finder_func = utils.module_member(name) + if hasattr(name, '__call__'): + finder_func = name + else: + finder_func = utils.module_member(name) + output = finder_func(*args, **pipeline_output) - pipeline_output.update(output) + + if isinstance(output, dict): + pipeline_output.update(output) if pipeline_output['pipeline_break']: break @@ -115,9 +121,15 @@ def start_extender_pipeline(self, *args, **kwargs): pipeline_output['pipeline_index'] = idx pipeline_output['pipeline_break'] = False - extender_func = utils.module_member(name) + if hasattr(name, '__call__'): + extender_func = name + else: + extender_func = utils.module_member(name) + output = extender_func(*args, **pipeline_output) - pipeline_output.update(output) + + if isinstance(output, dict): + pipeline_output.update(output) if pipeline_output['pipeline_break']: break diff --git a/haul/settings.py b/haul/settings.py index ef63ffc..f339cd3 100644 --- a/haul/settings.py +++ b/haul/settings.py @@ -1,5 +1,6 @@ # coding: utf-8 +# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser DEFAULT_PARSER = 'lxml' FINDER_PIPELINE = ( From 873f099efaf7f8b08e215a94089e6a49c92ea986 Mon Sep 17 00:00:00 2001 From: Vinta Date: Fri, 4 Oct 2013 10:29:41 +0800 Subject: [PATCH 2/6] test for custom finder pipeline --- tests/fixtures/page.html | 6 +++++ tests/pipeline/test.py | 0 tests/test.py | 52 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 2 deletions(-) delete mode 100644 tests/pipeline/test.py diff --git a/tests/fixtures/page.html b/tests/fixtures/page.html index f2ab7a4..3a02c51 100644 --- a/tests/fixtures/page.html +++ b/tests/fixtures/page.html @@ -15,6 +15,12 @@ some image

+
+ + img with data-src + + +