@@ -81,21 +81,20 @@ library.
Within the spider, consuming the AutoExtract result is as easy as::

    import scrapy
-    from autoextract_poet import AutoExtractArticleData
+    from autoextract_poet.pages import AutoExtractArticlePage

    class SampleSpider(scrapy.Spider):
-
        name = "sample"

-        def parse(self, response, article: AutoExtractArticleData):
+        def parse(self, response, article_page: AutoExtractArticlePage):
            # We're making two requests here:
            # - one through Scrapy to build the response argument
-            # - another through providers to build the article argument
-            yield article.to_item()
+            # - the other through the providers to build the article_page argument
+            yield article_page.to_item()

Note that in the example above, we're going to perform two requests:

-* one goes through Scrapy (it might use Crawlera, Splash or no proxy at all, depending on your configuration)
+* one goes through Scrapy (it might use Smart Proxy, Splash or no proxy at all, depending on your configuration)
* another goes through AutoExtract API using `zyte-autoextract`_

If you don't need the additional request going through Scrapy,
@@ -105,16 +104,31 @@ This will ignore the Scrapy request and only the AutoExtract API will be fetched
For example::

    import scrapy
-    from autoextract_poet import AutoExtractArticleData
+    from autoextract_poet.pages import AutoExtractArticlePage
    from scrapy_poet import DummyResponse

    class SampleSpider(scrapy.Spider):
-
        name = "sample"

-        def parse(self, response: DummyResponse, article: AutoExtractArticleData):
+        def parse(self, response: DummyResponse, article_page: AutoExtractArticlePage):
            # We're making a single request here to build the article argument
-            yield article.to_item()
+            yield article_page.to_item()
+
+
+The examples above extract an article from the page, but you may want to
+extract a different type of item, like a product or a job posting. It is
+as easy as using the correct type annotation in the callback. This is
+what the callback looks like if we need to extract real estate data
+from the page::
+
+    def parse(self,
+              response: DummyResponse,
+              real_estate_page: AutoExtractRealEstatePage):
+        yield real_estate_page.to_item()
+
+You can even use ``AutoExtractWebPage`` if what you need is the raw browser HTML
+to extract some additional data. Visit the full list of `supported page types`_
+to get a better idea of what is available.
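For instance, a callback working on the raw browser HTML might look roughly like
the sketch below; this assumes ``AutoExtractWebPage`` exposes the usual web-poet
response shortcuts such as ``css``::

    def parse(self, response: DummyResponse, web_page: AutoExtractWebPage):
        # Assumption: web_page wraps the browser HTML returned by AutoExtract
        # and offers the standard CSS/XPath selector shortcuts.
        yield {"title": web_page.css("title::text").get()}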

Configuration
^^^^^^^^^^^^^
@@ -164,27 +178,30 @@ You can capture those exceptions using an error callback (``errback``)::

    import scrapy
    from autoextract.aio.errors import RequestError
+    from autoextract_poet.pages import AutoExtractArticlePage
    from scrapy_autoextract.errors import QueryError
+    from scrapy_poet import DummyResponse
    from twisted.python.failure import Failure

    class SampleSpider(scrapy.Spider):
-
        name = "sample"
        urls = [...]

        def start_requests(self):
            for url in self.urls:
-                yield scrapy.Request(url, callback=self.parse_article, errback=self.errback_article)
+                yield scrapy.Request(url, callback=self.parse_article,
+                                     errback=self.errback_article)

-        def parse_article(self, response: DummyResponse, article: AutoExtractArticleData):
-            yield article.to_item()
+        def parse_article(self, response: DummyResponse,
+                          article_page: AutoExtractArticlePage):
+            yield article_page.to_item()

        def errback_article(self, failure: Failure):
            if failure.check(RequestError):
-                self.logger.error(f"RequestError on {failure.request.url})
+                self.logger.error(f"RequestError on {failure.request.url}")

            if failure.check(QueryError):
-                self.logger.error(f"QueryError: {failure.message})
+                self.logger.error(f"QueryError: {failure.value.message}")

See `Scrapy documentation <https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing>`_
for more details on how to capture exceptions using a request's errback.
@@ -254,9 +271,6 @@ When using the AutoExtract middleware, there are some limitations.
When using the AutoExtract providers, be aware that:

* With scrapy-poet integration, retry requests don't go through Scrapy
-* Not all data types are supported with scrapy-poet,
-  currently only Articles, Products and Product Lists are supported with
-  `autoextract-poet`_

.. _`web-poet`: https://github.com/scrapinghub/web-poet
.. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet
@@ -267,3 +281,4 @@ When using the AutoExtract providers, be aware that:
.. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
.. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
.. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level
+.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages