Skip to content

Commit

Permalink
Rename PageCoroutine -> PageMethod (scrapy-plugins#70)
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta authored Mar 26, 2022
1 parent 9ee07aa commit a06b3b1
Show file tree
Hide file tree
Showing 14 changed files with 314 additions and 198 deletions.
104 changes: 72 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,54 +326,75 @@ async def parse_in_new_context(self, response):
```


## Page coroutines
## Executing actions on pages

A sorted iterable (for instance a `list`, `tuple`, or `dict`) can be passed
in the `playwright_page_coroutines`
in the `playwright_page_methods`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
key to request methods to be called (and awaited if necessary) on the `Page` before returning the final
`Response` to the callback.

This is useful when you need to perform certain actions on a page, like scrolling
down or clicking links, and you want everything to count as a single Scrapy
Response, containing the final result.
down or clicking links, and you want to handle only the final result in your callback.

### `PageCoroutine` class
### `PageMethod` class

* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:
#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`:

Represents a coroutine to be awaited on a `playwright.page.Page` object,
such as "click", "screenshot", "evaluate", etc.
`method` should be the name of the coroutine, `*args` and `**kwargs`
are passed to the function call. The return value of the coroutine call
will be stored in the `PageCoroutine.result` attribute.
Represents a method to be called (and awaited if necessary) on a
`playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc.
`method` is the name of the method, `*args` and `**kwargs`
are passed when calling such method. The return value
will be stored in the `PageMethod.result` attribute.

For instance,
```python
PageCoroutine("screenshot", path="quotes.png", fullPage=True)
```
For instance,
```python
def start_requests(self):
yield Request(
url="https://example.org",
meta={
"playwright": True,
"playwright_page_methods": [
PageMethod("screenshot", path="example.png", fullPage=True),
],
},
)

produces the same effect as:
```python
# 'page' is a playwright.async_api.Page object
await page.screenshot(path="quotes.png", fullPage=True)
```
def parse(self, response):
screenshot = response.meta["playwright_page_methods"][0]
# screenshot.result contains the image's bytes
```

produces the same effect as:
```python
def start_requests(self):
yield Request(
url="https://example.org",
meta={"playwright": True, "playwright_include_page": True},
)

### Supported coroutines
async def parse(self, response):
page = response.meta["playwright_page"]
await page.screenshot(path="example.png", full_page=True)
await page.close()
```


### Supported methods

Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
to see available coroutines
to see available methods.

### Impact on Response objects

Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last
action performed on a page. If you issue a `PageCoroutine` with an action that results in
action performed on a page. If you issue a `PageMethod` with an action that results in
a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the
new URL, which might be different from the request's URL.


## Page events

A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
Keys are the name of the event to be handled (`dialog`, `download`, etc).
Expand Down Expand Up @@ -430,15 +451,15 @@ class ClickAndSavePdfSpider(scrapy.Spider):
url="https://example.org",
meta=dict(
playwright=True,
playwright_page_coroutines={
"click": PageCoroutine("click", selector="a"),
"pdf": PageCoroutine("pdf", path="/tmp/file.pdf"),
playwright_page_methods={
"click": PageMethod("click", selector="a"),
"pdf": PageMethod("pdf", path="/tmp/file.pdf"),
},
),
)

def parse(self, response):
pdf_bytes = response.meta["playwright_page_coroutines"]["pdf"].result
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
with open("iana.pdf", "wb") as fp:
fp.write(pdf_bytes)
yield {"url": response.url} # response.url is "https://www.iana.org/domains/reserved"
Expand All @@ -456,10 +477,10 @@ class ScrollSpider(scrapy.Spider):
meta=dict(
playwright=True,
playwright_include_page=True,
playwright_page_coroutines=[
PageCoroutine("wait_for_selector", "div.quote"),
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
playwright_page_methods=[
PageMethod("wait_for_selector", "div.quote"),
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
],
),
)
Expand Down Expand Up @@ -487,7 +508,14 @@ For more examples, please see the scripts in the [examples](examples) directory.
Refer to the [Proxy support](#proxy-support) section for more information.


##  Deprecations
##  Deprecation policy

Deprecated features will be supported for at least six months
following the release that deprecated them. After that, they
may be removed at any time. See the [changelog](changelog.md)
for more information about deprecations and removals.

### Currently deprecated features

* `PLAYWRIGHT_CONTEXT_ARGS` setting (type `dict`, default `{}`)

Expand All @@ -497,3 +525,15 @@ For more examples, please see the scripts in the [examples](examples) directory.
Deprecated since
[`v0.0.4`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4),
use the `PLAYWRIGHT_CONTEXTS` setting instead

* `scrapy_playwright.page.PageCoroutine` class

Deprecated since
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
use `scrapy_playwright.page.PageMethod` instead

* `playwright_page_coroutines` Request meta key

Deprecated since
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
use `playwright_page_methods` instead
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# scrapy-playwright changelog

### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26)

* Renamed `PageCoroutine` to `PageMethod` (`PageCoroutine` is now deprecated)


### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24)

* PageCoroutine checks
Expand Down
6 changes: 3 additions & 3 deletions examples/cookies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class CookieSpider(Spider):
Expand All @@ -18,8 +18,8 @@ def start_requests(self):
cookies={"foo": "bar"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "cookies.png", full_page=True
),
],
Expand Down
6 changes: 3 additions & 3 deletions examples/events.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from playwright.async_api import Dialog, Response as PlaywrightResponse
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class EventsSpider(Spider):
Expand All @@ -16,8 +16,8 @@ def start_requests(self):
url="https://example.org",
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine("evaluate", "alert('foobar');"),
"playwright_page_methods": [
PageMethod("evaluate", "alert('foobar');"),
],
"playwright_page_event_handlers": {
"dialog": self.handle_dialog,
Expand Down
6 changes: 3 additions & 3 deletions examples/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class HandleTimeoutMiddleware:
Expand All @@ -13,8 +13,8 @@ def process_exception(self, request, exception, spider):
url="https://httpbin.org/get",
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "recovered.png", full_page=True
),
],
Expand Down
6 changes: 3 additions & 3 deletions examples/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class PostSpider(Spider):
Expand All @@ -18,8 +18,8 @@ def start_requests(self):
formdata={"foo": "bar"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "post.png", full_page=True
),
],
Expand Down
13 changes: 7 additions & 6 deletions examples/scroll.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class ScrollSpider(Spider):
Expand All @@ -18,11 +18,11 @@ def start_requests(self):
cookies={"foo": "bar", "asdf": "qwerty"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine("wait_for_selector", "div.quote"),
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
PageCoroutine(
"playwright_page_methods": [
PageMethod("wait_for_selector", "div.quote"),
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
PageMethod(
"screenshot", path=Path(__file__).parent / "scroll.png", full_page=True
),
],
Expand All @@ -41,6 +41,7 @@ def parse(self, response):
# "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"LOG_LEVEL": "INFO",
}
)
process.crawl(ScrollSpider)
Expand Down
6 changes: 3 additions & 3 deletions examples/storage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class StorageSpider(Spider):
Expand All @@ -16,8 +16,8 @@ def start_requests(self):
meta={
"playwright": True,
"playwright_include_page": True,
"playwright_page_coroutines": [
PageCoroutine("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
"playwright_page_methods": [
PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
],
},
)
Expand Down
50 changes: 31 additions & 19 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
Expand All @@ -30,7 +31,7 @@
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

from scrapy_playwright.headers import use_scrapy_headers
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


__all__ = ["ScrapyPlaywrightDownloadHandler"]
Expand Down Expand Up @@ -96,7 +97,7 @@ def __init__(self, crawler: Crawler) -> None:
"The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
" PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
" PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context",
category=DeprecationWarning,
category=ScrapyDeprecationWarning,
stacklevel=2,
)
self.context_kwargs: defaultdict = defaultdict(dict)
Expand Down Expand Up @@ -250,23 +251,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
start_time = time()
response = await page.goto(request.url)

page_coroutines = request.meta.get("playwright_page_coroutines") or ()
if isinstance(page_coroutines, dict):
page_coroutines = page_coroutines.values()
for pc in page_coroutines:
if isinstance(pc, PageCoroutine):
try:
method = getattr(page, pc.method)
except AttributeError:
logger.warning(f"Ignoring {repr(pc)}: could not find coroutine")
else:
result = method(*pc.args, **pc.kwargs)
pc.result = await result if isawaitable(result) else result
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(
f"Ignoring {repr(pc)}: expected PageCoroutine, got {repr(type(pc))}"
)
await self._apply_page_methods(page, request)

body_str = await page.content()
request.meta["download_latency"] = time() - start_time
Expand Down Expand Up @@ -300,6 +285,33 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
ip_address=server_ip_address,
)

async def _apply_page_methods(self, page: Page, request: Request) -> None:
    """Invoke each requested method on the Playwright page.

    Reads the "playwright_page_methods" request meta key (falling back to
    the deprecated "playwright_page_coroutines" key) and calls every
    PageMethod entry on the given page, awaiting the call when it returns
    an awaitable. Each return value is stored in the entry's ``result``
    attribute; non-PageMethod entries and unknown method names are logged
    and skipped.
    """
    page_methods = request.meta.get("playwright_page_methods") or ()

    # Backward compatibility: honor the old meta key, but emit a deprecation warning.
    if not page_methods and "playwright_page_coroutines" in request.meta:
        page_methods = request.meta["playwright_page_coroutines"]
        warnings.warn(
            "The 'playwright_page_coroutines' request meta key is deprecated,"
            " please use 'playwright_page_methods' instead.",
            category=ScrapyDeprecationWarning,
            stacklevel=1,
        )

    # A dict maps arbitrary user-chosen labels to PageMethod objects;
    # only the values are executed here.
    if isinstance(page_methods, dict):
        page_methods = page_methods.values()

    for entry in page_methods:
        if not isinstance(entry, PageMethod):
            logger.warning(f"Ignoring {entry!r}: expected PageMethod, got {type(entry)!r}")
            continue
        try:
            bound_method = getattr(page, entry.method)
        except AttributeError:
            logger.warning(f"Ignoring {entry!r}: could not find method")
            continue
        outcome = bound_method(*entry.args, **entry.kwargs)
        entry.result = await outcome if isawaitable(outcome) else outcome
        # The action may have triggered a navigation (e.g. a click on a
        # link); wait for the page to settle before the next method.
        await page.wait_for_load_state(timeout=self.default_navigation_timeout)

def _increment_request_stats(self, request: PlaywrightRequest) -> None:
stats_prefix = "playwright/request_count"
self.stats.inc_value(stats_prefix)
Expand Down
Loading

0 comments on commit a06b3b1

Please sign in to comment.