Merge 9c4c687 into 68d8fd1
elacuesta authored Jul 15, 2021
2 parents 68d8fd1 + 9c4c687 commit 1f51309
Showing 6 changed files with 479 additions and 87 deletions.
165 changes: 128 additions & 37 deletions README.md
@@ -55,6 +55,8 @@ Also, be sure to [install the `asyncio`-based Twisted reactor](https://docs.scra
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
```

### Settings

`scrapy-playwright` accepts the following settings:

* `PLAYWRIGHT_BROWSER_TYPE` (type `str`, default `chromium`)
@@ -67,7 +69,28 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

* `PLAYWRIGHT_CONTEXT_ARGS` (type `dict`, default `{}`)

A dictionary with default keyword arguments to be passed when creating the
"default" Browser context.

**Deprecated: use `PLAYWRIGHT_CONTEXTS` instead**
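
  A minimal migration sketch (assuming the same keyword arguments should apply to the
  default context; `ignore_https_errors` is a real `Browser.new_context` argument, used
  here only as an illustration):
  ```python
  # before (deprecated)
  PLAYWRIGHT_CONTEXT_ARGS = {"ignore_https_errors": True}

  # after: an equivalent definition via the "default" context
  PLAYWRIGHT_CONTEXTS = {"default": {"ignore_https_errors": True}}
  ```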

* `PLAYWRIGHT_CONTEXTS` (type `dict[str, dict]`, default `{}`)

A dictionary which defines Browser contexts to be created on startup.
It should be a mapping of (name, keyword arguments). For instance:
```python
{
    "first": {
        "context_arg1": "value",
        "context_arg2": "value",
    },
    "second": {
        "context_arg1": "value",
    },
}
```
If no contexts are defined, a default context (called `default`) is created.
The arguments passed here take precedence over the ones defined in `PLAYWRIGHT_CONTEXT_ARGS`.
See the docs for [`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browsernew_contextkwargs).
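As a further sketch with real `Browser.new_context` arguments (`viewport`, `user_agent`
and `proxy` exist in the Playwright API; the context names and values here are purely
illustrative):
```python
PLAYWRIGHT_CONTEXTS = {
    "mobile": {
        "viewport": {"width": 414, "height": 896},
        "user_agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)",
    },
    "proxied": {
        "proxy": {"server": "http://myproxy.com:3128"},
    },
}
```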

* `PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT` (type `Optional[int]`, default `None`)
@@ -104,42 +127,7 @@ class AwesomeSpider(scrapy.Spider):
```


## Receiving the Page object in the callback

Specifying a non-False value for the `playwright_include_page` `meta` key for a
request will result in the corresponding `playwright.async_api.Page` object
@@ -176,6 +164,109 @@ class AwesomeSpiderWithPage(scrapy.Spider):
Scrapy request workflow (Scheduler, Middlewares, etc).

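A hedged sketch of the full flow (the spider name and URL are illustrative; the
`playwright_page` meta key is where the handler stores the `Page` object):

```python
import scrapy


class PageSpider(scrapy.Spider):
    name = "page_example"

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",
            meta={"playwright": True, "playwright_include_page": True},
        )

    async def parse(self, response):
        # the callback must be a coroutine function to await on the Page
        page = response.meta["playwright_page"]
        title = await page.title()
        await page.close()  # close the page once it is no longer needed
        return {"title": title}
```
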

## Multiple browser contexts

Multiple [browser contexts](https://playwright.dev/python/docs/core-concepts/#browser-contexts)
to be launched at startup can be defined via the `PLAYWRIGHT_CONTEXTS` [setting](#settings).

### Choosing a specific context for a request

Pass the name of the desired context in the `playwright_context` meta key:

```python
yield scrapy.Request(
    url="https://example.org",
    meta={"playwright": True, "playwright_context": "first"},
)
```

### Creating a context during a crawl

If the context specified in the `playwright_context` meta key does not exist, it will be created.
You can specify keyword arguments to be passed to
[`Browser.new_context`](https://playwright.dev/python/docs/api/class-browser#browsernew_contextkwargs)
in the `playwright_context_kwargs` meta key:

```python
yield scrapy.Request(
    url="https://example.org",
    meta={
        "playwright": True,
        "playwright_context": "new",
        "playwright_context_kwargs": {
            "java_script_enabled": False,
            "ignore_https_errors": True,
            "proxy": {
                "server": "http://myproxy.com:3128",
                "username": "user",
                "password": "pass",
            },
        },
    },
)
```

Please note that if a context with the specified name already exists,
that context is used and `playwright_context_kwargs` are ignored.

### Closing a context during a crawl

After [receiving the Page object in your callback](#receiving-the-page-object-in-the-callback),
you can access a context through the corresponding [`Page.context`](https://playwright.dev/python/docs/api/class-page#page-context)
attribute, and await [`close`](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-close) on it.

```python
def parse(self, response):
    yield scrapy.Request(
        url="https://example.org",
        callback=self.parse_in_new_context,
        meta={"playwright": True, "playwright_context": "new", "playwright_include_page": True},
    )

async def parse_in_new_context(self, response):
    page = response.meta["playwright_page"]
    title = await page.title()
    await page.context.close()  # close the context
    await page.close()
    return {"title": title}
```


## Page coroutines

An ordered iterable (`list`, `tuple` or `dict`, for instance) can be passed
in the `playwright_page_coroutines`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
key to request coroutines to be awaited on the `Page` before returning the final
`Response` to the callback.

This is useful when you need to perform certain actions on a page, like scrolling
down or clicking links, and you want everything to count as a single Scrapy
Response, containing the final result.

### Supported actions

* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:

_Represents a coroutine to be awaited on a `playwright.async_api.Page` object,
such as "click", "screenshot", "evaluate", etc.
`method` should be the name of the coroutine; `*args` and `**kwargs`
are passed to the function call._

_The coroutine result will be stored in the `PageCoroutine.result` attribute._

For instance,
```python
PageCoroutine("screenshot", path="quotes.png", fullPage=True)
```

produces the same effect as:
```python
# 'page' is a playwright.async_api.Page object
await page.screenshot(path="quotes.png", full_page=True)
```
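
Putting it together, a sketch of several page coroutines chained on a single request
(the URL and selector are illustrative; `wait_for_selector` and `evaluate` are real
`Page` coroutines):
```python
import scrapy

from scrapy_playwright.page import PageCoroutine


class ScrollSpider(scrapy.Spider):
    name = "scroll_example"

    def start_requests(self):
        yield scrapy.Request(
            url="http://quotes.toscrape.com/scroll",
            meta={
                "playwright": True,
                "playwright_page_coroutines": [
                    # wait for the first quote, scroll down, then take a screenshot
                    PageCoroutine("wait_for_selector", "div.quote"),
                    PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageCoroutine("screenshot", path="quotes.png", full_page=True),
                ],
            },
        )
```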


## Examples

**Click on a link, save the resulting page as PDF**
30 changes: 18 additions & 12 deletions examples/books.py
@@ -1,6 +1,7 @@
import hashlib
import logging
from pathlib import Path
from typing import Generator, Optional

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
@@ -12,25 +13,23 @@ class BooksSpider(Spider):

name = "books"
start_urls = ["http://books.toscrape.com"]

    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
        page_count = int(page_count)
        for page in range(2, page_count + 1):
            yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page})

        current_page = current_page or 1
        for book in response.css("article.product_pod a"):
            yield response.follow(
                book,
                callback=self.parse_book,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    # one browser context per catalogue page
                    "playwright_context": f"page-{current_page}",
                },
            )

    async def parse_book(self, response: Response) -> dict:
@@ -57,7 +56,14 @@ async def parse_book(self, response: Response) -> dict:
# "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"CONCURRENT_REQUESTS": 32,
"CLOSESPIDER_ITEMCOUNT": 100,
"FEEDS": {
"books.json": {"format": "json", "encoding": "utf-8", "indent": 4},
},
}
)
process.crawl(BooksSpider)
logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)
process.start()
107 changes: 107 additions & 0 deletions examples/contexts.py
@@ -0,0 +1,107 @@
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess


class MultipleContextsSpider(Spider):
    """Handle multiple browser contexts"""

    name = "contexts"
    custom_settings = {
        "PLAYWRIGHT_CONTEXTS": {
            "first": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://httpbin.org/headers",
                            "name": "context",
                            "value": "first",
                        },
                    ],
                },
            },
            "second": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://httpbin.org/headers",
                            "name": "context",
                            "value": "second",
                        },
                    ],
                },
            },
        },
    }

    def start_requests(self):
        # using existing contexts
        yield Request(
            url="https://httpbin.org/headers",
            meta={
                "playwright": True,
                "playwright_context": "first",
                "playwright_include_page": True,
            },
            dont_filter=True,
        )
        yield Request(
            url="https://httpbin.org/headers",
            meta={
                "playwright": True,
                "playwright_context": "second",
                "playwright_include_page": True,
            },
            dont_filter=True,
        )
        # create a new context
        yield Request(
            url="https://httpbin.org/headers",
            meta={
                "playwright": True,
                "playwright_context": "third",
                "playwright_context_kwargs": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://httpbin.org/headers",
                                "name": "context",
                                "value": "third",
                            },
                        ],
                    },
                },
                "playwright_include_page": True,
            },
            dont_filter=True,
        )
        # default context
        yield Request(
            url="https://httpbin.org/headers",
            meta={"playwright": True, "playwright_include_page": True},
            dont_filter=True,
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        context_name = response.meta["playwright_context"]
        storage_state = await page.context.storage_state()
        await page.context.close()
        return {
            "url": response.url,
            "context": context_name,
            "cookies": storage_state["cookies"],
        }


if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "DOWNLOAD_HANDLERS": {
                "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
                # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            },
        }
    )
    process.crawl(MultipleContextsSpider)
    process.start()