add playwright page.goto kwargs (scrapy-plugins#54)
Pandaaaa906 authored Jul 17, 2022
1 parent 683ee5f commit bd9bd95
Showing 4 changed files with 44 additions and 6 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -233,9 +233,15 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
callback=self.parse_headers,
meta={"playwright": True, "playwright_page": page},
)

```

* `playwright_page_goto_kwargs` (type `dict`, default `{}`)

A dictionary with keyword arguments to be passed to the page's
[`goto` method](https://playwright.dev/python/docs/api/class-page#page-goto)
when navigating to a URL. If present, the `url` key is ignored; the request's
URL is used instead.

* `playwright_security_details` (type `Optional[dict]`, read only)

A dictionary with [security information](https://playwright.dev/python/docs/api/class-response#response-security-details)
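For illustration, a minimal spider sketch of how the new `playwright_page_goto_kwargs` meta key could be used. The target URL, referer value, and spider name are hypothetical; `wait_until` and `referer` are standard keyword arguments of `page.goto` in Playwright for Python:

```python
import scrapy


class GotoKwargsSpider(scrapy.Spider):
    """Minimal sketch: forward extra keyword arguments to Playwright's page.goto()."""

    name = "goto_kwargs"  # hypothetical spider name

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",  # hypothetical target
            meta={
                "playwright": True,
                # Passed through to page.goto(); a "url" key, if present,
                # is dropped in favor of the request's URL.
                "playwright_page_goto_kwargs": {
                    "wait_until": "networkidle",
                    "referer": "https://example.org/previous-page",
                },
            },
            callback=self.parse,
        )

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```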
4 changes: 3 additions & 1 deletion scrapy_playwright/handler.py
@@ -288,7 +288,9 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
request.meta["playwright_page"] = page

start_time = time()
response = await page.goto(request.url)
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
page_goto_kwargs.pop("url", None)
response = await page.goto(url=request.url, **page_goto_kwargs)
await self._apply_page_methods(page, request)
body_str = await page.content()
request.meta["download_latency"] = time() - start_time
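The `pop("url", None)` in the handler change above is needed because `page.goto` is already given `url=request.url`; supplying the same keyword twice would raise a `TypeError`. A standalone sketch of that general Python behavior (a plain stand-in function, not the Playwright API):

```python
def goto(url, **kwargs):
    """Stand-in for page.goto(url, **kwargs); not the real Playwright call."""
    return url, kwargs


kwargs = {"url": "https://other.example", "referer": "https://example.org"}

# Without the pop, the same keyword would be supplied twice:
#   goto(url="https://real.example", **kwargs)
#   TypeError: goto() got multiple values for keyword argument 'url'

kwargs.pop("url", None)  # mirrors the handler's safeguard
print(goto(url="https://real.example", **kwargs))
# ('https://real.example', {'referer': 'https://example.org'})
```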
13 changes: 12 additions & 1 deletion tests/mockserver.py
@@ -6,10 +6,19 @@
from pathlib import Path
from subprocess import Popen, PIPE
from threading import Thread
from typing import Optional
from urllib.parse import urljoin


class StaticMockServer:
"""A web server that serves the contents of the sibling "site" directory.
To be used as a context manager:
with StaticMockServer() as server:
url = server.urljoin("/index.html")
...
"""

def __enter__(self):
self.proc = Popen(
[sys.executable, "-u", "-m", "http.server", "0", "--bind", "127.0.0.1"],
@@ -58,6 +67,8 @@ def do_GET(self):


class MockServer:
"""A context manager web server using the _RequestHandler class to handle requests."""

def __enter__(self):
self.httpd = HTTPServer(("127.0.0.1", 0), _RequestHandler)
self.address, self.port = self.httpd.server_address
@@ -69,7 +80,7 @@ def __exit__(self, exc_type, exc_value, traceback):
self.httpd.shutdown()
self.thread.join()

def urljoin(self, url: str) -> str:
def urljoin(self, url: Optional[str] = None) -> str:
return urljoin(f"http://{self.address}:{self.port}", url)


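For context, a rough sketch of how these helpers are typically driven from a test, assuming the `tests.mockserver` module layout above. Note that `MockServer.urljoin()` can now be called with no argument and returns the server root, since `urllib.parse.urljoin(base, None)` returns the base:

```python
from scrapy import Request

from tests.mockserver import MockServer, StaticMockServer

# Both servers are context managers: they start on __enter__ and shut down
# on __exit__, so a test only has to build URLs and issue requests.
with StaticMockServer() as static_server:
    index_url = static_server.urljoin("/index.html")

with MockServer() as server:
    root_url = server.urljoin()  # no argument -> server root
    headers_url = server.urljoin("/headers")
    req = Request(url=headers_url, meta={"playwright": True})
```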
25 changes: 22 additions & 3 deletions tests/test_playwright_requests.py
@@ -1,3 +1,4 @@
import json
import logging
import platform
import subprocess
@@ -321,15 +322,33 @@ async def test_event_handler_dialog_missing(self, caplog):
async def test_response_attributes(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
spider = DialogSpider()
req = Request(
url=server.urljoin("/index.html"),
url=server.urljoin(),
meta={"playwright": True},
)
response = await handler._download_request(req, spider)
response = await handler._download_request(req, Spider("spider_name"))

assert response.ip_address == ip_address(server.address)

@pytest.mark.asyncio
async def test_page_goto_kwargs_referer(self):
if self.browser_type != "chromium":
pytest.skip("referer as goto kwarg seems to work only with chromium :shrug:")
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
fake_referer = server.urljoin("/fake/referer")
req = Request(
url=server.urljoin("/headers"),
meta={
"playwright": True,
"playwright_page_goto_kwargs": {"referer": fake_referer},
},
)
response = await handler._download_request(req, Spider("spider_name"))

headers = json.loads(response.css("pre::text").get())
assert headers["Referer"] == fake_referer

@pytest.mark.asyncio
async def test_abort_requests(self):
async def should_abort_request_async(request):
