add playwright page.goto kwargs (scrapy-plugins#54)
Pandaaaa906 authored Jul 17, 2022
1 parent 683ee5f commit bd9bd95
Showing 4 changed files with 44 additions and 6 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -233,9 +233,15 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
callback=self.parse_headers,
meta={"playwright": True, "playwright_page": page},
)

```

* `playwright_page_goto_kwargs` (type `dict`, default `{}`)

A dictionary with keyword arguments to be passed to the page's
[`goto` method](https://playwright.dev/python/docs/api/class-page#page-goto)
when navigating to a URL. If present, the `url` key is ignored; the request's
URL is used instead.

* `playwright_security_details` (type `Optional[dict]`, read only)

A dictionary with [security information](https://playwright.dev/python/docs/api/class-response#response-security-details)
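For illustration, a minimal spider sketch of how the new `playwright_page_goto_kwargs` meta key could be used. The target URL, referer value, and spider name are hypothetical; `wait_until` and `referer` are standard keyword arguments of `page.goto` in Playwright for Python:

```python
import scrapy


class GotoKwargsSpider(scrapy.Spider):
    """Minimal sketch: forward extra keyword arguments to Playwright's page.goto()."""

    name = "goto_kwargs"  # hypothetical spider name

    def start_requests(self):
        yield scrapy.Request(
            url="https://example.org",  # hypothetical target
            meta={
                "playwright": True,
                # Passed through to page.goto(); a "url" key, if present,
                # is dropped in favor of the request's URL.
                "playwright_page_goto_kwargs": {
                    "wait_until": "networkidle",
                    "referer": "https://example.org/previous-page",
                },
            },
            callback=self.parse,
        )

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```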
4 changes: 3 additions & 1 deletion scrapy_playwright/handler.py
@@ -288,7 +288,9 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
request.meta["playwright_page"] = page

start_time = time()
response = await page.goto(request.url)
page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
page_goto_kwargs.pop("url", None)
response = await page.goto(url=request.url, **page_goto_kwargs)
await self._apply_page_methods(page, request)
body_str = await page.content()
request.meta["download_latency"] = time() - start_time
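The `pop("url", None)` in the handler change above is needed because `page.goto` is already given `url=request.url`; supplying the same keyword twice would raise a `TypeError`. A standalone sketch of that general Python behavior (a plain stand-in function, not the Playwright API):

```python
def goto(url, **kwargs):
    """Stand-in for page.goto(url, **kwargs); not the real Playwright call."""
    return url, kwargs


kwargs = {"url": "https://other.example", "referer": "https://example.org"}

# Without the pop, the same keyword would be supplied twice:
#   goto(url="https://real.example", **kwargs)
#   TypeError: goto() got multiple values for keyword argument 'url'

kwargs.pop("url", None)  # mirrors the handler's safeguard
print(goto(url="https://real.example", **kwargs))
# ('https://real.example', {'referer': 'https://example.org'})
```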
13 changes: 12 additions & 1 deletion tests/mockserver.py
@@ -6,10 +6,19 @@
from pathlib import Path
from subprocess import Popen, PIPE
from threading import Thread
from typing import Optional
from urllib.parse import urljoin


class StaticMockServer:
"""A web server that serves the contents of the sibling "site" directory.
To be used as a context manager:
with StaticMockServer() as server:
url = server.urljoin("/index.html")
...
"""

def __enter__(self):
self.proc = Popen(
[sys.executable, "-u", "-m", "http.server", "0", "--bind", "127.0.0.1"],
@@ -58,6 +67,8 @@ def do_GET(self):


class MockServer:
"""A context manager web server using the _RequestHandler class to handle requests."""

def __enter__(self):
self.httpd = HTTPServer(("127.0.0.1", 0), _RequestHandler)
self.address, self.port = self.httpd.server_address
@@ -69,7 +80,7 @@ def __exit__(self, exc_type, exc_value, traceback):
self.httpd.shutdown()
self.thread.join()

def urljoin(self, url: str) -> str:
def urljoin(self, url: Optional[str] = None) -> str:
return urljoin(f"http://{self.address}:{self.port}", url)


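For context, a rough sketch of how these helpers are typically driven from a test, assuming the `tests.mockserver` module layout above. Note that `MockServer.urljoin()` can now be called with no argument and returns the server root, since `urllib.parse.urljoin(base, None)` returns the base:

```python
from scrapy import Request

from tests.mockserver import MockServer, StaticMockServer

# Both servers are context managers: they start on __enter__ and shut down
# on __exit__, so a test only has to build URLs and issue requests.
with StaticMockServer() as static_server:
    index_url = static_server.urljoin("/index.html")

with MockServer() as server:
    root_url = server.urljoin()  # no argument -> server root
    headers_url = server.urljoin("/headers")
    req = Request(url=headers_url, meta={"playwright": True})
```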
25 changes: 22 additions & 3 deletions tests/test_playwright_requests.py
@@ -1,3 +1,4 @@
import json
import logging
import platform
import subprocess
@@ -321,15 +322,33 @@ async def test_event_handler_dialog_missing(self, caplog):
async def test_response_attributes(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
spider = DialogSpider()
req = Request(
url=server.urljoin("/index.html"),
url=server.urljoin(),
meta={"playwright": True},
)
response = await handler._download_request(req, spider)
response = await handler._download_request(req, Spider("spider_name"))

assert response.ip_address == ip_address(server.address)

@pytest.mark.asyncio
async def test_page_goto_kwargs_referer(self):
if self.browser_type != "chromium":
pytest.skip("referer as goto kwarg seems to work only with chromium :shrug:")
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
with MockServer() as server:
fake_referer = server.urljoin("/fake/referer")
req = Request(
url=server.urljoin("/headers"),
meta={
"playwright": True,
"playwright_page_goto_kwargs": {"referer": fake_referer},
},
)
response = await handler._download_request(req, Spider("spider_name"))

headers = json.loads(response.css("pre::text").get())
assert headers["Referer"] == fake_referer

@pytest.mark.asyncio
async def test_abort_requests(self):
async def should_abort_request_async(request):
