Skip to content

Commit

Permalink
Connect to browser using CDP (scrapy-plugins#227)
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta authored Sep 4, 2023
1 parent f1004bd commit 10d330e
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 12 deletions.
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,37 @@ PLAYWRIGHT_LAUNCH_OPTIONS = {
}
```

### `PLAYWRIGHT_CDP_URL`
Type Optional[`str`], default `None`

The endpoint of a remote Chromium browser to connect to using the
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).
If this setting is used:
* all non-persistent contexts will be created on the connected remote browser
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value other than "chromium"

```python
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
```

### `PLAYWRIGHT_CDP_KWARGS`
Type `dict[str, Any]`, default `{}`

Additional keyword arguments to be passed to
[`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp)
when using `PLAYWRIGHT_CDP_URL`. The `endpoint_url` key is always ignored;
`PLAYWRIGHT_CDP_URL` is used instead.

```python
PLAYWRIGHT_CDP_KWARGS = {
"slow_mo": 1000,
"timeout": 10 * 1000
}
```


### `PLAYWRIGHT_CONTEXTS`
Type `dict[str, dict]`, default `{}`

Expand Down
61 changes: 49 additions & 12 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Awaitable, Callable, Dict, Optional, Type, TypeVar, Union

from playwright.async_api import (
Browser,
BrowserContext,
BrowserType,
Error as PlaywrightError,
Expand Down Expand Up @@ -69,9 +68,14 @@ def __init__(self, crawler: Crawler) -> None:
self.stats = crawler.stats

# browser
self.browser_cdp_url = settings.get("PLAYWRIGHT_CDP_URL")
self.browser_cdp_kwargs = settings.get("PLAYWRIGHT_CDP_KWARGS") or {}
self.browser_cdp_kwargs.pop("endpoint_url", None)
self.browser_type_name = settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE
self.browser_launch_lock = asyncio.Lock()
self.launch_options: dict = settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}
if self.browser_cdp_url and self.launch_options:
logger.warning("PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")

# contexts
self.max_pages_per_context: int = settings.getint(
Expand Down Expand Up @@ -138,36 +142,63 @@ async def _maybe_launch_browser(self) -> None:
async with self.browser_launch_lock:
if not hasattr(self, "browser"):
logger.info("Launching browser %s", self.browser_type.name)
self.browser: Browser = await self.browser_type.launch(**self.launch_options)
self.browser = await self.browser_type.launch(**self.launch_options)
logger.info("Browser %s launched", self.browser_type.name)

async def _maybe_connect_devtools(self) -> None:
    """Connect to the browser at ``self.browser_cdp_url`` over CDP, once.

    The connection is made while holding ``self.browser_launch_lock``; if
    ``self.browser`` is already set when the lock is acquired, nothing
    is done.
    """
    async with self.browser_launch_lock:
        if hasattr(self, "browser"):
            # A browser was already attached to this handler.
            return
        cdp_url = self.browser_cdp_url
        logger.info("Connecting using CDP: %s", cdp_url)
        self.browser = await self.browser_type.connect_over_cdp(
            cdp_url, **self.browser_cdp_kwargs
        )
        logger.info("Connected using CDP: %s", cdp_url)

async def _create_browser_context(
self,
name: str,
context_kwargs: Optional[dict],
spider: Optional[Spider] = None,
) -> BrowserContextWrapper:
"""Create a new context, also launching a browser if necessary."""
"""Create a new context, also launching a local browser or connecting
to a remote one if necessary.
"""
if hasattr(self, "context_semaphore"):
await self.context_semaphore.acquire()
context_kwargs = context_kwargs or {}
if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
context = await self.browser_type.launch_persistent_context(**context_kwargs)
persistent = True
self.stats.inc_value("playwright/context_count/persistent")
remote = False
elif self.browser_cdp_url:
await self._maybe_connect_devtools()
context = await self.browser.new_context(**context_kwargs)
persistent = False
remote = True
else:
await self._maybe_launch_browser()
context = await self.browser.new_context(**context_kwargs)
persistent = False
self.stats.inc_value("playwright/context_count/non_persistent")
context.on("close", self._make_close_browser_context_callback(name, persistent, spider))
remote = False

context.on(
"close", self._make_close_browser_context_callback(name, persistent, remote, spider)
)
self.stats.inc_value("playwright/context_count")
self.stats.inc_value(f"playwright/context_count/persistent/{persistent}")
self.stats.inc_value(f"playwright/context_count/remote/{remote}")
logger.debug(
"Browser context started: '%s' (persistent=%s)",
"Browser context started: '%s' (persistent=%s, remote=%s)",
name,
persistent,
extra={"spider": spider, "context_name": name, "persistent": persistent},
remote,
extra={
"spider": spider,
"context_name": name,
"persistent": persistent,
"remote": remote,
},
)
self.stats.inc_value("playwright/context_count")
if self.default_navigation_timeout is not None:
context.set_default_navigation_timeout(self.default_navigation_timeout)
self.context_wrappers[name] = BrowserContextWrapper(
Expand Down Expand Up @@ -436,17 +467,23 @@ def close_page_callback() -> None:
return close_page_callback

def _make_close_browser_context_callback(
self, name: str, persistent: bool, spider: Optional[Spider] = None
self, name: str, persistent: bool, remote: bool, spider: Optional[Spider] = None
) -> Callable:
def close_browser_context_callback() -> None:
self.context_wrappers.pop(name, None)
if hasattr(self, "context_semaphore"):
self.context_semaphore.release()
logger.debug(
"Browser context closed: '%s' (persistent=%s)",
"Browser context closed: '%s' (persistent=%s, remote=%s)",
name,
persistent,
extra={"spider": spider, "context_name": name},
remote,
extra={
"spider": spider,
"context_name": name,
"persistent": persistent,
"remote": remote,
},
)

return close_browser_context_callback
Expand Down
79 changes: 79 additions & 0 deletions tests/tests_asyncio/test_remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import re
import subprocess
import time
from contextlib import asynccontextmanager
from typing import Tuple
from unittest import IsolatedAsyncioTestCase

import pytest
from playwright.async_api import async_playwright
from scrapy import Request, Spider

from tests import make_handler, assert_correct_response
from tests.mockserver import StaticMockServer


async def _run_chromium() -> Tuple[subprocess.Popen, str]:
    """Run a Chromium instance in a separate process, return the process
    object and a string with its devtools endpoint.

    The endpoint is parsed from the "DevTools listening on ..." line that
    the browser writes to stderr.

    Raises:
        RuntimeError: if the browser's stderr reaches end-of-file before the
            devtools endpoint is announced (e.g. the process crashed).
    """
    async with async_playwright() as playwright:
        proc = subprocess.Popen(  # pylint: disable=consider-using-with
            [playwright.chromium.executable_path, "--headless", "--remote-debugging-port=0"],
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    try:
        devtools_url = None
        while devtools_url is None:
            raw_line = proc.stderr.readline()  # type: ignore
            if raw_line == "":
                # End-of-file: nothing else will ever be written, so looping
                # again would spin forever. Fail loudly instead.
                raise RuntimeError(
                    "Chromium closed its output before reporting a devtools endpoint "
                    f"(return code: {proc.poll()})"
                )
            line = raw_line.strip()
            if not line:
                # Blank line; readline() blocks until more output arrives.
                continue
            print("browser output:", line)
            if match := re.match(r"^DevTools listening on (.+)$", line):
                devtools_url = match.group(1)
                print("devtools_url:", devtools_url)
    except BaseException:
        # Do not leak the child process if startup fails or is interrupted.
        proc.kill()
        proc.communicate()
        raise
    return proc, devtools_url


@asynccontextmanager
async def remote_chromium():
    """Launch a Chromium instance with remote debugging enabled.

    Yields the devtools endpoint URL of the running browser and kills the
    browser process when the context exits, whether or not the body raised.

    A failure to start the browser propagates to the caller unchanged.
    Swallowing it would leave the generator without a ``yield`` and surface
    as an unrelated "generator didn't yield" error.
    """
    proc, devtools_url = await _run_chromium()
    try:
        yield devtools_url
    finally:
        proc.kill()
        # Reap the process and drain its pipes.
        proc.communicate()


class TestRemoteDevtools(IsolatedAsyncioTestCase):
    """Download a page through a Chromium instance reached via its devtools endpoint."""

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, caplog):
        """Capture log records at DEBUG level and keep the fixture on the instance."""
        caplog.set_level(logging.DEBUG)
        self._caplog = caplog

    async def test_devtools(self):
        """Fetch a static page over CDP and check the launch-options warning is logged."""
        expected_record = (
            "scrapy-playwright",
            logging.WARNING,
            "PLAYWRIGHT_CDP_URL is set, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
        )
        async with remote_chromium() as devtools_url:
            # Launch options are set on purpose so that the handler warns
            # about them being ignored.
            settings = {
                "PLAYWRIGHT_CDP_URL": devtools_url,
                "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
            }
            async with make_handler(settings) as handler:
                with StaticMockServer() as server:
                    request = Request(server.urljoin("/index.html"), meta={"playwright": True})
                    response = await handler._download_request(request, Spider("foo"))
                assert_correct_response(response, request)
                assert expected_record in self._caplog.record_tuples

0 comments on commit 10d330e

Please sign in to comment.