Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit concurrent context count (PLAYWRIGHT_MAX_CONTEXTS setting) #95

Merged
merged 17 commits into from
May 22, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Launch persistent context
  • Loading branch information
elacuesta committed May 14, 2022
commit 9c1a773893c4d258a154196bc1fea124d94ea36d
35 changes: 35 additions & 0 deletions examples/persistent_context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from pathlib import Path

from scrapy import Spider, Request


class PersistentContextSpider(Spider):
    """Use a persistent browser context.

    Demonstrates the ``PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS`` setting: when
    present, scrapy-playwright launches a single persistent context (backed
    by ``user_data_dir``) instead of creating browser contexts per request.
    """

    name = "persistent_context"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # Keyword arguments passed to BrowserType.launch_persistent_context.
        # (A stray empty-string literal before this key was removed: it was
        # being silently merged into the key via implicit string concatenation.)
        "PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS": {
            "user_data_dir": str(Path.home() / "playwright-persistent-context"),
            "java_script_enabled": False,
            "extra_http_headers": {"Asdf": "Qwerty"},
            "user_agent": "foobar",
        },
        # Disable Scrapy-side header processing so the context-level headers
        # configured above are sent unmodified.
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
    }

    def start_requests(self):
        # Single request routed through Playwright (meta flag).
        yield Request(url="https://httpbin.org/get", meta={"playwright": True})

    def parse(self, response):
        # httpbin echoes the request inside a <pre> tag; print it for inspection.
        content = response.css("pre::text").get()
        print(content)
        return {
            "url": response.url,
            # Name of the (persistent) context the request was served from.
            "context": response.meta["playwright_context"],
        }
63 changes: 48 additions & 15 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
logger = logging.getLogger("scrapy-playwright")


DEFAULT_CONTEXT_NAME = "default"
PERSISTENT_CONTEXT_NAME = "persistent"


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
Expand Down Expand Up @@ -80,9 +84,22 @@ def __init__(self, crawler: Crawler) -> None:
else:
self.process_request_headers = use_scrapy_headers

self.context_kwargs: dict = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")
# if PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS is present we only launch one context
self.contexts: Dict[str, BrowserContext] = {}
self.persistent_context: bool = False
self.context_semaphores: Dict[str, asyncio.Semaphore] = {}
self.context_kwargs: dict = {}
if crawler.settings.get("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS"):
self.persistent_context = True
ctx_kwargs = crawler.settings.getdict("PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS")
self.context_kwargs[PERSISTENT_CONTEXT_NAME] = ctx_kwargs
if crawler.settings.getdict("PLAYWRIGHT_CONTEXTS"):
logger.info(
"Both PLAYWRIGHT_PERSISTENT_CONTEXT_KWARGS and PLAYWRIGHT_CONTEXTS"
" are set, ignoring PLAYWRIGHT_CONTEXTS"
)
else:
self.context_kwargs = crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")

self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
Expand All @@ -99,20 +116,32 @@ def _engine_started(self) -> Deferred:
async def _launch_browser(self) -> None:
self.playwright_context_manager = PlaywrightContextManager()
self.playwright = await self.playwright_context_manager.start()
logger.info("Launching browser")
browser_launcher = getattr(self.playwright, self.browser_type).launch
self.browser = await browser_launcher(**self.launch_options)
logger.info(f"Browser {self.browser_type} launched")
contexts = await asyncio.gather(
*[
self._create_browser_context(name, kwargs)
for name, kwargs in self.context_kwargs.items()
]
browser_type = getattr(self.playwright, self.browser_type)
if self.persistent_context:
logger.info("Launching single persistent context")
self.contexts[PERSISTENT_CONTEXT_NAME] = await browser_type.launch_persistent_context(
**self.context_kwargs[PERSISTENT_CONTEXT_NAME]
)
if self.default_navigation_timeout is not None:
self.contexts[PERSISTENT_CONTEXT_NAME].set_default_navigation_timeout(
self.default_navigation_timeout
)
logger.info("Persistent context launched")
else:
logger.info(f"Launching browser {self.browser_type}")
self.browser = await browser_type.launch(**self.launch_options)
logger.info("Launching startup context(s)")
contexts = await asyncio.gather(
*[
self._create_browser_context(name, kwargs)
for name, kwargs in self.context_kwargs.items()
]
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
self.context_semaphores.update(
{name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts}
)
self.contexts = dict(zip(self.context_kwargs.keys(), contexts))
self.context_semaphores = {
name: asyncio.Semaphore(value=self.max_pages_per_context) for name in self.contexts
}
logger.info(f"Browser {self.browser_type} launched")

async def _create_browser_context(self, name: str, context_kwargs: dict) -> BrowserContext:
context = await self.browser.new_context(**context_kwargs)
Expand All @@ -125,7 +154,11 @@ async def _create_browser_context(self, name: str, context_kwargs: dict) -> Brow

async def _create_page(self, request: Request) -> Page:
"""Create a new page in a context, also creating a new context if necessary."""
context_name = request.meta.setdefault("playwright_context", "default")
if self.persistent_context:
context_name = request.meta["playwright_context"] = PERSISTENT_CONTEXT_NAME
else:
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)

context = self.contexts.get(context_name)
if context is None:
context_kwargs = request.meta.get("playwright_context_kwargs") or {}
Expand Down