Skip to content

Commit

Permalink
Rename PageCoroutine -> PageMethod (scrapy-plugins#70)
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta authored Mar 26, 2022
1 parent 9ee07aa commit a06b3b1
Show file tree
Hide file tree
Showing 14 changed files with 314 additions and 198 deletions.
104 changes: 72 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,54 +326,75 @@ async def parse_in_new_context(self, response):
```


## Page coroutines
## Executing actions on pages

A sorted iterable (for instance a `list`, `tuple`, or `dict`) can be passed
in the `playwright_page_coroutines`
in the `playwright_page_methods`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta)
key to request methods to be called (and awaited if necessary) on the `Page` before returning the final
`Response` to the callback.

This is useful when you need to perform certain actions on a page, like scrolling
down or clicking links, and you want everything to count as a single Scrapy
Response, containing the final result.
down or clicking links, and you want to handle only the final result in your callback.

### `PageCoroutine` class
### `PageMethod` class

* `scrapy_playwright.page.PageCoroutine(method: str, *args, **kwargs)`:
#### `scrapy_playwright.page.PageMethod(method: str, *args, **kwargs)`:

Represents a coroutine to be awaited on a `playwright.page.Page` object,
such as "click", "screenshot", "evaluate", etc.
`method` should be the name of the coroutine, `*args` and `**kwargs`
are passed to the function call. The return value of the coroutine call
will be stored in the `PageCoroutine.result` attribute.
Represents a method to be called (and awaited if necessary) on a
`playwright.page.Page` object, such as "click", "screenshot", "evaluate", etc.
`method` is the name of the method, `*args` and `**kwargs`
are passed when calling such method. The return value
will be stored in the `PageMethod.result` attribute.

For instance,
```python
PageCoroutine("screenshot", path="quotes.png", fullPage=True)
```
For instance,
```python
def start_requests(self):
yield Request(
url="https://example.org",
meta={
"playwright": True,
"playwright_page_methods": [
PageMethod("screenshot", path="example.png", fullPage=True),
],
},
)

produces the same effect as:
```python
# 'page' is a playwright.async_api.Page object
await page.screenshot(path="quotes.png", fullPage=True)
```
def parse(self, response):
screenshot = response.meta["playwright_page_methods"][0]
# screenshot.result contains the image's bytes
```

produces the same effect as:
```python
def start_requests(self):
yield Request(
url="https://example.org",
meta={"playwright": True, "playwright_include_page": True},
)

### Supported coroutines
async def parse(self, response):
page = response.meta["playwright_page"]
await page.screenshot(path="example.png", full_page=True)
await page.close()
```


### Supported methods

Please refer to the [upstream docs for the `Page` class](https://playwright.dev/python/docs/api/class-page)
to see available coroutines
to see available methods.

### Impact on Response objects

Certain `Response` attributes (e.g. `url`, `ip_address`) reflect the state after the last
action performed on a page. If you issue a `PageCoroutine` with an action that results in
action performed on a page. If you issue a `PageMethod` with an action that results in
a navigation (e.g. a `click` on a link), the `Response.url` attribute will point to the
new URL, which might be different from the request's URL.


## Page events

A dictionary of Page event handlers can be specified in the `playwright_page_event_handlers`
[Request.meta](https://docs.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.meta) key.
Keys are the name of the event to be handled (`dialog`, `download`, etc).
Expand Down Expand Up @@ -430,15 +451,15 @@ class ClickAndSavePdfSpider(scrapy.Spider):
url="https://example.org",
meta=dict(
playwright=True,
playwright_page_coroutines={
"click": PageCoroutine("click", selector="a"),
"pdf": PageCoroutine("pdf", path="/tmp/file.pdf"),
playwright_page_methods={
"click": PageMethod("click", selector="a"),
"pdf": PageMethod("pdf", path="/tmp/file.pdf"),
},
),
)

def parse(self, response):
pdf_bytes = response.meta["playwright_page_coroutines"]["pdf"].result
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
with open("iana.pdf", "wb") as fp:
fp.write(pdf_bytes)
yield {"url": response.url} # response.url is "https://www.iana.org/domains/reserved"
Expand All @@ -456,10 +477,10 @@ class ScrollSpider(scrapy.Spider):
meta=dict(
playwright=True,
playwright_include_page=True,
playwright_page_coroutines=[
PageCoroutine("wait_for_selector", "div.quote"),
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
playwright_page_methods=[
PageMethod("wait_for_selector", "div.quote"),
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
],
),
)
Expand Down Expand Up @@ -487,7 +508,14 @@ For more examples, please see the scripts in the [examples](examples) directory.
Refer to the [Proxy support](#proxy-support) section for more information.


##  Deprecations
##  Deprecation policy

Deprecated features will be supported for at least six months
following the release that deprecated them. After that, they
may be removed at any time. See the [changelog](changelog.md)
for more information about deprecations and removals.

### Currently deprecated features

* `PLAYWRIGHT_CONTEXT_ARGS` setting (type `dict`, default `{}`)

Expand All @@ -497,3 +525,15 @@ For more examples, please see the scripts in the [examples](examples) directory.
Deprecated since
[`v0.0.4`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4),
use the `PLAYWRIGHT_CONTEXTS` setting instead

* `scrapy_playwright.page.PageCoroutine` class

Deprecated since
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
use `scrapy_playwright.page.PageMethod` instead

* `playwright_page_coroutines` Request meta key

Deprecated since
[`v0.0.14`](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14),
use `playwright_page_methods` instead
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# scrapy-playwright changelog

### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26)

* Renamed `PageCoroutine` to `PageMethod` (`PageCoroutine` is now deprecated)


### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24)

* PageCoroutine checks
Expand Down
6 changes: 3 additions & 3 deletions examples/cookies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class CookieSpider(Spider):
Expand All @@ -18,8 +18,8 @@ def start_requests(self):
cookies={"foo": "bar"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "cookies.png", full_page=True
),
],
Expand Down
6 changes: 3 additions & 3 deletions examples/events.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from playwright.async_api import Dialog, Response as PlaywrightResponse
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class EventsSpider(Spider):
Expand All @@ -16,8 +16,8 @@ def start_requests(self):
url="https://example.org",
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine("evaluate", "alert('foobar');"),
"playwright_page_methods": [
PageMethod("evaluate", "alert('foobar');"),
],
"playwright_page_event_handlers": {
"dialog": self.handle_dialog,
Expand Down
6 changes: 3 additions & 3 deletions examples/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class HandleTimeoutMiddleware:
Expand All @@ -13,8 +13,8 @@ def process_exception(self, request, exception, spider):
url="https://httpbin.org/get",
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "recovered.png", full_page=True
),
],
Expand Down
6 changes: 3 additions & 3 deletions examples/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class PostSpider(Spider):
Expand All @@ -18,8 +18,8 @@ def start_requests(self):
formdata={"foo": "bar"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine(
"playwright_page_methods": [
PageMethod(
"screenshot", path=Path(__file__).parent / "post.png", full_page=True
),
],
Expand Down
13 changes: 7 additions & 6 deletions examples/scroll.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class ScrollSpider(Spider):
Expand All @@ -18,11 +18,11 @@ def start_requests(self):
cookies={"foo": "bar", "asdf": "qwerty"},
meta={
"playwright": True,
"playwright_page_coroutines": [
PageCoroutine("wait_for_selector", "div.quote"),
PageCoroutine("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageCoroutine("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
PageCoroutine(
"playwright_page_methods": [
PageMethod("wait_for_selector", "div.quote"),
PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page
PageMethod(
"screenshot", path=Path(__file__).parent / "scroll.png", full_page=True
),
],
Expand All @@ -41,6 +41,7 @@ def parse(self, response):
# "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"LOG_LEVEL": "INFO",
}
)
process.crawl(ScrollSpider)
Expand Down
6 changes: 3 additions & 3 deletions examples/storage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


class StorageSpider(Spider):
Expand All @@ -16,8 +16,8 @@ def start_requests(self):
meta={
"playwright": True,
"playwright_include_page": True,
"playwright_page_coroutines": [
PageCoroutine("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
"playwright_page_methods": [
PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
],
},
)
Expand Down
50 changes: 31 additions & 19 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
Expand All @@ -30,7 +31,7 @@
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding

from scrapy_playwright.headers import use_scrapy_headers
from scrapy_playwright.page import PageCoroutine
from scrapy_playwright.page import PageMethod


__all__ = ["ScrapyPlaywrightDownloadHandler"]
Expand Down Expand Up @@ -96,7 +97,7 @@ def __init__(self, crawler: Crawler) -> None:
"The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
" PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
" PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context",
category=DeprecationWarning,
category=ScrapyDeprecationWarning,
stacklevel=2,
)
self.context_kwargs: defaultdict = defaultdict(dict)
Expand Down Expand Up @@ -250,23 +251,7 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
start_time = time()
response = await page.goto(request.url)

page_coroutines = request.meta.get("playwright_page_coroutines") or ()
if isinstance(page_coroutines, dict):
page_coroutines = page_coroutines.values()
for pc in page_coroutines:
if isinstance(pc, PageCoroutine):
try:
method = getattr(page, pc.method)
except AttributeError:
logger.warning(f"Ignoring {repr(pc)}: could not find coroutine")
else:
result = method(*pc.args, **pc.kwargs)
pc.result = await result if isawaitable(result) else result
await page.wait_for_load_state(timeout=self.default_navigation_timeout)
else:
logger.warning(
f"Ignoring {repr(pc)}: expected PageCoroutine, got {repr(type(pc))}"
)
await self._apply_page_methods(page, request)

body_str = await page.content()
request.meta["download_latency"] = time() - start_time
Expand Down Expand Up @@ -300,6 +285,33 @@ async def _download_request_with_page(self, request: Request, page: Page) -> Res
ip_address=server_ip_address,
)

async def _apply_page_methods(self, page: Page, request: Request) -> None:
    """Invoke each requested method on the Playwright page.

    Reads the "playwright_page_methods" request meta key (falling back to
    the deprecated "playwright_page_coroutines" key) and calls every
    PageMethod entry on the given page, awaiting the call when it returns
    an awaitable. Each return value is stored in the entry's ``result``
    attribute; non-PageMethod entries and unknown method names are logged
    and skipped.
    """
    page_methods = request.meta.get("playwright_page_methods") or ()

    # Backward compatibility: honor the old meta key, but emit a deprecation warning.
    if not page_methods and "playwright_page_coroutines" in request.meta:
        page_methods = request.meta["playwright_page_coroutines"]
        warnings.warn(
            "The 'playwright_page_coroutines' request meta key is deprecated,"
            " please use 'playwright_page_methods' instead.",
            category=ScrapyDeprecationWarning,
            stacklevel=1,
        )

    # A dict maps arbitrary user-chosen labels to PageMethod objects;
    # only the values are executed here.
    if isinstance(page_methods, dict):
        page_methods = page_methods.values()

    for entry in page_methods:
        if not isinstance(entry, PageMethod):
            logger.warning(f"Ignoring {entry!r}: expected PageMethod, got {type(entry)!r}")
            continue
        try:
            bound_method = getattr(page, entry.method)
        except AttributeError:
            logger.warning(f"Ignoring {entry!r}: could not find method")
            continue
        outcome = bound_method(*entry.args, **entry.kwargs)
        entry.result = await outcome if isawaitable(outcome) else outcome
        # The action may have triggered a navigation (e.g. a click on a
        # link); wait for the page to settle before the next method.
        await page.wait_for_load_state(timeout=self.default_navigation_timeout)

def _increment_request_stats(self, request: PlaywrightRequest) -> None:
stats_prefix = "playwright/request_count"
self.stats.inc_value(stats_prefix)
Expand Down
Loading

0 comments on commit a06b3b1

Please sign in to comment.