Skip to content

Commit 38c6dd2

Browse files
committed
feat: update chromium
1 parent 950e859 commit 38c6dd2

File tree

1 file changed

+14
-9
lines changed

1 file changed

+14
-9
lines changed

scrapegraphai/docloaders/chromium.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
"""
2-
chromiumloader module
3-
"""
41
import asyncio
52
from typing import Any, AsyncIterator, Iterator, List, Optional
63
from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
129
logger = get_logger("web-loader")
1310

1411
class ChromiumLoader(BaseLoader):
15-
"""scrapes HTML pages from URLs using a (headless) instance of the
16-
Chromium web driver with proxy protection
12+
"""Scrapes HTML pages from URLs using a (headless) instance of the
13+
Chromium web driver with proxy protection.
1714
1815
Attributes:
1916
backend: The web driver backend library; defaults to 'playwright'.
2017
browser_config: A dictionary containing additional browser kwargs.
21-
headless: whether to run browser in headless mode.
18+
headless: Whether to run browser in headless mode.
2219
proxy: A dictionary containing proxy settings; None disables protection.
2320
urls: A list of URLs to scrape content from.
21+
requires_js_support: Flag to determine if JS rendering is required.
2422
"""
2523

2624
RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
3432
headless: bool = True,
3533
proxy: Optional[Proxy] = None,
3634
load_state: str = "domcontentloaded",
35+
requires_js_support: bool = False,
3736
**kwargs: Any,
3837
):
3938
"""Initialize the loader with a list of URL paths.
4039
4140
Args:
4241
backend: The web driver backend library; defaults to 'playwright'.
43-
headless: whether to run browser in headless mode.
42+
headless: Whether to run browser in headless mode.
4443
proxy: A dictionary containing proxy information; None disables protection.
4544
urls: A list of URLs to scrape content from.
45+
requires_js_support: Whether to use JS rendering for scraping.
4646
kwargs: A dictionary containing additional browser kwargs.
4747
4848
Raises:
@@ -61,6 +61,7 @@ def __init__(
6161
self.proxy = parse_or_search_proxy(proxy) if proxy else None
6262
self.urls = urls
6363
self.load_state = load_state
64+
self.requires_js_support = requires_js_support
6465

6566
async def ascrape_undetected_chromedriver(self, url: str) -> str:
6667
"""
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
186187
Yields:
187188
Document: The scraped content encapsulated within a Document object.
188189
"""
189-
scraping_fn = getattr(self, f"ascrape_{self.backend}")
190+
scraping_fn = (
191+
self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
192+
)
190193

191194
for url in self.urls:
192195
html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
206209
Document: A Document object containing the scraped content, along with its
207210
source URL as metadata.
208211
"""
209-
scraping_fn = getattr(self, f"ascrape_{self.backend}")
212+
scraping_fn = (
213+
self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
214+
)
210215

211216
tasks = [scraping_fn(url) for url in self.urls]
212217
results = await asyncio.gather(*tasks)

0 commit comments

Comments (0)