feat: update chromium #787


Merged: 1 commit, Nov 6, 2024
23 changes: 14 additions & 9 deletions scrapegraphai/docloaders/chromium.py
@@ -1,6 +1,3 @@
-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader
@@ -12,15 +9,16 @@
 logger = get_logger("web-loader")
 
 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.
 
     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """
 
     RETRY_LIMIT = 3
Expand All @@ -34,15 +32,17 @@ def __init__(
headless: bool = True,
proxy: Optional[Proxy] = None,
load_state: str = "domcontentloaded",
requires_js_support: bool = False,
**kwargs: Any,
):
"""Initialize the loader with a list of URL paths.

Args:
backend: The web driver backend library; defaults to 'playwright'.
headless: whether to run browser in headless mode.
headless: Whether to run browser in headless mode.
proxy: A dictionary containing proxy information; None disables protection.
urls: A list of URLs to scrape content from.
requires_js_support: Whether to use JS rendering for scraping.
kwargs: A dictionary containing additional browser kwargs.

Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
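Because `lazy_load` drives the selected coroutine through `asyncio.run`, consumers can iterate it like any synchronous generator. A sketch, reusing the `loader` from the example above and assuming the source URL is stored in the Document's metadata:

```python
# Each iteration scrapes one URL and yields a langchain Document.
for doc in loader.lazy_load():
    print(doc.metadata.get("source"), len(doc.page_content))
```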
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )
 
         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
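The async variant fans out one scraping task per URL and awaits them together with `asyncio.gather`, so the JS-rendering path gets the same concurrency as the default backends. A sketch, again assuming the `loader` defined earlier:

```python
import asyncio

async def main():
    # URLs are scraped concurrently inside alazy_load.
    async for doc in loader.alazy_load():
        print(doc.metadata.get("source"))

asyncio.run(main())
```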