-"""
-chromiumloader module
-"""
 import asyncio
 from typing import Any, AsyncIterator, Iterator, List, Optional
 from langchain_community.document_loaders.base import BaseLoader

 logger = get_logger("web-loader")


 class ChromiumLoader(BaseLoader):
-    """scrapes HTML pages from URLs using a (headless) instance of the
-    Chromium web driver with proxy protection
+    """Scrapes HTML pages from URLs using a (headless) instance of the
+    Chromium web driver with proxy protection.

     Attributes:
         backend: The web driver backend library; defaults to 'playwright'.
         browser_config: A dictionary containing additional browser kwargs.
-        headless: whether to run browser in headless mode.
+        headless: Whether to run browser in headless mode.
         proxy: A dictionary containing proxy settings; None disables protection.
         urls: A list of URLs to scrape content from.
+        requires_js_support: Flag to determine if JS rendering is required.
     """

     RETRY_LIMIT = 3
@@ -34,15 +32,17 @@ def __init__(
         headless: bool = True,
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
+        requires_js_support: bool = False,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.

         Args:
             backend: The web driver backend library; defaults to 'playwright'.
-            headless: whether to run browser in headless mode.
+            headless: Whether to run browser in headless mode.
             proxy: A dictionary containing proxy information; None disables protection.
             urls: A list of URLs to scrape content from.
+            requires_js_support: Whether to use JS rendering for scraping.
             kwargs: A dictionary containing additional browser kwargs.

         Raises:
@@ -61,6 +61,7 @@ def __init__(
         self.proxy = parse_or_search_proxy(proxy) if proxy else None
         self.urls = urls
         self.load_state = load_state
+        self.requires_js_support = requires_js_support

     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
@@ -186,7 +187,9 @@ def lazy_load(self) -> Iterator[Document]:
         Yields:
             Document: The scraped content encapsulated within a Document object.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         for url in self.urls:
             html_content = asyncio.run(scraping_fn(url))
@@ -206,7 +209,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             Document: A Document object containing the scraped content, along with its
             source URL as metadata.
         """
-        scraping_fn = getattr(self, f"ascrape_{self.backend}")
+        scraping_fn = (
+            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+        )

         tasks = [scraping_fn(url) for url in self.urls]
         results = await asyncio.gather(*tasks)
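
With this change, setting `requires_js_support=True` makes both `lazy_load` and `alazy_load` dispatch to `ascrape_with_js_support` instead of the backend-specific `ascrape_<backend>` method. A minimal usage sketch, assuming an illustrative import path and placeholder URL (neither is confirmed by this diff):

```python
# Sketch only: the import path below is an assumption about the package
# layout, not taken from this PR.
from scrapegraphai.docloaders.chromium import ChromiumLoader

loader = ChromiumLoader(
    urls=["https://example.com"],  # placeholder URL
    backend="playwright",
    headless=True,
    requires_js_support=True,  # new flag: route scraping through JS rendering
)

# lazy_load() now resolves scraping_fn to ascrape_with_js_support
# instead of ascrape_playwright.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```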
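
The diff calls `ascrape_with_js_support` but its body falls outside the shown hunks. One plausible shape, sketched with Playwright's async API purely for illustration (not the PR's actual implementation):

```python
# Hypothetical sketch of a JS-rendering scraper; the real method in the PR
# may differ (retry handling, proxy wiring, and load_state are omitted here).
from playwright.async_api import async_playwright

async def ascrape_with_js_support(self, url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=self.headless)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")  # let JS finish rendering
        content = await page.content()  # full rendered HTML
        await browser.close()
    return content
```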