From 72fbdac467b8e0a3aba511e93353cb42d45b1842 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sun, 5 Jan 2025 19:26:46 +0800
Subject: [PATCH] fix(extraction): JsonCss selector and crawler improvements

- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one
- Add robust error handling to page_need_scroll with default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247
---
 crawl4ai/__version__.py                  |  2 +-
 crawl4ai/async_crawler_strategy.py       | 17 +++-
 crawl4ai/content_scraping_strategy.py    | 93 ------------------
 crawl4ai/extraction_strategy.py          |  4 +-
 crawl4ai/utils.py                        | 21 +++++
 .../tutorials/async-webcrawler-basics.md | 21 ++++-
 6 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 3f798c0c..8ec3d053 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.247"
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 0cdaffd5..b879413c 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -2163,7 +2163,7 @@ async def get_page_dimensions(self, page: Page):
         }
         """)

-    async def page_need_scroll(self, page: Page):
+    async def page_need_scroll(self, page: Page) -> bool:
         """
         Determine whether the page need to scroll

@@ -2171,12 +2171,21 @@ async def page_need_scroll(self, page: Page):
             page: Playwright page object

         Returns:
-            page should scroll or not
+            bool: True if page needs scrolling
         """
-        return await page.evaluate("""
+        try:
+            need_scroll = await page.evaluate("""
             () => {
                 const scrollHeight = document.documentElement.scrollHeight;
                 const viewportHeight = window.innerHeight;
                 return scrollHeight > viewportHeight;
             }
-        """)
\ No newline at end of file
+            """)
+            return need_scroll
+        except Exception as e:
+            self.logger.warning(
+                message="Failed to check scroll need: {error}. Defaulting to True for safety.",
+                tag="SCROLL",
+                params={"error": str(e)}
+            )
+            return True  # Default to scrolling if check fails
\ No newline at end of file
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 985ff592..f3a96cf3 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -122,92 +122,6 @@ async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
         """
         return await asyncio.to_thread(self._scrap, url, html, **kwargs)

-    def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
-        """
-        Generate markdown content from cleaned HTML.
-
-        Args:
-            cleaned_html (str): The cleaned HTML content.
-            html (str): The original HTML content.
-            url (str): The URL of the page.
-            success (bool): Whether the content was successfully cleaned.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            Dict[str, Any]: A dictionary containing the generated markdown content.
- """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) - - if markdown_generator: - try: - if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - - markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( - cleaned_html=cleaned_html, - base_url=url, - html2text_options=kwargs.get('html2text', {}) - ) - - return { - 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown, - 'fit_html': markdown_result.fit_html, - 'markdown_v2': markdown_result - } - except Exception as e: - self._log('error', - message="Error using new markdown generation strategy: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown_generator = None - return { - 'markdown': f"Error using new markdown generation strategy: {str(e)}", - 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': None - } - - # Legacy method - """ - # h = CustomHTML2Text() - # h.update_params(**kwargs.get('html2text', {})) - # markdown = h.handle(cleaned_html) - # markdown = markdown.replace(' ```', '```') - - # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." - - # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - # content_filter = kwargs.get('content_filter', None) - # if not content_filter: - # content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) - # fit_html = content_filter.filter_content(html) - # fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) - # fit_markdown = h.handle(fit_html) - - # markdown_v2 = MarkdownGenerationResult( - # raw_markdown=markdown, - # markdown_with_citations=markdown, - # references_markdown=markdown, - # fit_markdown=fit_markdown - # ) - - # return { - # 'markdown': markdown, - # 'fit_markdown': fit_markdown, - # 'fit_html': fit_html, - # 'markdown_v2' : markdown_v2 - # } - """ - def flatten_nested_elements(self, node): """ Flatten nested elements in a HTML tree. @@ -798,13 +712,6 @@ def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRES cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - # markdown_content = self._generate_markdown_content( - # cleaned_html=cleaned_html, - # html=html, - # url=url, - # success=success, - # **kwargs - # ) return { # **markdown_content, diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 7441e32d..1e9d9c79 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -974,8 +974,7 @@ def _get_base_elements(self, parsed_html, selector: str): return parsed_html.select(selector) def _get_elements(self, element, selector: str): - selected = element.select_one(selector) - return [selected] if selected else [] + return element.select(selector) def _get_element_text(self, element) -> str: return element.get_text(strip=True) @@ -1050,3 +1049,4 @@ def _get_element_html(self, element) -> str: def _get_element_attribute(self, element, attribute: str): return element.get(attribute) + diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 214ebbc6..6fd7429f 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -21,6 +21,8 @@ import cProfile import pstats from functools import wraps +import asyncio + class InvalidCSSSelectorError(Exception): pass @@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: return content_paths +def configure_windows_event_loop(): + """ + Configure the Windows event loop to use ProactorEventLoop. + This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses. + + This function should only be called on Windows systems and before any async operations. + On non-Windows systems, this function does nothing. + + Example: + ```python + from crawl4ai.async_configs import configure_windows_event_loop + + # Call this before any async operations if you're on Windows + configure_windows_event_loop() + ``` + """ + if platform.system() == 'Windows': + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + def get_error_context(exc_info, context_lines: int = 5): """ Extract error context with more reliable line number tracking. diff --git a/docs/md_v3/tutorials/async-webcrawler-basics.md b/docs/md_v3/tutorials/async-webcrawler-basics.md index 46256eaa..6236d899 100644 --- a/docs/md_v3/tutorials/async-webcrawler-basics.md +++ b/docs/md_v3/tutorials/async-webcrawler-basics.md @@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might twea --- -## 5. Putting It All Together +## 5. Windows-Specific Configuration + +When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations. + +To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. 
+
+```python
+from crawl4ai.utils import configure_windows_event_loop
+
+# Call this before any async operations if you're on Windows
+configure_windows_event_loop()
+
+# Your AsyncWebCrawler code here
+```
+
+---
+
+## 6. Putting It All Together

 Here’s a slightly more in-depth example that shows off a few key config parameters at once:
@@ -193,7 +210,7 @@ if __name__ == "__main__":

 ---

-## 6. Next Steps
+## 7. Next Steps

 - **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md).
 - **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md).
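---

For reviewers, a quick before/after sketch of the `_get_elements` fix. It exercises BeautifulSoup's `select_one` vs. `select` directly rather than going through `JsonCssExtractionStrategy` (whose `_get_elements` delegates to these calls), and the sample HTML is invented for illustration:

```python
from bs4 import BeautifulSoup

html = """
<div class="product">
  <span class="tag">new</span>
  <span class="tag">sale</span>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
base = soup.select_one("div.product")

# Old behavior: select_one stops at the first match, so list-valued
# schema fields silently collapsed to a single element.
old_style = [base.select_one("span.tag")]  # 1 element

# New behavior: select returns every matching element.
new_style = base.select("span.tag")        # 2 elements

print(len(old_style), len(new_style))  # -> 1 2
```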
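And a minimal end-to-end sketch of the Windows setup added in the docs hunk above, assuming the `AsyncWebCrawler`/`arun` API shown elsewhere in the tutorial; the URL is a placeholder:

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

# Must run before any event loop is created; a no-op on non-Windows systems.
configure_windows_event_loop()

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:300])  # preview the extracted markdown

if __name__ == "__main__":
    asyncio.run(main())
```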