fix(extraction): JsonCss selector and crawler improvements
- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one
- Add robust error handling to page_need_scroll with default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247
unclecode committed Jan 5, 2025
1 parent 0857c7b commit 72fbdac
Showing 6 changed files with 56 additions and 102 deletions.
2 changes: 1 addition & 1 deletion crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.247"
17 changes: 13 additions & 4 deletions crawl4ai/async_crawler_strategy.py
@@ -2163,20 +2163,29 @@ async def get_page_dimensions(self, page: Page):
            }
        """)

-    async def page_need_scroll(self, page: Page):
+    async def page_need_scroll(self, page: Page) -> bool:
        """
        Determine whether the page need to scroll
        Args:
            page: Playwright page object
        Returns:
-            page should scroll or not
+            bool: True if page needs scrolling
        """
-        return await page.evaluate("""
+        try:
+            need_scroll = await page.evaluate("""
        () => {
            const scrollHeight = document.documentElement.scrollHeight;
            const viewportHeight = window.innerHeight;
            return scrollHeight > viewportHeight;
        }
-        """)
+            """)
+            return need_scroll
+        except Exception as e:
+            self.logger.warning(
+                message="Failed to check scroll need: {error}. Defaulting to True for safety.",
+                tag="SCROLL",
+                params={"error": str(e)}
+            )
+            return True  # Default to scrolling if check fails
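The hardened check above is what the crawler can consult before running any scroll logic, and the `True` fallback means a failed check never silently skips content below the fold. A rough standalone sketch of that pattern (not crawl4ai's internal implementation; `scroll_to_bottom_if_needed` and the target URL are illustrative):

```python
import asyncio
from playwright.async_api import async_playwright


async def scroll_to_bottom_if_needed(page, step: int = 800, pause: float = 0.2):
    # Same check as page_need_scroll: does the document overflow the viewport?
    try:
        need_scroll = await page.evaluate(
            "() => document.documentElement.scrollHeight > window.innerHeight"
        )
    except Exception:
        need_scroll = True  # mirror the fix: if the check fails, assume we must scroll

    if not need_scroll:
        return

    last_height = 0
    while True:
        height = await page.evaluate("() => document.documentElement.scrollHeight")
        if height == last_height:
            break  # no new content appeared; stop scrolling
        last_height = height
        await page.mouse.wheel(0, step)
        await asyncio.sleep(pause)


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")
        await scroll_to_bottom_if_needed(page)
        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
```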
93 changes: 0 additions & 93 deletions crawl4ai/content_scraping_strategy.py
@@ -122,92 +122,6 @@ async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)

def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
"""
Generate markdown content from cleaned HTML.
Args:
cleaned_html (str): The cleaned HTML content.
html (str): The original HTML content.
url (str): The URL of the page.
success (bool): Whether the content was successfully cleaned.
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the generated markdown content.
"""
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())

if markdown_generator:
try:
if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
markdown_generator.content_filter = BM25ContentFilter(
user_query=kwargs.get('fit_markdown_user_query', None),
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
)

markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
html2text_options=kwargs.get('html2text', {})
)

return {
'markdown': markdown_result.raw_markdown,
'fit_markdown': markdown_result.fit_markdown,
'fit_html': markdown_result.fit_html,
'markdown_v2': markdown_result
}
except Exception as e:
self._log('error',
message="Error using new markdown generation strategy: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
markdown_generator = None
return {
'markdown': f"Error using new markdown generation strategy: {str(e)}",
'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'markdown_v2': None
}

# Legacy method
"""
# h = CustomHTML2Text()
# h.update_params(**kwargs.get('html2text', {}))
# markdown = h.handle(cleaned_html)
# markdown = markdown.replace(' ```', '```')
# fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
# fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
# if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
# content_filter = kwargs.get('content_filter', None)
# if not content_filter:
# content_filter = BM25ContentFilter(
# user_query=kwargs.get('fit_markdown_user_query', None),
# bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
# )
# fit_html = content_filter.filter_content(html)
# fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
# fit_markdown = h.handle(fit_html)
# markdown_v2 = MarkdownGenerationResult(
# raw_markdown=markdown,
# markdown_with_citations=markdown,
# references_markdown=markdown,
# fit_markdown=fit_markdown
# )
# return {
# 'markdown': markdown,
# 'fit_markdown': fit_markdown,
# 'fit_html': fit_html,
# 'markdown_v2' : markdown_v2
# }
"""

def flatten_nested_elements(self, node):
"""
Flatten nested elements in a HTML tree.
@@ -798,13 +712,6 @@ def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRES

cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')

# markdown_content = self._generate_markdown_content(
# cleaned_html=cleaned_html,
# html=html,
# url=url,
# success=success,
# **kwargs
# )

return {
# **markdown_content,
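With the inline helper removed, markdown generation goes through the generator strategy object directly. A minimal sketch of that path, mirroring the calls in the deleted code above (the import paths and sample HTML are assumptions, not taken from this commit):

```python
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter

cleaned_html = "<h1>Example</h1><p>Some already-cleaned page content.</p>"

generator = DefaultMarkdownGenerator()
# Optionally attach a BM25 filter so fit_markdown / fit_html get populated,
# as the removed helper did when fit_markdown was requested.
generator.content_filter = BM25ContentFilter(user_query=None, bm25_threshold=1.0)

result = generator.generate_markdown(
    cleaned_html=cleaned_html,
    base_url="https://example.com",
    html2text_options={},
)
print(result.raw_markdown)
print(result.fit_markdown)
```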
4 changes: 2 additions & 2 deletions crawl4ai/extraction_strategy.py
@@ -974,8 +974,7 @@ def _get_base_elements(self, parsed_html, selector: str):
         return parsed_html.select(selector)

     def _get_elements(self, element, selector: str):
-        selected = element.select_one(selector)
-        return [selected] if selected else []
+        return element.select(selector)

     def _get_element_text(self, element) -> str:
         return element.get_text(strip=True)
@@ -1050,3 +1049,4 @@ def _get_element_html(self, element) -> str:
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
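The practical effect of the `_get_elements` change is easiest to see with plain BeautifulSoup, which is the API these helpers call into. A small standalone illustration (the HTML snippet is made up):

```python
from bs4 import BeautifulSoup

html = """
<div class="product">
  <span class="tag">new</span>
  <span class="tag">sale</span>
  <span class="tag">popular</span>
</div>
"""
element = BeautifulSoup(html, "html.parser").select_one("div.product")

# Old behavior: select_one() keeps only the first match.
selected = element.select_one("span.tag")
old_result = [selected] if selected else []
print([e.get_text(strip=True) for e in old_result])   # ['new']

# New behavior: select() returns every matching element.
new_result = element.select("span.tag")
print([e.get_text(strip=True) for e in new_result])   # ['new', 'sale', 'popular']
```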


21 changes: 21 additions & 0 deletions crawl4ai/utils.py
@@ -21,6 +21,8 @@
import cProfile
import pstats
from functools import wraps
import asyncio


class InvalidCSSSelectorError(Exception):
pass
@@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]:

return content_paths

def configure_windows_event_loop():
"""
Configure the Windows event loop to use ProactorEventLoop.
This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
This function should only be called on Windows systems and before any async operations.
On non-Windows systems, this function does nothing.
Example:
```python
from crawl4ai.async_configs import configure_windows_event_loop
# Call this before any async operations if you're on Windows
configure_windows_event_loop()
```
"""
if platform.system() == 'Windows':
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

def get_error_context(exc_info, context_lines: int = 5):
"""
Extract error context with more reliable line number tracking.
21 changes: 19 additions & 2 deletions docs/md_v3/tutorials/async-webcrawler-basics.md
@@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might tweak
---

-## 5. Putting It All Together
+## 5. Windows-Specific Configuration

When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations.

To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations:

```python
from crawl4ai.utils import configure_windows_event_loop

# Call this before any async operations if you're on Windows
configure_windows_event_loop()

# Your AsyncWebCrawler code here
```
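
If it helps to see the call in context, here is a sketch of a complete script built on the basic usage pattern from earlier in this tutorial (the URL is just a placeholder):

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

# Safe to call unconditionally: the function is a no-op on non-Windows systems.
configure_windows_event_loop()

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:300])

if __name__ == "__main__":
    asyncio.run(main())
```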

---

## 6. Putting It All Together

Here’s a slightly more in-depth example that shows off a few key config parameters at once:

@@ -193,7 +210,7 @@ if __name__ == "__main__":

---

-## 6. Next Steps
+## 7. Next Steps

- **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md).
- **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md).
