fix(extraction): JsonCss selector and crawler improvements
- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one
- Add robust error handling to page_need_scroll with default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247
unclecode committed Jan 5, 2025
1 parent 0857c7b commit 72fbdac
Showing 6 changed files with 56 additions and 102 deletions.
2 changes: 1 addition & 1 deletion crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.246"
+__version__ = "0.4.247"
17 changes: 13 additions & 4 deletions crawl4ai/async_crawler_strategy.py
@@ -2163,20 +2163,29 @@ async def get_page_dimensions(self, page: Page):
            }
        """)

-    async def page_need_scroll(self, page: Page):
+    async def page_need_scroll(self, page: Page) -> bool:
        """
        Determine whether the page need to scroll
        Args:
            page: Playwright page object
        Returns:
-            page should scroll or not
+            bool: True if page needs scrolling
        """
-        return await page.evaluate("""
+        try:
+            need_scroll = await page.evaluate("""
        () => {
            const scrollHeight = document.documentElement.scrollHeight;
            const viewportHeight = window.innerHeight;
            return scrollHeight > viewportHeight;
        }
-        """)
+            """)
+            return need_scroll
+        except Exception as e:
+            self.logger.warning(
+                message="Failed to check scroll need: {error}. Defaulting to True for safety.",
+                tag="SCROLL",
+                params={"error": str(e)}
+            )
+            return True  # Default to scrolling if check fails
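The hardened check above is what the crawler can consult before running any scroll logic, and the `True` fallback means a failed check never silently skips content below the fold. A rough standalone sketch of that pattern (not crawl4ai's internal implementation; `scroll_to_bottom_if_needed` and the target URL are illustrative):

```python
import asyncio
from playwright.async_api import async_playwright


async def scroll_to_bottom_if_needed(page, step: int = 800, pause: float = 0.2):
    # Same check as page_need_scroll: does the document overflow the viewport?
    try:
        need_scroll = await page.evaluate(
            "() => document.documentElement.scrollHeight > window.innerHeight"
        )
    except Exception:
        need_scroll = True  # mirror the fix: if the check fails, assume we must scroll

    if not need_scroll:
        return

    last_height = 0
    while True:
        height = await page.evaluate("() => document.documentElement.scrollHeight")
        if height == last_height:
            break  # no new content appeared; stop scrolling
        last_height = height
        await page.mouse.wheel(0, step)
        await asyncio.sleep(pause)


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com")
        await scroll_to_bottom_if_needed(page)
        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
```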
93 changes: 0 additions & 93 deletions crawl4ai/content_scraping_strategy.py
@@ -122,92 +122,6 @@ async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)

def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
"""
Generate markdown content from cleaned HTML.
Args:
cleaned_html (str): The cleaned HTML content.
html (str): The original HTML content.
url (str): The URL of the page.
success (bool): Whether the content was successfully cleaned.
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the generated markdown content.
"""
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())

if markdown_generator:
try:
if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
markdown_generator.content_filter = BM25ContentFilter(
user_query=kwargs.get('fit_markdown_user_query', None),
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
)

markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
html2text_options=kwargs.get('html2text', {})
)

return {
'markdown': markdown_result.raw_markdown,
'fit_markdown': markdown_result.fit_markdown,
'fit_html': markdown_result.fit_html,
'markdown_v2': markdown_result
}
except Exception as e:
self._log('error',
message="Error using new markdown generation strategy: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
markdown_generator = None
return {
'markdown': f"Error using new markdown generation strategy: {str(e)}",
'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'markdown_v2': None
}

# Legacy method
"""
# h = CustomHTML2Text()
# h.update_params(**kwargs.get('html2text', {}))
# markdown = h.handle(cleaned_html)
# markdown = markdown.replace(' ```', '```')
# fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
# fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
# if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
# content_filter = kwargs.get('content_filter', None)
# if not content_filter:
# content_filter = BM25ContentFilter(
# user_query=kwargs.get('fit_markdown_user_query', None),
# bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
# )
# fit_html = content_filter.filter_content(html)
# fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
# fit_markdown = h.handle(fit_html)
# markdown_v2 = MarkdownGenerationResult(
# raw_markdown=markdown,
# markdown_with_citations=markdown,
# references_markdown=markdown,
# fit_markdown=fit_markdown
# )
# return {
# 'markdown': markdown,
# 'fit_markdown': fit_markdown,
# 'fit_html': fit_html,
# 'markdown_v2' : markdown_v2
# }
"""

def flatten_nested_elements(self, node):
"""
Flatten nested elements in a HTML tree.
@@ -798,13 +712,6 @@ def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRES

cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')

# markdown_content = self._generate_markdown_content(
# cleaned_html=cleaned_html,
# html=html,
# url=url,
# success=success,
# **kwargs
# )

return {
# **markdown_content,
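With the inline helper removed, markdown generation goes through the generator strategy object directly. A minimal sketch of that path, mirroring the calls in the deleted code above (the import paths and sample HTML are assumptions, not taken from this commit):

```python
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter

cleaned_html = "<h1>Example</h1><p>Some already-cleaned page content.</p>"

generator = DefaultMarkdownGenerator()
# Optionally attach a BM25 filter so fit_markdown / fit_html get populated,
# as the removed helper did when fit_markdown was requested.
generator.content_filter = BM25ContentFilter(user_query=None, bm25_threshold=1.0)

result = generator.generate_markdown(
    cleaned_html=cleaned_html,
    base_url="https://example.com",
    html2text_options={},
)
print(result.raw_markdown)
print(result.fit_markdown)
```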
4 changes: 2 additions & 2 deletions crawl4ai/extraction_strategy.py
@@ -974,8 +974,7 @@ def _get_base_elements(self, parsed_html, selector: str):
         return parsed_html.select(selector)

     def _get_elements(self, element, selector: str):
-        selected = element.select_one(selector)
-        return [selected] if selected else []
+        return element.select(selector)

     def _get_element_text(self, element) -> str:
         return element.get_text(strip=True)
@@ -1050,3 +1049,4 @@ def _get_element_html(self, element) -> str:
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
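The practical effect of the `_get_elements` change is easiest to see with plain BeautifulSoup, which is the API these helpers call into. A small standalone illustration (the HTML snippet is made up):

```python
from bs4 import BeautifulSoup

html = """
<div class="product">
  <span class="tag">new</span>
  <span class="tag">sale</span>
  <span class="tag">popular</span>
</div>
"""
element = BeautifulSoup(html, "html.parser").select_one("div.product")

# Old behavior: select_one() keeps only the first match.
selected = element.select_one("span.tag")
old_result = [selected] if selected else []
print([e.get_text(strip=True) for e in old_result])   # ['new']

# New behavior: select() returns every matching element.
new_result = element.select("span.tag")
print([e.get_text(strip=True) for e in new_result])   # ['new', 'sale', 'popular']
```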


21 changes: 21 additions & 0 deletions crawl4ai/utils.py
@@ -21,6 +21,8 @@
import cProfile
import pstats
from functools import wraps
import asyncio


class InvalidCSSSelectorError(Exception):
pass
@@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]:

return content_paths

def configure_windows_event_loop():
"""
Configure the Windows event loop to use ProactorEventLoop.
This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
This function should only be called on Windows systems and before any async operations.
On non-Windows systems, this function does nothing.
Example:
```python
from crawl4ai.async_configs import configure_windows_event_loop
# Call this before any async operations if you're on Windows
configure_windows_event_loop()
```
"""
if platform.system() == 'Windows':
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

def get_error_context(exc_info, context_lines: int = 5):
"""
Extract error context with more reliable line number tracking.
21 changes: 19 additions & 2 deletions docs/md_v3/tutorials/async-webcrawler-basics.md
@@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might tweak
---

-## 5. Putting It All Together
+## 5. Windows-Specific Configuration

When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations.

To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations:

```python
from crawl4ai.utils import configure_windows_event_loop

# Call this before any async operations if you're on Windows
configure_windows_event_loop()

# Your AsyncWebCrawler code here
```
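
If it helps to see the call in context, here is a sketch of a complete script built on the basic usage pattern from earlier in this tutorial (the URL is just a placeholder):

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.utils import configure_windows_event_loop

# Safe to call unconditionally: the function is a no-op on non-Windows systems.
configure_windows_event_loop()

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:300])

if __name__ == "__main__":
    asyncio.run(main())
```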

---

## 6. Putting It All Together

Here’s a slightly more in-depth example that shows off a few key config parameters at once:

@@ -193,7 +210,7 @@ if __name__ == "__main__":

---

-## 6. Next Steps
+## 7. Next Steps

- **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md).
- **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md).
