Closed
Description
I tweaked this example code:
https://github.com/unclecode/crawl4ai/blob/main/docs/examples/full_page_screenshot_and_pdf_export.md
but none of them is working :(
Scraping was successful
No screenshot was captured.
No pdf was rendered.
crawl4ai: v0.4.247
My code is:
import os, sys
import asyncio
import random
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
test_url="https://blog.langchain.dev/semantic-search-for-langgraph-memory/"
async def main():
browser_config = BrowserConfig(
verbose=True,
headless=True, # Screenshots work better in headless mode
viewport_width=1920,
viewport_height=1080
)
run_config = CrawlerRunConfig(
page_timeout=30000,
delay_before_return_html=random.randint(1, 3),
# Content filtering
word_count_threshold=10, # Minimum words per content block
exclude_external_links=True, # Remove external links
excluded_tags=["form", "header", "footer"],
keep_data_attributes=False,
# Content processing
remove_overlay_elements=True, # Remove popups/modals
process_iframes=True, # Process iframe content
# Cache control
cache_mode=CacheMode.BYPASS,
)
async def screenshot_with_running_js_code():
async with AsyncWebCrawler(config=browser_config) as crawler:
js_code = [
"window.scrollTo(0, document.body.scrollHeight, {behavior: 'smooth'});",
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(" +
"button => button.textContent.trim().toLowerCase().includes('load more'));" +
"if (loadMoreButton) loadMoreButton.click();"
]
# Request both PDF and screenshot
result = await crawler.arun(
config=run_config,
url=test_url,
pdf=True,
screenshot=True,
js_code=js_code,
wait_for="document.readyState === 'complete'", # Ensure page is fully loaded
cache_mode=CacheMode.BYPASS,
)
if not result.success:
print(f"Crawl failed: {result.error_message}")
print(f"Status code: {result.status_code}")
else:
# print(result.cleaned_html[:500]) # Freed of forms, header, footer, data-* attributes
print("Scraping was successful")
# Save screenshot
if result.screenshot:
from base64 import b64decode
with open(os.path.join(__location__, "screenshots", "screenshot.png"), "wb") as f:
f.write(b64decode(result.screenshot))
else:
print("No screenshot was captured.")
# Save PDF
if result.pdf:
pdf_bytes = b64decode(result.pdf)
with open(os.path.join(__location__, "pdfs", "page.pdf"), "wb") as f:
f.write(pdf_bytes)
else:
print("No pdf was rendered.")
await screenshot_with_running_js_code()
if __name__ == "__main__":
asyncio.run(main())
Activity