Skip to content

Neither screenshot nor PDF creation is working. #477

Closed
@vanetreg

Description

I tweaked this example code:
https://github.com/unclecode/crawl4ai/blob/main/docs/examples/full_page_screenshot_and_pdf_export.md
but none of them is working :(

Scraping was successful
No screenshot was captured.
No pdf was rendered.

crawl4ai: v0.4.247

My code is:

import os, sys
import asyncio
import random
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

# Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

test_url="https://blog.langchain.dev/semantic-search-for-langgraph-memory/"


async def main():
    browser_config = BrowserConfig(
        verbose=True, 
        headless=True,  # Screenshots work better in headless mode
        viewport_width=1920,
        viewport_height=1080
    )

    run_config = CrawlerRunConfig(
        page_timeout=30000,
        delay_before_return_html=random.randint(1, 3),
        # Content filtering
        word_count_threshold=10,        # Minimum words per content block
        exclude_external_links=True,    # Remove external links
        excluded_tags=["form", "header", "footer"],
        keep_data_attributes=False,
        # Content processing
        remove_overlay_elements=True,   # Remove popups/modals
        process_iframes=True,            # Process iframe content
        # Cache control
        cache_mode=CacheMode.BYPASS,
        )

    async def screenshot_with_running_js_code():
        async with AsyncWebCrawler(config=browser_config) as crawler:
            js_code = [
                "window.scrollTo(0, document.body.scrollHeight, {behavior: 'smooth'});",
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(" +
                "button => button.textContent.trim().toLowerCase().includes('load more'));" +
                "if (loadMoreButton) loadMoreButton.click();"
            ]
            # Request both PDF and screenshot
            result = await crawler.arun(
                config=run_config,
                url=test_url,
                pdf=True,
                screenshot=True,
                js_code=js_code,
                wait_for="document.readyState === 'complete'",  # Ensure page is fully loaded
                cache_mode=CacheMode.BYPASS,
            )
            if not result.success:
                print(f"Crawl failed: {result.error_message}")
                print(f"Status code: {result.status_code}")
            else:
                # print(result.cleaned_html[:500]) # Freed of forms, header, footer, data-* attributes
                print("Scraping was successful")
                # Save screenshot
                if result.screenshot:
                    from base64 import b64decode
                    with open(os.path.join(__location__, "screenshots", "screenshot.png"), "wb") as f:
                        f.write(b64decode(result.screenshot))
                else:
                    print("No screenshot was captured.")
                # Save PDF
                if result.pdf:
                    pdf_bytes = b64decode(result.pdf)
                    with open(os.path.join(__location__, "pdfs", "page.pdf"), "wb") as f:
                        f.write(pdf_bytes)
                else:
                    print("No pdf was rendered.")

    await screenshot_with_running_js_code()
    

if __name__ == "__main__":
    asyncio.run(main())

Activity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions