Added pdf form extractor

D-K-P · D-K-P · commit 18a089cf999b · 2025-03-18T16:41:37.000Z
diff --git a/docs/docs.json b/docs/docs.json
@@ -299,7 +299,14 @@
               }
             ]
           },
-
+          {
+            "group": "Python guides",
+            "pages": [
+              "guides/python/python-image-processing",
+              "guides/python/python-crawl4ai",
+              "guides/python/python-pdf-form-extractor"
+            ]
+          },
           {
             "group": "Example projects",
             "pages": [
@@ -310,10 +317,6 @@
               "guides/example-projects/vercel-ai-sdk-image-generator"
             ]
           },
-          {
-            "group": "Python examples",
-            "pages": ["guides/python/python-image-processing", "guides/python/python-crawl4ai"]
-          },
           {
             "group": "Example tasks",
             "pages": [
diff --git a/docs/guides/python/python-pdf-form-extractor.mdx b/docs/guides/python/python-pdf-form-extractor.mdx
@@ -0,0 +1,194 @@
+---
+title: "Python PDF form extractor example"
+sidebarTitle: "Python PDF form extractor"
+description: "Learn how to use Trigger.dev with Python to extract form data from PDF files."
+---
+
+import PythonLearnMore from "/snippets/python-learn-more.mdx";
+
+## Overview
+
+This demo showcases how to use Trigger.dev with Python to extract structured form data from a PDF file available at a URL.
+
+## Prerequisites
+
+- A project with [Trigger.dev initialized](/quick-start)
+- [Python](https://www.python.org/) installed on your local machine
+
+## Features
+
+- A [Trigger.dev](https://trigger.dev) task to trigger the Python script
+- [Trigger.dev Python build extension](https://trigger.dev/docs/config/extensions/pythonExtension) to install the dependencies and run the Python script
+- [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) to extract form data from PDF files
+- [Requests](https://docs.python-requests.org/en/master/) to download PDF files from URLs
+
+## GitHub repo
+
+<Card
+  title="View the project on GitHub"
+  icon="GitHub"
+  href="https://github.com/triggerdotdev/examples/tree/main/python-pdf-form-extractor/"
+>
+  Click here to view the full code for this project in our examples repository on GitHub. You can
+  fork it and use it as a starting point for your own project.
+</Card>
+
+## The code
+
+### Build configuration
+
+After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:
+
+```ts trigger.config.ts
+import { pythonExtension } from "@trigger.dev/python/extension";
+import { defineConfig } from "@trigger.dev/sdk/v3";
+
+// Python build extension: installs the dependencies from requirements.txt
+// and runs the Python scripts matched below.
+const pythonBuild = pythonExtension({
+  // Where your requirements.txt file lives
+  requirementsFile: "./requirements.txt",
+  // Path to your Python binary (here, the one inside your virtual environment)
+  devPythonBinaryPath: "venv/bin/python",
+  // Glob(s) matching the Python scripts to run
+  scripts: ["src/python/**/*.py"],
+});
+
+export default defineConfig({
+  runtime: "node",
+  project: "<your-project-ref>",
+  // Your other config settings...
+  build: {
+    extensions: [pythonBuild],
+  },
+});
+```
+
+<Info>
+  Learn more about executing scripts in your Trigger.dev project using our Python build extension
+  [here](/config/extensions/pythonExtension).
+</Info>
+
+### Task code
+
+This task uses the `python.runScript` method to run the `extract-pdf-form.py` script with the given PDF URL as an argument. The script prints the extracted form data as JSON, which the task parses and returns together with the script's `stderr` output and exit code.
+
+```ts src/trigger/pythonPdfTask.ts
+import { task } from "@trigger.dev/sdk/v3";
+import { python } from "@trigger.dev/python";
+
+/**
+ * Runs the `extract-pdf-form.py` script against the PDF at `payload.pdfUrl`
+ * and returns the form data the script prints to stdout as JSON, along with
+ * the script's stderr output and exit code.
+ *
+ * Throws if the script's stdout is not valid JSON.
+ */
+export const processPdfForm = task({
+  id: "process-pdf-form",
+  run: async (payload: { pdfUrl: string }) => {
+    const { pdfUrl } = payload;
+
+    // The script takes the PDF URL as its only argument
+    const result = await python.runScript("./src/python/extract-pdf-form.py", [pdfUrl]);
+
+    // Parse the JSON output from the script; keep it as `unknown` so callers
+    // validate the shape before relying on it
+    let formData: unknown;
+    try {
+      formData = JSON.parse(result.stdout);
+    } catch {
+      throw new Error(`Failed to parse JSON output: ${result.stdout}`);
+    }
+
+    return {
+      formData,
+      stderr: result.stderr,
+      exitCode: result.exitCode,
+    };
+  },
+});
+```
+
+### Add a requirements.txt file
+
+Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies.
+
+```txt requirements.txt
+PyMuPDF==1.23.8
+requests==2.31.0
+```
+
+### The Python script
+
+The Python script uses PyMuPDF to extract form data from a PDF file. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-pdf-form-extractor/src/python/extract-pdf-form.py).
+
+```python src/python/extract-pdf-form.py
+import fitz  # PyMuPDF
+import requests
+import os
+import json
+import sys
+from urllib.parse import urlparse
+
+def download_pdf(url):
+    """Download PDF from URL to a temporary file"""
+    response = requests.get(url)
+    response.raise_for_status()
+
+    # Get filename from URL or use default
+    filename = os.path.basename(urlparse(url).path) or "downloaded.pdf"
+    filepath = os.path.join("/tmp", filename)
+
+    with open(filepath, 'wb') as f:
+        f.write(response.content)
+    return filepath
+
+def extract_form_data(pdf_path):
+    """Extract form data from a PDF file."""
+    doc = fitz.open(pdf_path)
+    form_data = {}
+
+    for page_num, page in enumerate(doc):
+        fields = page.widgets()
+        for field in fields:
+            field_name = field.field_name or f"unnamed_field_{page_num}_{len(form_data)}"
+            field_type = field.field_type_string
+            field_value = field.field_value
+
+            # For checkboxes, convert to boolean
+            if field_type == "CheckBox":
+                field_value = field_value == "Yes"
+
+            form_data[field_name] = {
+                "type": field_type,
+                "value": field_value,
+                "page": page_num + 1
+            }
+
+    return form_data
+
+def main():
+    if len(sys.argv) < 2:
+        print(json.dumps({"error": "PDF URL is required as an argument"}), file=sys.stderr)
+        return 1
+
+    url = sys.argv[1]
+
+    try:
+        pdf_path = download_pdf(url)
+        form_data = extract_form_data(pdf_path)
+
+        # Convert to JSON for structured output
+        structured_output = json.dumps(form_data, indent=2)
+        print(structured_output)
+        return 0
+    except Exception as e:
+        print(json.dumps({"error": str(e)}), file=sys.stderr)
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())
+```
+
+## Testing your task
+
+1. Create a virtual environment `python -m venv venv`
+2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate`
+3. Install the Python dependencies `pip install -r requirements.txt`
+4. Copy the project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
+5. Run the Trigger.dev CLI `dev` command (it may ask you to authorize the CLI if you haven't already).
+6. Test the task in the dashboard by providing a valid PDF URL.
+7. Deploy the task to production using the Trigger.dev CLI `deploy` command.
+
+<PythonLearnMore />

Original file line number	Diff line number	Diff line change
`@@ -299,7 +299,14 @@`
`299`	`299`	`}`
`300`	`300`	`]`
`301`	`301`	`},`
`302`		`-`
	`302`	`+ {`
	`303`	`+ "group": "Python guides",`
	`304`	`+ "pages": [`
	`305`	`+ "guides/python/python-image-processing",`
	`306`	`+ "guides/python/python-crawl4ai",`
	`307`	`+ "guides/python/python-pdf-form-extractor"`
	`308`	`+ ]`
	`309`	`+ },`
`303`	`310`	`{`
`304`	`311`	`"group": "Example projects",`
`305`	`312`	`"pages": [`
`@@ -310,10 +317,6 @@`
`310`	`317`	`"guides/example-projects/vercel-ai-sdk-image-generator"`
`311`	`318`	`]`
`312`	`319`	`},`
`313`		`- {`
`314`		`- "group": "Python examples",`
`315`		`- "pages": ["guides/python/python-image-processing", "guides/python/python-crawl4ai"]`
`316`		`- },`
`317`	`320`	`{`
`318`	`321`	`"group": "Example tasks",`
`319`	`322`	`"pages": [`