Skip to content

Commit 18a089c

Browse files
committed
Added pdf form extractor
1 parent f217fa0 commit 18a089c

File tree

2 files changed

+202
-5
lines changed

2 files changed

+202
-5
lines changed

docs/docs.json

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,14 @@
299299
}
300300
]
301301
},
302-
302+
{
303+
"group": "Python guides",
304+
"pages": [
305+
"guides/python/python-image-processing",
306+
"guides/python/python-crawl4ai",
307+
"guides/python/python-pdf-form-extractor"
308+
]
309+
},
303310
{
304311
"group": "Example projects",
305312
"pages": [
@@ -310,10 +317,6 @@
310317
"guides/example-projects/vercel-ai-sdk-image-generator"
311318
]
312319
},
313-
{
314-
"group": "Python examples",
315-
"pages": ["guides/python/python-image-processing", "guides/python/python-crawl4ai"]
316-
},
317320
{
318321
"group": "Example tasks",
319322
"pages": [
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
---
2+
title: "Python PDF form extractor example"
3+
sidebarTitle: "Python PDF form extractor"
4+
description: "Learn how to use Trigger.dev with Python to extract form data from PDF files."
5+
---
6+
7+
import PythonLearnMore from "/snippets/python-learn-more.mdx";
8+
9+
## Overview
10+
11+
This demo showcases how to use Trigger.dev with Python to extract structured form data from a PDF file available at a URL.
12+
13+
## Prerequisites
14+
15+
- A project with [Trigger.dev initialized](/quick-start)
16+
- [Python](https://www.python.org/) installed on your local machine
17+
18+
## Features
19+
20+
- A [Trigger.dev](https://trigger.dev) task to trigger the Python script
21+
- [Trigger.dev Python build extension](https://trigger.dev/docs/config/extensions/pythonExtension) to install the dependencies and run the Python script
22+
- [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/) to extract form data from PDF files
23+
- [Requests](https://docs.python-requests.org/en/master/) to download PDF files from URLs
24+
25+
## GitHub repo
26+
27+
<Card
28+
title="View the project on GitHub"
29+
icon="GitHub"
30+
href="https://github.com/triggerdotdev/examples/tree/main/python-pdf-form-extractor/"
31+
>
32+
Click here to view the full code for this project in our examples repository on GitHub. You can
33+
fork it and use it as a starting point for your own project.
34+
</Card>
35+
36+
## The code
37+
38+
### Build configuration
39+
40+
After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:
41+
42+
```ts trigger.config.ts
43+
import { pythonExtension } from "@trigger.dev/python/extension";
44+
import { defineConfig } from "@trigger.dev/sdk/v3";
45+
46+
export default defineConfig({
47+
runtime: "node",
48+
project: "<your-project-ref>",
49+
// Your other config settings...
50+
build: {
51+
extensions: [
52+
pythonExtension({
53+
// The path to your requirements.txt file
54+
requirementsFile: "./requirements.txt",
55+
// The path to your Python binary
56+
devPythonBinaryPath: `venv/bin/python`,
57+
// The paths to your Python scripts to run
58+
scripts: ["src/python/**/*.py"],
59+
}),
60+
],
61+
},
62+
});
63+
```
64+
65+
<Info>
66+
Learn more about executing scripts in your Trigger.dev project using our Python build extension
67+
[here](/config/extensions/pythonExtension).
68+
</Info>
69+
70+
### Task code
71+
72+
This task uses the `python.runScript` method to run the `extract-pdf-form.py` script with the given PDF URL as an argument. The script prints the extracted form fields to stdout as JSON, which the task parses and returns along with the script's stderr output and exit code.
73+
74+
```ts src/trigger/pythonPdfTask.ts
75+
import { task } from "@trigger.dev/sdk/v3";
76+
import { python } from "@trigger.dev/python";
77+
78+
export const processPdfForm = task({
79+
id: "process-pdf-form",
80+
run: async (payload: { pdfUrl: string }, io: any) => {
81+
const { pdfUrl } = payload;
82+
const args = [pdfUrl];
83+
84+
const result = await python.runScript("./src/python/extract-pdf-form.py", args);
85+
86+
// Parse the JSON output from the script
87+
let formData;
88+
try {
89+
formData = JSON.parse(result.stdout);
90+
} catch (error) {
91+
throw new Error(`Failed to parse JSON output: ${result.stdout}`);
92+
}
93+
94+
return {
95+
formData,
96+
stderr: result.stderr,
97+
exitCode: result.exitCode,
98+
};
99+
},
100+
});
101+
```
102+
103+
### Add a requirements.txt file
104+
105+
Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies.
106+
107+
```txt requirements.txt
108+
PyMuPDF==1.23.8
109+
requests==2.31.0
110+
```
111+
112+
### The Python script
113+
114+
The Python script uses PyMuPDF to extract form data from a PDF file. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-pdf-form-extractor/src/python/extract-pdf-form.py).
115+
116+
```python src/python/extract-pdf-form.py
117+
import fitz # PyMuPDF
118+
import requests
119+
import os
120+
import json
121+
import sys
122+
from urllib.parse import urlparse
123+
124+
def download_pdf(url):
125+
"""Download PDF from URL to a temporary file"""
126+
response = requests.get(url)
127+
response.raise_for_status()
128+
129+
# Get filename from URL or use default
130+
filename = os.path.basename(urlparse(url).path) or "downloaded.pdf"
131+
filepath = os.path.join("/tmp", filename)
132+
133+
with open(filepath, 'wb') as f:
134+
f.write(response.content)
135+
return filepath
136+
137+
def extract_form_data(pdf_path):
138+
"""Extract form data from a PDF file."""
139+
doc = fitz.open(pdf_path)
140+
form_data = {}
141+
142+
for page_num, page in enumerate(doc):
143+
fields = page.widgets()
144+
for field in fields:
145+
field_name = field.field_name or f"unnamed_field_{page_num}_{len(form_data)}"
146+
field_type = field.field_type_string
147+
field_value = field.field_value
148+
149+
# For checkboxes, convert to boolean
150+
if field_type == "CheckBox":
151+
field_value = field_value == "Yes"
152+
153+
form_data[field_name] = {
154+
"type": field_type,
155+
"value": field_value,
156+
"page": page_num + 1
157+
}
158+
159+
return form_data
160+
161+
def main():
162+
if len(sys.argv) < 2:
163+
print(json.dumps({"error": "PDF URL is required as an argument"}), file=sys.stderr)
164+
return 1
165+
166+
url = sys.argv[1]
167+
168+
try:
169+
pdf_path = download_pdf(url)
170+
form_data = extract_form_data(pdf_path)
171+
172+
# Convert to JSON for structured output
173+
structured_output = json.dumps(form_data, indent=2)
174+
print(structured_output)
175+
return 0
176+
except Exception as e:
177+
print(json.dumps({"error": str(e)}), file=sys.stderr)
178+
return 1
179+
180+
if __name__ == "__main__":
181+
sys.exit(main())
182+
```
183+
184+
## Testing your task
185+
186+
1. Create a virtual environment: `python -m venv venv`
187+
2. Activate the virtual environment. On Mac/Linux, run `source venv/bin/activate`; on Windows, run `venv\Scripts\activate`.
188+
3. Install the Python dependencies: `pip install -r requirements.txt`
189+
4. Copy the project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
190+
5. Run the Trigger.dev CLI `dev` command (it may ask you to authorize the CLI if you haven't already).
191+
6. Test the task in the dashboard by providing a valid PDF URL.
192+
7. Deploy the task to production using the Trigger.dev CLI `deploy` command.
193+
194+
<PythonLearnMore />

0 commit comments

Comments
 (0)