
Commit 7621d1f

Improving step by step instructions for structured_parser (#985)
2 parents b7f880b + c50ddbf commit 7621d1f

File tree

8 files changed: +134 additions, -104 deletions


.github/scripts/spellcheck_conf/wordlist.txt

Lines changed: 9 additions & 0 deletions
```diff
@@ -1547,3 +1547,12 @@ DeepEval
 SDV
 sklearn
 GCP
+compat
+ArtifactExtractor
+DatabaseManager
+DocumentLens
+PDFs
+RequestBuilder
+VectorIndexManager
+csvs
+programmatically
```

.github/workflows/pytest_cpu_gha_runner.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -72,5 +72,4 @@ jobs:
         with:
           paths: |
             **/*.xml
-            !**/AndroidManifest.xml
         if: always()
```

end-to-end-use-cases/structured_parser/README.md

Lines changed: 19 additions & 6 deletions
````diff
@@ -26,27 +26,40 @@ The tool is designed to handle complex documents with high accuracy and provides
 1. Clone the repository
 2. Install dependencies:
 
+```bash
+git clone https://github.com/meta-llama/llama-cookbook.git
+```
+```bash
+cd llama-cookbook
+```
+```bash
+pip install -r requirements.txt
+```
+2. Install project specific dependencies:
+```bash
+cd end-to-end-use-cases/structured_parser
+```
 ```bash
 pip install -r requirements.txt
 ```
-
-3. Configure the tool (see Configuration section)
-
 ## Quick Start
 
-Extract text from a PDF:
+### Configure the tool (see [Configuration](#Configuration) section)
+(Note: Setup API Key, Model for inferencing, etc.)
+
+### Extract text from a PDF:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text
 ```
 
-Extract text and tables, and save tables as CSV files:
+### Extract text and tables, and save tables as CSV files:
 
 ```bash
 python src/structured_extraction.py path/to/document.pdf --text --tables --save_tables_as_csv
 ```
 
-Process a directory of PDFs and export tables to Excel:
+### Process a directory of PDFs and export tables to Excel:
 
 ```bash
 python src/structured_extraction.py path/to/pdf_directory --text --tables --export_excel
````

end-to-end-use-cases/structured_parser/requirements.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -14,7 +14,6 @@ vllm>=0.2.0
 openai>=1.0.0
 
 # Database and vector search
-sqlite3>=3.35.0
 chromadb>=0.4.0
 sqlalchemy>=2.0.0
```
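
Note: `sqlite3` ships with CPython's standard library and is not installable from PyPI, which is presumably why the pin was dropped. A quick illustrative check (not part of the commit):

```python
# sqlite3 is bundled with CPython, so it needs no requirements.txt entry.
import sqlite3

print(sqlite3.sqlite_version)  # version of the SQLite engine linked into Python
```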

end-to-end-use-cases/structured_parser/src/config.yaml

Lines changed: 75 additions & 2 deletions
```diff
@@ -23,7 +23,7 @@ model:
   extraction_inference:
     temperature: 0.2
     top_p: 0.9
-    max_completion_tokens: 17000
+    max_completion_tokens: 32000
     seed: 42
 
 # Artifact configuration
@@ -64,7 +64,7 @@ artifacts:
 
   images:
     prompts:
-      system: "You are an OCR expert.\n\n1.Your task is to extract images, pictures and diagrams data from the following document. Do not extract tables or charts. \n2. For each extracted image, you must write\n a) a caption as given in the document,\n b) a detailed description of the image; utilize the surrounding text for this. Your descriptions should be very informative so that a human can understand what is in the image without ever seeing the document.Think step-by-step and write a JSON that corresponds to the schema and the information in the document\n\nIf there is nothing to extract, simply return an empty JSON {\"images\": []}. Ensure your final answer is appropriately formatted as a JSON object and wrapped in a ```json\n\n``` block."
+      system: "You are an OCR expert. (Note: Do not extract tables)\n\n1.Your task is to extract images, pictures, charts and diagrams only from the following document.\n 2. For each extracted image, you must write\n a) a caption as given in the document\n b) a detailed description of the image; utilize the surrounding text for this. Your descriptions should be very informative so that a human can understand what is in the image without ever seeing the document. Think step-by-step and write a JSON that corresponds to the schema and the information in the document\n\nIf there is nothing to extract, simply return an empty JSON {\"images\": []}. \nIf the image is a table, simply return an empty JSON {\"images\": []}. \n\nIf the image is a chart or a graph then you must convert them to JSON outputs.\n\n# Instructions to convert charts & graphs to JSON\nYour task is to: Analyze and describe the chart or graph. Summarize the type of chart/graph (e.g., bar chart, line graph, pie chart). Identify the axes, labels, categories, and any notable trends or patterns. Provide a brief textual description of what the chart/graph represents. Extract and structure the data:\n1. Capture all relevant values and data points from the chart/graph.\n2. Organize the extracted data into a clear and logical JSON structure.\n\n# Output format:\n\nYour response should be captured in the chart_data attribute of the JSON schema. Ensure your final answer is appropriately formatted as a JSON object and wrapped in a ```json\n\n``` block."
       user: "TARGET SCHEMA:\n```json\n{schema}\n```"
       output_schema: {
         "type": "object",
@@ -93,6 +93,79 @@ artifacts:
           "image_type": {
             "type": "string",
             "description": "Type of image (e.g., 'photograph', 'chart', 'diagram', 'illustration')"
+          },
+          "chart_data": {
+            "type": "object",
+            "properties": {
+              "type": {
+                "type": "string",
+                "enum": ["bar", "line", "pie", "scatter", "area"]
+              },
+              "title": {
+                "type": "string"
+              },
+              "subtitle": {
+                "type": "string"
+              },
+              "xAxis": {
+                "type": "object",
+                "properties": {
+                  "title": { "type": "string" },
+                  "labels": {
+                    "type": "array",
+                    "items": { "type": "string" }
+                  }
+                },
+                "required": ["title", "labels"]
+              },
+              "yAxis": {
+                "type": "object",
+                "properties": {
+                  "title": { "type": "string" },
+                  "labels": {
+                    "type": "array",
+                    "items": { "type": "string" }
+                  }
+                },
+                "required": ["title", "labels"]
+              },
+              "data": {
+                "type": "array",
+                "items": {
+                  "oneOf": [
+                    {
+                      "type": "object",
+                      "properties": {
+                        "label": { "type": "string" },
+                        "values": {
+                          "type": "array",
+                          "items": { "type": "number" }
+                        }
+                      },
+                      "required": ["label", "values"]
+                    },
+                    {
+                      "type": "object",
+                      "properties": {
+                        "x": { "type": "number" },
+                        "y": { "type": "number" }
+                      },
+                      "required": ["x", "y"]
+                    }
+                  ]
+                }
+              },
+              "options": {
+                "type": "object",
+                "properties": {
+                  "legend": { "type": "boolean" },
+                  "rtl": { "type": "boolean" },
+                  "responsive": { "type": "boolean" },
+                  "animation": { "type": "boolean" }
+                }
+              }
+            },
+            "required": ["type", "title", "xAxis", "yAxis", "data"]
           }
         },
         "required": [
```

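For reference, a payload that satisfies the new `chart_data` schema might look like the sketch below. All values are invented for illustration, and the commented-out validation step assumes the third-party `jsonschema` package plus a hypothetical `chart_data_schema` dict holding the schema above; neither is declared by this project.

```python
# A minimal sketch (not from the commit) of a model output that conforms
# to the chart_data schema added above. All values are hypothetical.
sample_chart = {
    "type": "bar",  # must be one of: bar, line, pie, scatter, area
    "title": "Quarterly Revenue",
    "xAxis": {"title": "Quarter", "labels": ["Q1", "Q2", "Q3", "Q4"]},
    "yAxis": {"title": "Revenue ($M)", "labels": ["0", "50", "100"]},
    "data": [
        # series form used by bar/line charts; scatter points would use
        # the {"x": ..., "y": ...} alternative from the oneOf branch
        {"label": "2023", "values": [42.0, 55.5, 61.2, 70.1]},
    ],
    "options": {"legend": True, "responsive": True},  # optional block
}

# Optional check, assuming `jsonschema` is installed:
# from jsonschema import validate
# validate(instance=sample_chart, schema=chart_data_schema)
```
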
end-to-end-use-cases/structured_parser/src/json_to_sql.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -100,7 +100,8 @@ def create_artifact_table(sql_db_path: str) -> None:
     cursor.execute("DROP TABLE IF EXISTS document_artifacts")
 
     # Create table with schema
-    cursor.execute("""
+    cursor.execute(
+        """
     CREATE TABLE IF NOT EXISTS document_artifacts (
         id INTEGER PRIMARY KEY AUTOINCREMENT,
         doc_path TEXT,
@@ -124,7 +125,8 @@ def create_artifact_table(sql_db_path: str) -> None:
         image_caption TEXT,
         image_type TEXT
     )
-    """)
+        """
+    )
 
     # Create indexes for common queries
     cursor.execute(
```
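
For context, a sketch of how the resulting table could be read back with Python's standard-library `sqlite3` module. Only the table and column names come from the diff; the database path and query are illustrative.

```python
# Illustrative only: read chart artifacts back out of document_artifacts.
import sqlite3

conn = sqlite3.connect("artifacts.db")  # hypothetical database path
cursor = conn.cursor()
cursor.execute(
    """
    SELECT doc_path, image_caption, image_type
    FROM document_artifacts
    WHERE image_type = ?
    """,
    ("chart",),
)
for doc_path, caption, image_type in cursor.fetchall():
    print(f"{doc_path}: {caption} ({image_type})")
conn.close()
```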

end-to-end-use-cases/structured_parser/src/structured_extraction.py

Lines changed: 11 additions & 82 deletions
```diff
@@ -196,6 +196,7 @@ def _run_inference(
         artifact_types = [r[0] for r in requests]
         inference_requests = [r[1] for r in requests]
 
+        response_batch = []
         if backend == "offline-vllm":
            request_batch = InferenceUtils.make_vllm_batch(inference_requests)
            response_batch = InferenceUtils.run_vllm_inference(request_batch)
```
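
The new `response_batch = []` line gives the name a safe default before the backend branches run. A simplified, self-contained sketch of the failure mode it guards against (not the repository's actual code):

```python
# Without a default, a branch that never assigns the name raises
# UnboundLocalError when the result is used afterwards.
def run_inference(backend: str, requests: list[str]) -> list[str]:
    response_batch = []  # safe default if neither branch matches
    if backend == "offline-vllm":
        response_batch = [f"vllm:{r}" for r in requests]  # placeholder work
    elif backend == "openai-compat":
        response_batch = [f"openai:{r}" for r in requests]
    return response_batch  # always bound, even for an unknown backend

print(run_inference("unknown-backend", ["req1"]))  # [] instead of a crash
```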
```diff
@@ -304,79 +305,6 @@ def from_pdf(pdf_path: str, artifact_types: List[str]) -> List[ExtractedPage]:
 
         return pdf_pages
 
-    # @staticmethod
-    # async def _run_inference_async(
-    #     requests: List[Tuple[str, InferenceRequest]],
-    # ) -> List[Tuple[str, str]]:
-    #     """
-    #     Run inference asynchronously for all requests.
-
-    #     Args:
-    #         requests: List of tuples containing (artifact_type, inference_request)
-
-    #     Returns:
-    #         List of tuples containing (artifact_type, response)
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #     """
-    #     backend = config["model"].get("backend")
-    #     if backend not in SUPPORTED_BACKENDS:
-    #         raise ValueError(
-    #             f"Allowed config.model.backend: {SUPPORTED_BACKENDS}, got unknown value: {backend}"
-    #         )
-
-    #     artifact_types = [r[0] for r in requests]
-    #     inference_requests = [r[1] for r in requests]
-
-    #     if backend == "offline-vllm":
-    #         request_batch = InferenceUtils.make_vllm_batch(inference_requests)
-    #         response_batch = InferenceUtils.run_vllm_inference(request_batch)
-    #     elif backend == "openai-compat":
-    #         tasks = [
-    #             InferenceUtils.async_run_openai_inference(request)
-    #             for request in inference_requests
-    #         ]
-    #         response_batch = await asyncio.gather(*tasks)
-
-    #     return list(zip(artifact_types, response_batch))
-
-    # @staticmethod
-    # async def from_image_async(
-    #     img_path: str,
-    #     artifact_types: Union[List[str], str],
-    # ) -> ArtifactCollection:
-    #     """
-    #     Extract artifacts from an image asynchronously.
-
-    #     Args:
-    #         img_path: Path to the image file
-    #         artifact_types: Type(s) of artifacts to extract
-
-    #     Returns:
-    #         ArtifactCollection: Extracted artifacts
-
-    #     Raises:
-    #         ValueError: If the backend is not supported
-    #         FileNotFoundError: If the image file doesn't exist
-    #     """
-    #     if not os.path.exists(img_path):
-    #         raise FileNotFoundError(f"Image file not found: {img_path}")
-
-    #     if isinstance(artifact_types, str):
-    #         artifact_types = [artifact_types]
-
-    #     # Prepare inference requests
-    #     requests = ArtifactExtractor._prepare_inference_requests(
-    #         img_path, artifact_types
-    #     )
-
-    #     # Run inference asynchronously
-    #     responses = await ArtifactExtractor._run_inference_async(requests)
-
-    #     # Process responses
-    #     return ArtifactExtractor._process_responses(responses)
-
 
 def get_artifact_types(text: bool, tables: bool, images: bool) -> List[str]:
     """
```
```diff
@@ -422,16 +350,16 @@ def get_target_files(target_path: str) -> List[Path]:
     if not os.path.exists(target_path):
         raise FileNotFoundError(f"Target path not found: {target_path}")
 
-    target_path = Path(target_path)
-    if target_path.is_file() and target_path.suffix not in SUPPORTED_FILE_TYPES:
+    target_path_obj = Path(target_path)
+    if target_path_obj.is_file() and target_path_obj.suffix not in SUPPORTED_FILE_TYPES:
         raise ValueError(
-            f"Unsupported file type: {target_path.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
+            f"Unsupported file type: {target_path_obj.suffix}. Supported types: {SUPPORTED_FILE_TYPES}"
         )
 
     targets = (
-        [target_path]
-        if target_path.is_file()
-        else [f for f in target_path.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
+        [target_path_obj]
+        if target_path_obj.is_file()
+        else [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_FILE_TYPES]
     )
     logger.debug(f"Processing {len(targets)} files")
     if not targets:
@@ -456,7 +384,7 @@ def process_files(
     out_json = []
     for target in targets:
         try:
-            artifacts = ArtifactExtractor.from_pdf(target, artifact_types)
+            artifacts = ArtifactExtractor.from_pdf(str(target), artifact_types)
             out_json.extend(artifacts)
         except Exception as e:
             logger.error(f"Failed to process {target}: {e}")
@@ -485,6 +413,7 @@ def save_results(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Save to JSON file
+    output_path = None
     try:
         output_path = output_dir / f"artifacts_{timestamp}.json"
         json_content = json.dumps(data, indent=2)
@@ -562,8 +491,8 @@ def main(
     results = process_files(targets, artifact_types)
 
     # Save results
-    target_path = Path(target_path)
-    output_dir = target_path.parent / "extracted"
+    target_path_obj = Path(target_path)
+    output_dir = target_path_obj.parent / "extracted"
     save_results(
         output_dir,
         results,
```
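
The `target_path` to `target_path_obj` renames above follow one pattern: rebinding a `str` parameter to a `Path` gives the same name two types in one scope, which static type checkers flag and which makes later string-only uses (such as the `from_pdf(str(target), ...)` call) fragile. A simplified sketch of the pattern, not the repository's code:

```python
from pathlib import Path

SUPPORTED_SUFFIXES = {".pdf"}  # hypothetical stand-in for SUPPORTED_FILE_TYPES

def list_targets(target_path: str) -> list[Path]:
    # A distinct name keeps the str parameter and the Path object separate,
    # instead of rebinding target_path to a different type mid-function.
    target_path_obj = Path(target_path)
    if target_path_obj.is_file():
        return [target_path_obj]
    return [f for f in target_path_obj.iterdir() if f.suffix in SUPPORTED_SUFFIXES]
```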

src/tests/datasets/test_samsum_datasets.py

Lines changed: 16 additions & 10 deletions
```diff
@@ -1,32 +1,36 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-import pytest
 from dataclasses import dataclass
 from functools import partial
 from unittest.mock import patch
+
+import pytest
 from datasets import load_dataset
 
+
 @dataclass
 class Config:
     model_type: str = "llama"
 
+
 try:
-    load_dataset("Samsung/samsum")
+    load_dataset("knkarthick/samsum")
     SAMSUM_UNAVAILABLE = False
 except ValueError:
     SAMSUM_UNAVAILABLE = True
 
+
 @pytest.mark.skipif(SAMSUM_UNAVAILABLE, reason="Samsum dataset is unavailable")
 @pytest.mark.skip_missing_tokenizer
-@patch('llama_cookbook.finetuning.train')
-@patch('llama_cookbook.finetuning.AutoTokenizer')
+@patch("llama_cookbook.finetuning.train")
+@patch("llama_cookbook.finetuning.AutoTokenizer")
 @patch("llama_cookbook.finetuning.AutoConfig.from_pretrained")
 @patch("llama_cookbook.finetuning.AutoProcessor")
 @patch("llama_cookbook.finetuning.MllamaForConditionalGeneration.from_pretrained")
-@patch('llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained')
-@patch('llama_cookbook.finetuning.optim.AdamW')
-@patch('llama_cookbook.finetuning.StepLR')
+@patch("llama_cookbook.finetuning.LlamaForCausalLM.from_pretrained")
+@patch("llama_cookbook.finetuning.optim.AdamW")
+@patch("llama_cookbook.finetuning.StepLR")
 def test_samsum_dataset(
     step_lr,
     optimizer,
@@ -39,11 +43,13 @@ def test_samsum_dataset(
     mocker,
     setup_tokenizer,
     llama_version,
-    ):
+):
     from llama_cookbook.finetuning import main
 
     setup_tokenizer(tokenizer)
-    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [
+        32000 if "Llama-2" in llama_version else 128256
+    ]
     get_mmodel.return_value.get_input_embeddings.return_value.weight.shape = [0]
     get_config.return_value = Config()
 
@@ -55,7 +61,7 @@ def test_samsum_dataset(
         "use_peft": False,
         "dataset": "samsum_dataset",
         "batching_strategy": "padding",
-        }
+    }
 
     main(**kwargs)
```