Unstructured-IO
diff --git a/‎.github/workflows/ci.yml
Lines changed: 26 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 26 additions & 0 deletions
diff --git a/‎.github/workflows/ingest-test-fixtures-update-pr.yml
Lines changed: 11 additions & 7 deletions b/‎.github/workflows/ingest-test-fixtures-update-pr.yml
Lines changed: 11 additions & 7 deletions
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎CHANGELOG.md
Lines changed: 3 additions & 1 deletion b/‎CHANGELOG.md
Lines changed: 3 additions & 1 deletion
diff --git a/‎Makefile
Lines changed: 9 additions & 0 deletions b/‎Makefile
Lines changed: 9 additions & 0 deletions
diff --git a/‎scripts/html/elements_json_to_html.py
Lines changed: 66 additions & 0 deletions b/‎scripts/html/elements_json_to_html.py
Lines changed: 66 additions & 0 deletions
@@ -319,6 +319,32 @@ jobs:
         make install-ingest
         ./test_unstructured_ingest/test-ingest-src.sh
 
+  # Runs check-diff-expected-output-html.sh with OVERWRITE_FIXTURES disabled;
+  # presumably this diffs freshly rendered HTML against the committed HTML
+  # fixtures (confirm in the script).
+  test_json_to_html:
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10"]
+    runs-on: ubuntu-latest-m
+    needs: [setup, lint]
+    steps:
+    - uses: 'actions/checkout@v4'
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Get full Python version
+      id: full-python-version
+      # Quote both the name=value payload and the output file path so the
+      # shell never word-splits or glob-expands either of them.
+      run: echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
+    - name: Setup virtual environment
+      uses: ./.github/actions/base-cache
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Test HTML fixtures
+      env:
+        # Kept as a quoted string: environment variables are always strings.
+        OVERWRITE_FIXTURES: "false"
+        PYTHONPATH: ${{ github.workspace }}
+      run: |
+        source .venv/bin/activate
+        ./test_unstructured_ingest/check-diff-expected-output-html.sh
 
   test_unstructured_api_unit:
     strategy:
 
@@ -16,10 +16,10 @@ jobs:
     env:
       NLTK_DATA: ${{ github.workspace }}/nltk_data
     steps:
-    - uses: actions/checkout@v3
-    - uses: ./.github/actions/base-cache
-      with:
-        python-version: ${{ env.PYTHON_VERSION }}
+      - uses: actions/checkout@v3
+      - uses: ./.github/actions/base-cache
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
 
   setup_ingest:
     runs-on: ubuntu-latest
@@ -31,14 +31,14 @@ jobs:
       - uses: ./.github/actions/base-ingest-cache
         with:
           python-version: ${{ env.PYTHON_VERSION }}
-          check-only: 'true'
+          check-only: "true"
 
   update-fixtures-and-pr:
     runs-on: ubuntu-latest-m
     needs: [setup_ingest]
     steps:
       # actions/checkout MUST come before auth
-      - uses: 'actions/checkout@v4'
+      - uses: "actions/checkout@v4"
       - name: Set up Python ${{ env.PYTHON_VERSION }}
         uses: actions/setup-python@v5
         with:
@@ -53,7 +53,7 @@ jobs:
       - name: Setup docker-compose
         uses: KengoTODA/actions-setup-docker-compose@v1
         with:
-          version: '2.22.0'
+          version: "2.22.0"
       - name: Update test fixtures
         env:
           AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
@@ -111,6 +111,10 @@ jobs:
           tesseract --version
           python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
           ./test_unstructured_ingest/test-ingest-src.sh
+      - name: Update HTML fixtures
+        run: |
+          source .venv/bin/activate
+          make html-fixtures-update
 
       - name: Save branch name to environment file
         id: branch
 
@@ -204,6 +204,7 @@ example-docs/*_images
 examples/**/output/
 
 outputdiff.txt
+outputhtmldiff.txt
 metricsdiff.txt
 
 # analysis
 
@@ -1,11 +1,13 @@
-## 0.16.24-dev2
+## 0.16.24-dev3
 
 ### Enhancements
 
 - **`extract_image_block_types` now also works for CamelCase element type names**. Previously, `NarrativeText` and similar CamelCase element types could not be extracted using this parameter in `partition`. Now figures for those elements can be extracted just like `Image` and `Table` elements.
 
 ### Features
 
+- **Add JSON elements to HTML converter** - Converts JSON elements file into an HTML file.
+
 ### Fixes
 
 ## 0.16.23
 
@@ -327,3 +327,12 @@ docker-jupyter-notebook:
 .PHONY: run-jupyter
 run-jupyter:
 	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
+
+
+#########
+# Other #
+#########
+
+# Runs structured-json-to-html.sh, passing the expected-structured-output-html
+# directory as its only argument; presumably this regenerates the HTML
+# fixtures stored there (confirm in the script).
+.PHONY: html-fixtures-update
+html-fixtures-update:
+	test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
@@ -0,0 +1,66 @@
+import argparse
+import logging
+import os
+from pathlib import Path
+
+from unstructured.partition.html.convert import elements_to_html
+from unstructured.staging.base import elements_from_json
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def json_to_html(
+    filepath: Path, outdir: Path, exclude_binary_image_data: bool, no_group_by_page: bool
+):
+    """Render one JSON elements file as an HTML file inside ``outdir``.
+
+    The output keeps the input's file name with its suffix replaced by ``.html``.
+
+    Args:
+        filepath: Path to the JSON elements file to convert.
+        outdir: Directory that receives the HTML file; created if it is missing.
+        exclude_binary_image_data: Passed to ``elements_to_html`` as its second argument.
+        no_group_by_page: Passed to ``elements_to_html`` as its third argument.
+    """
+    logger.info("Processing: %s", filepath)
+    elements = elements_from_json(str(filepath))
+    elements_html = elements_to_html(elements, exclude_binary_image_data, no_group_by_page)
+
+    outpath = outdir / filepath.with_suffix(".html").name
+    os.makedirs(outpath.parent, exist_ok=True)
+    # Pin the encoding: without it open() uses the locale's preferred encoding,
+    # so non-ASCII text could be written differently (or fail to encode)
+    # depending on the machine. Plain "w" is enough; the file is never read back.
+    with open(outpath, "w", encoding="utf-8") as f:
+        f.write(elements_html)
+    logger.info("HTML rendered and saved to: %s", outpath)
+
+
+def multiple_json_to_html(
+    path: Path, outdir: Path, exclude_binary_image_data: bool, no_group_by_page: bool
+):
+    """Convert every ``.json`` file found under ``path`` to HTML.
+
+    The tree is walked recursively and each file is handed to ``json_to_html``.
+    The output directory for a file is ``outdir`` joined with the file's parent
+    directory relative to ``path``, so the input layout is mirrored.
+
+    Args:
+        path: Root directory to search for ``.json`` files.
+        outdir: Root directory for the generated HTML files.
+        exclude_binary_image_data: Passed through to ``json_to_html``.
+        no_group_by_page: Passed through to ``json_to_html``.
+    """
+    for dirpath, _subdirs, filenames in os.walk(path):
+        for filename in filenames:
+            # Only JSON files are converted; everything else is ignored.
+            if not filename.endswith(".json"):
+                continue
+            source = Path(dirpath) / filename
+            target_dir = outdir / source.relative_to(path).parent
+            json_to_html(source, target_dir, exclude_binary_image_data, no_group_by_page)
+
+
+def main():
+    """Command-line entry point: convert JSON elements to HTML.
+
+    Takes a positional path to a JSON file or a directory. A file is converted
+    with ``json_to_html``; anything else is treated as a directory and passed
+    to ``multiple_json_to_html``. Optional flags: ``--outdir``,
+    ``--exclude-img`` and ``--no-group``.
+
+    Exits through ``parser.error`` (status 2) when the given path does not exist.
+    """
+    parser = argparse.ArgumentParser(description="Convert JSON elements to HTML.")
+    parser.add_argument(
+        "filepath",
+        type=str,
+        help="""Path to the JSON file or directory containing elements.
+        If given directory it will convert all JSON files in directory
+        and all sub-directories.""",
+    )
+    parser.add_argument(
+        "--outdir", type=str, help="Output directory for the HTML file.", default=""
+    )
+    parser.add_argument(
+        "--exclude-img", action="store_true", help="Exclude binary image data from the HTML."
+    )
+    parser.add_argument("--no-group", action="store_true", help="Don't group elements by pages.")
+    args = parser.parse_args()
+
+    filepath = Path(args.filepath)
+    outdir = Path(args.outdir)
+
+    # os.walk() yields nothing for a missing path, so without this check a
+    # mistyped path would exit successfully having converted nothing.
+    if not filepath.exists():
+        parser.error(f"path does not exist: {filepath}")
+
+    if filepath.is_file():
+        json_to_html(filepath, outdir, args.exclude_img, args.no_group)
+    else:
+        multiple_json_to_html(filepath, outdir, args.exclude_img, args.no_group)
+
+
+if __name__ == "__main__":
+    main()