Merged
83 commits
12c5174
Cleaned up issues with extract and analysis views in DocumentKnowledge…
JSv4 Jul 22, 2025
2e2fc86
Added tests for EditExtractModal.tsx. Plan to do same for the analysi…
JSv4 Jul 22, 2025
d709038
Bump version to beta.
JSv4 Jul 22, 2025
f25d597
Successfully added component tests for FloatingAnalysesPanel, Floatin…
JSv4 Jul 22, 2025
2cd079c
Successfully included tests for txt-based format viewer and annotator.
JSv4 Jul 23, 2025
cd214b7
Added tests for read-only functionality.
JSv4 Jul 24, 2025
0482bd4
Wrapping up read-only changes.
JSv4 Jul 24, 2025
01b81d8
Completed read_only mode and tests.
JSv4 Jul 24, 2025
91bc1d0
Test coverage for vast majority of the document knowledge base compon…
JSv4 Jul 25, 2025
7baccc5
Rebuilt metadata backend to use same architecture as extract backend.
JSv4 Jul 26, 2025
9ab0afa
\
JSv4 Jul 26, 2025
2a0c070
Full Metadata workflow story working!
JSv4 Jul 27, 2025
84454fc
MetadataCellEditor.ct.tsx working.
JSv4 Jul 27, 2025
7fd2ad7
Fixed CorpusMetadataSettings.ct.tsx
JSv4 Jul 27, 2025
9a0dab6
DocumentMetadataGrid.ct.tsx working now.
JSv4 Jul 27, 2025
e495602
Progress on MetadataPerformance.ct.tsx
JSv4 Jul 30, 2025
9fdd4d8
Fixed another MetadataPerformance.ct.tsx test.
JSv4 Jul 30, 2025
1b89cb8
Frontend tests passing again with new metadata components.
JSv4 Jul 30, 2025
7ddcfcd
Storing plan to make DocumentKnowledgeBase not depend on corpus.
JSv4 Jul 31, 2025
e3d334b
Resolved many test issues.
JSv4 Aug 2, 2025
a80a1a3
Successfully resolved another 6 errors.
JSv4 Aug 2, 2025
a5ebf13
Remaining issues in DocumentKnowledgeBaseCorpusless.ct.tsx resolved.
JSv4 Aug 2, 2025
e1d7f80
Down to 3 failing tests.
JSv4 Aug 3, 2025
9459442
Resolved issue in ChatTray.ct.tsx
JSv4 Aug 3, 2025
4d20f58
Resolved failing test in CorpusRequiredEmptyState
JSv4 Aug 3, 2025
b57cf0c
Fixed last failure in DocumentKnowledgeBaseCorpusless.ct.tsx
JSv4 Aug 3, 2025
c3ac897
Restored failing metadata tests to a) match new return format and b) …
JSv4 Aug 3, 2025
0d3e66b
Cleanup some imports.
JSv4 Aug 4, 2025
06d8a8c
Fix linting issues.
JSv4 Aug 5, 2025
6786c59
Drop separate <DocumentViewer/> component.
JSv4 Aug 6, 2025
ead2385
Added a consumer for corpus-less documents. Added matching tests.
JSv4 Aug 8, 2025
f21b967
Additional modifications and more thorough testing for corpus-less ag…
JSv4 Aug 8, 2025
64345d6
Ran linter. Cleanup.
JSv4 Aug 8, 2025
99fbecb
Updated llm framework README to cover corpus-less doc agent.
JSv4 Aug 8, 2025
a894e0e
Re-enable chat for corpus-less docs. Update tests accordingly.
JSv4 Aug 8, 2025
693ecfd
Touch up tests.
JSv4 Aug 8, 2025
faff9e4
Stray old style metadata references and queries removed.
JSv4 Aug 9, 2025
c5402a5
Resolved some remaining uses of metadata labels. Ensured deep linking…
JSv4 Aug 9, 2025
d8b671c
Added base slug fields for slug nav.
JSv4 Aug 9, 2025
5849efe
Added frontend slug handling and tests.
JSv4 Aug 9, 2025
d8f30a5
Adding user settings controls and mutations.
JSv4 Aug 10, 2025
d56cd30
Checkpoint.
JSv4 Aug 11, 2025
c02bffb
Clean up env handling.
JSv4 Aug 12, 2025
83862aa
Route-based nav seems to be working nicely.
JSv4 Aug 14, 2025
ebad0ab
Fixed some failing tests.
JSv4 Aug 14, 2025
da263ed
Ran linter.
JSv4 Aug 14, 2025
cbeb4c8
Cleaned up unit tests and ran linter.
JSv4 Aug 15, 2025
2c5619e
Consolidated router documentation (and refreshed it). Cleaned up unus…
JSv4 Aug 16, 2025
5bcbe9c
Consolidated metadata documentation. Removed references to nlm_ingest…
JSv4 Aug 16, 2025
50c4836
Cleaned up some test naming. Ran frontend linter. Made CodeQL happy r…
JSv4 Aug 16, 2025
6bdf69d
Cleaned up some dependency issues. Removed unused route components.
JSv4 Aug 16, 2025
50852c3
Fix build issues.
JSv4 Aug 16, 2025
33fe77a
Fixed issue with export task pipeline. Added test coverage for export…
JSv4 Aug 16, 2025
c5ecfc3
Restored export task functionality. Improved styling for AuthGate and…
JSv4 Aug 16, 2025
69d1881
Fixed corpus settings visibility.
JSv4 Aug 16, 2025
0c57701
Resolved some permissioning issues in CorpusSettings.
JSv4 Aug 16, 2025
83ca5ae
Changed create corpus mutation to grant publish rights to owner.
JSv4 Aug 16, 2025
91ed088
Fix unit test
JSv4 Aug 16, 2025
6396586
Codecov action is suspiciously silent... changing our approach.
JSv4 Aug 16, 2025
306fd3a
Resolved issue with local websocket.
JSv4 Aug 16, 2025
fe4f5ec
Tell coverage reporting to ignore missing (temporary) files. Ran linter.
JSv4 Aug 16, 2025
0c174e7
Metadata and responsive styling update.
JSv4 Aug 16, 2025
05c1475
Ran linter. Updated coverage config.
JSv4 Aug 16, 2025
c171e1f
Added one-step labelset + label creation for more intuitive annotator f…
JSv4 Aug 17, 2025
87fd8ed
Improved flow and eliminated unnecessary page refresh in EnhancedLabel…
JSv4 Aug 17, 2025
91ce44e
Improved one-shot labelset / label creation. Improved test stability.…
JSv4 Aug 18, 2025
12ec701
Further refinement to docs. Fixed frontend build.
JSv4 Aug 18, 2025
6e7a8d0
Fixed failing test... added some slight styling tweaks to the Enhance…
JSv4 Aug 18, 2025
ff3a547
Removed a few remaining, stray (non-operative) llama_index references.
JSv4 Aug 18, 2025
a070ae8
Removed unused vcr fixtures.
JSv4 Aug 18, 2025
3b6fd24
Removed gremlin-specific docker YAMLS as we're moving away from suppo…
JSv4 Aug 18, 2025
79f8c5d
Enhanced document analysis framework with new admin views for syncing…
JSv4 Aug 22, 2025
8bbaef0
Enhanced data extraction framework by adding LLM call logging to Data…
JSv4 Aug 22, 2025
f22faa6
Mitigated previous (bad) decision to call a prod function in migration.
JSv4 Aug 23, 2025
86db4de
Clean up tests post-changes.
JSv4 Aug 23, 2025
ab6c38a
Removed test for a function that was describing output types in natur…
JSv4 Aug 24, 2025
e144c79
Fixed extract test.
JSv4 Aug 24, 2025
4322493
Fixed test that's now failing due to migrations creating analyzers.
JSv4 Aug 24, 2025
403eee6
Fixed a few more test failures resulting from migration to create ana…
JSv4 Aug 24, 2025
7176baa
Fixed unit tests.
JSv4 Aug 24, 2025
6ad722d
Cleanup quickstart docs.
JSv4 Aug 24, 2025
a2c8d7b
Further doc cleanup.
JSv4 Aug 24, 2025
f2ba343
Further docs cleanup.
JSv4 Aug 25, 2025
68 changes: 68 additions & 0 deletions .claude/settings.local.json
@@ -0,0 +1,68 @@
{
"permissions": {
"allow": [
"Bash(yarn install)",
"Bash(yarn run test:ct:*)",
"Bash(find:*)",
"Bash(mv:*)",
"Bash(yarn playwright test:*)",
"Bash(yarn lint)",
"Bash(yarn prettier:*)",
"Bash(yarn:*)",
"Bash(rm:*)",
"Bash(grep:*)",
"Bash(mkdir:*)",
"Bash(ls:*)",
"Bash(npx tsc:*)",
"Bash(timeout:*)",
"Bash(sed:*)",
"Bash(awk:*)",
"Bash(git log:*)",
"Bash(docker compose:*)",
"Bash(docker compose:*)",
"Bash(time docker compose -f test.yml run --rm django echo \"Container started successfully\")",
"Bash(docker exec:*)",
"Bash(docker volume:*)",
"Bash(docker logs:*)",
"Bash(docker wait:*)",
"Bash(echo \"Exit code: $?\")",
"Bash(npm run typecheck:*)",
"Bash(pkill:*)",
"Bash(cat:*)",
"Bash(docker:*)",
"Bash(curl:*)",
"Bash(npx vitest run:*)",
"Bash(pip show:*)",
"Bash(python:*)",
"Read(//home/jman/test-opencontracts/**)",
"Read(//home/jman/test-opencontracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)",
"Read(//home/jman/test-opencontracts/OpenContracts/**)"
],
"deny": [],
"defaultMode": "acceptEdits"
}
}
6 changes: 6 additions & 0 deletions .cursor/rules/running-frontend-tests.mdc
@@ -0,0 +1,6 @@
---
description: How to run frontend component tests
globs: frontend/tests/**
alwaysApply: false
---
Use yarn to run Playwright tests with the command `yarn run test:ct --reporter=list`. Using the list reporter prevents the HTML report from rendering and hanging your tool. You can use the `-g` flag or specify test name(s) with this command syntax - no need to run the ENTIRE suite unless you have to.
1 change: 1 addition & 0 deletions .envs/.test/.django
@@ -40,6 +40,7 @@ USE_AUTH0=false
# ------------------------------------------------------------------------------
OPENAI_API_KEY=fake
OPENAI_MODEL=gpt-4o
ANTHROPIC_API_KEY=fake

# Set Proper Settings Module
# ------------------------------------------------------------------------------
25 changes: 16 additions & 9 deletions .github/workflows/backend.yml
@@ -101,17 +101,24 @@ jobs:
- name: Build Pytest Coverage File
timeout-minutes: 100
run: |
# Run the full test suite
docker compose -f test.yml run django coverage run -m pytest --cov-report=xml --cov -x -v
# Run the full test suite with coverage
docker compose -f test.yml run django coverage run -m pytest -x -v
# Generate XML coverage report (configuration in setup.cfg handles temporary files)
docker compose -f test.yml run django coverage xml

- name: Verify Coverage File Exists
run: |
# Verify coverage.xml exists in the working directory
ls -la coverage.xml

- name: Upload Coverage Reports to Codecov
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
COMMIT_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || env.GITHUB_SHA }}
run: |
# use bash variable expression to get the substring
ci_env=`bash <(curl -s https://codecov.io/env)`
docker compose -f test.yml run $ci_env django /bin/codecov -v -t ${CODECOV_TOKEN} -R . -f coverage.xml -C ${COMMIT_SHA}
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
flags: backend
name: backend-coverage
fail_ci_if_error: false

- name: Tear down the Stack
run: docker compose -f test.yml down
8 changes: 8 additions & 0 deletions .github/workflows/frontend.yml
@@ -9,6 +9,9 @@ on:
branches:
- main
pull_request:
paths:
- 'frontend/**'
- '.github/workflows/frontend.yml'

jobs:
lint:
@@ -59,6 +62,11 @@ jobs:
- name: Run Playwright Component Tests
run: yarn run test:ct

- name: Run Metadata Component Tests
run: |
yarn run test:ct tests/*Metadata*.ct.tsx
continue-on-error: true

unit-test:
name: Unit Tests
runs-on: ubuntu-latest
45 changes: 23 additions & 22 deletions README.md
@@ -13,7 +13,9 @@

## TLDR: What Does it Do?

OpenContracts is an **GPL-3** enterprise document analytics tool. It supports multiple formats - including PDF and txt-based formats (with more on the way). It also supports multiple document ingestion pipelines with a [pluggable architecture](docs/pipelines/pipeline_overview.md) designed to make supporting new formats and ingestion engines easy - see our [Docling Integration](docs/pipelines/docling_parser.md) for an example. Writing your own custom document analytics tools where the results get displayed beautifully over the original document [is easy](docs/walkthrough/advanced/register-doc-analyzer.md). We also support mass document [data extraction](docs/extract_and_retrieval/intro_to_django_annotation_vector_store.md) with a [LlamaIndex](https://www.llamaindex.ai/) wrapper.
**Knowledge is power. Software is a tool.** OpenContracts is **FREE and OPEN SOURCE** software designed to put knowledge owners and subject matter experts in charge of their knowledge. Store it in an accessible and exportable format, and make it work with emerging agentic workflows and techniques.

OpenContracts is a **GPL-3.0** enterprise document analytics tool. It supports multiple formats - including PDF and txt-based formats (with more on the way). It also supports multiple document ingestion pipelines with a [pluggable architecture](docs/pipelines/pipeline_overview.md) designed to make supporting new formats and ingestion engines easy - see our [Docling Integration](docs/pipelines/docling_parser.md) for an example. Writing your own custom document analytics tools where the results get displayed beautifully over the original document [is easy](docs/walkthrough/advanced/register-doc-analyzer.md). We also support mass document [data extraction](docs/extract_and_retrieval/data_extraction.md) with our custom [LLM framework](docs/architecture/llms/README.md) built on PydanticAI.

### PDF-Annotation and Analysis:

@@ -39,38 +41,37 @@ OpenContracts is an **GPL-3** enterprise document analytics tool. It supports mu

OpenContracts provides several key features:

1. **Manage Documents** - Manage document collections (`Corpuses`)
2. **Layout Parser** - Automatically extracts layout features from PDFs
3. **Automatic Vector Embeddings** - generated for uploaded PDFs and extracted layout blocks
4. **Pluggable microservice analyzer architecture** - to let you analyze documents and automatically annotate them
5. **Pluggable Parsing Pipelines** - to let you support new document formats and ingestion engines.
6. **Human Annotation Interface** - to manually annotated documents, including multi-page annotations.
7. **LlamaIndex Integration** - Use our vector stores (powered by pgvector) and any manual or automatically annotated features
to let an LLM intelligently answer questions.
8. **Data Extract** - ask multiple questions across hundreds of documents using complex LLM-powered querying behavior.
Our sample implementation uses LlamaIndex + Marvin.
9. **Custom Data Extract** - Custom data extract pipelines can be used on the frontend to query documents in bulk.
1. **Document Management** - Organize documents into collections (`Corpuses`) with fine-grained permissions
2. **Custom Metadata Schemas** - Define structured metadata fields with validation for consistent data collection
3. **Layout Parser** - Automatically extracts layout features from PDFs using modern parsing pipelines
4. **Automatic Vector Embeddings** - Generated for uploaded documents and extracted layout blocks (powered by pgvector)
5. **Pluggable Analyzer Architecture** - Deploy custom microservices to analyze documents and automatically annotate them
6. **Pluggable Parsing Pipelines** - Support new document formats with modular parsers (Docling, NLM-Ingest, etc.)
7. **Human Annotation Interface** - Manually annotate documents with multi-page annotations and collaborative features
8. **Custom LLM Framework** - Built on PydanticAI with conversation management, structured responses, and real-time streaming
9. **Bulk Data Extract** - Ask multiple questions across hundreds of documents using our agent-powered querying system
10. **Custom Extract Pipelines** - Create bespoke data extraction workflows displayed directly in the frontend

## Key Docs

We recommend you [browse our docs](https://jsv4.github.io/OpenContracts/) via our Mkdocs Site. You can also view the
docs in the repo:

1. [Quickstart Guide](docs/quick-start.md) - You'll probably want to get started quickly. Setting up locally should be
1. [Quickstart Guide](docs/quick_start.md) - You'll probably want to get started quickly. Setting up locally should be
pretty painless if you're already running Docker.
2. [Basic Walkthrough](docs/walkthrough/key-concepts.md) - Check out the walkthrough to step through basic usage of the
application for document and annotation management.
2. [PDF Annotation Data Format Overview](docs/architecture/PDF-data-layer.md) - You may be interested how we map text to
3. [Metadata System](docs/metadata/metadata_overview.md) - Learn how to define custom metadata schemas for your documents
with comprehensive validation and type safety.
4. [PDF Annotation Data Format Overview](docs/architecture/PDF-data-layer.md) - You may be interested how we map text to
PDFs visually and the underlying data format we're using.
3. [Django + Pgvector Powered Hybrid Vector Database](docs/extract_and_retrieval/intro_to_django_annotation_vector_store.md)
5. [Custom LLM Framework](docs/architecture/llms/README.md) - Our PydanticAI-based framework provides
document and corpus agents with conversation management, structured responses, and real-time event streaming.
6. [Vector Store Architecture](docs/extract_and_retrieval/vector_stores.md) -
We've used the latest open source tooling for vector storage in postgres to make it almost trivially easy to
combine structured metadata and vector embeddings with an API-powered application.
4. [LlamaIndex Integration Walkthrough](docs/extract_and_retrieval/intro_to_django_annotation_vector_store.md) - We wrote a
wrapper for our backend database and vector store to make it simple to load our parsed annotations, embeddings and
text into LlamaIndex. Even better, if you have additional annotations in the document, the LLM can access those too.
5. [Write Custom Data Extractors](docs/walkthrough/advanced/write-your-own-extractors.md) - Custom data extract tasks (which
can use LlamaIndex or can be totally bespoke) are automatically loaded and displayed on the frontend to let user's
select how to ask questions and extract data from documents.
combine structured metadata and vector embeddings with our LLM agents.
7. [Write Custom Data Extractors](docs/walkthrough/advanced/write-your-own-extractors.md) - Custom data extract tasks are
automatically loaded and displayed on the frontend to let users select how to ask questions and extract data from documents.

## Architecture and Data Flows at a Glance

15 changes: 12 additions & 3 deletions compose/local/django/Dockerfile
@@ -68,6 +68,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
# Translations dependencies
gettext \
git \
wget \
poppler-utils \
tesseract-ocr \
libtesseract-dev \
@@ -105,9 +106,17 @@ RUN for script in ./model_preloaders/*.py; do \
python "$script"; \
done

# Download spacy models
RUN python -m spacy download en_core_web_sm
RUN python -m spacy download en_core_web_lg
# Download spacy models with retry logic and direct pip installation
# Using pip directly is more reliable than spacy download command in Docker
RUN pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl && \
python -c "import spacy; spacy.load('en_core_web_sm')"

# For the large model, we'll use wget with retry to ensure download completes
RUN wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 5 \
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl && \
pip install en_core_web_lg-3.8.0-py3-none-any.whl && \
rm en_core_web_lg-3.8.0-py3-none-any.whl && \
python -c "import spacy; spacy.load('en_core_web_lg')"

COPY ./compose/production/django/entrypoint /entrypoint
RUN sed -i 's/\r$//g' /entrypoint
10 changes: 10 additions & 0 deletions config/asgi.py
@@ -32,6 +32,9 @@
from config.websocket.consumers.document_conversation import ( # noqa: E402
DocumentQueryConsumer,
)
from config.websocket.consumers.standalone_document_conversation import ( # noqa: E402
StandaloneDocumentQueryConsumer,
)

logger = logging.getLogger(__name__)

@@ -53,9 +56,16 @@
CorpusQueryConsumer.as_asgi(),
)

# NEW - stand-alone document chat (no corpus_id in URL)
standalone_document_query_pattern = re_path(
r"ws/standalone/document/(?P<document_id>[-a-zA-Z0-9_=]+)/query/$",
StandaloneDocumentQueryConsumer.as_asgi(),
)

websocket_urlpatterns = [
document_query_pattern,
corpus_query_pattern,
standalone_document_query_pattern, # NEW stand-alone route
]

# Log all registered websocket patterns
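The new stand-alone route takes a document ID directly, with no corpus segment in the URL. A minimal client sketch of how a consumer might exercise it — the host/port, message shape, and document ID below are illustrative assumptions for local development, not taken from this diff:

```python
import asyncio
import json

import websockets  # third-party: pip install websockets


async def chat_standalone(document_id: str) -> None:
    # Mirrors the new URL pattern registered above: document only, no corpus_id.
    url = f"ws://localhost:8000/ws/standalone/document/{document_id}/query/"
    async with websockets.connect(url) as ws:
        # The payload shape is an assumption; adapt to the consumer's protocol.
        await ws.send(json.dumps({"query": "Summarize this document."}))
        print(await ws.recv())


asyncio.run(chat_standalone("RG9jdW1lbnQ6MQ=="))  # hypothetical global ID
```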
63 changes: 44 additions & 19 deletions config/graphql/graphene_types.py
@@ -303,7 +303,6 @@ class LabelTypeEnum(graphene.Enum):
RELATIONSHIP_LABEL = "RELATIONSHIP_LABEL"
DOC_TYPE_LABEL = "DOC_TYPE_LABEL"
TOKEN_LABEL = "TOKEN_LABEL"
METADATA_LABEL = "METADATA_LABEL"
SPAN_LABEL = "SPAN_LABEL"


@@ -355,7 +354,6 @@ class LabelSetType(AnnotatePermissionsForReadMixin, DjangoObjectType):
doc_label_count = graphene.Int(description="Count of document-level type labels")
span_label_count = graphene.Int(description="Count of span-based labels")
token_label_count = graphene.Int(description="Count of token-level labels")
metadata_label_count = graphene.Int(description="Count of metadata labels")

def resolve_doc_label_count(self, info):
return self.annotation_labels.filter(label_type="DOC_TYPE_LABEL").count()
@@ -366,9 +364,6 @@ def resolve_span_label_count(self, info):
def resolve_token_label_count(self, info):
return self.annotation_labels.filter(label_type="TOKEN_LABEL").count()

def resolve_metadata_label_count(self, info):
return self.annotation_labels.filter(label_type="METADATA_LABEL").count()

# To get ALL labels for a given labelset
all_annotation_labels = graphene.Field(graphene.List(AnnotationLabelType))

@@ -613,17 +608,35 @@ def resolve_all_annotations(
annotations = self.doc_annotations.filter(structural=True)
else:
corpus_pk = from_global_id(corpus_id)[1]
annotations = self.doc_annotations.filter(corpus_id=corpus_pk)

# Get structural annotations + corpus annotations
# We'll filter corpus annotations based on analysis_id below
if is_structural is not None:
annotations = annotations.filter(structural=is_structural)

if analysis_id is not None:
if analysis_id == "__none__":
annotations = annotations.filter(analysis__isnull=True)
annotations = self.doc_annotations.filter(
corpus_id=corpus_pk, structural=is_structural
)
else:
# Get both structural and corpus annotations
annotations = self.doc_annotations.filter(
Q(structural=True) | Q(corpus_id=corpus_pk)
)

# Filter based on analysis_id
# IMPORTANT: When analysis_id is None (not provided), we should only show
# user-created annotations (analysis__isnull=True) plus structural ones
if (
corpus_id is not None
): # Only apply analysis filtering for corpus queries
if analysis_id is None or analysis_id == "__none__":
# No analysis selected: show only user annotations (no analysis) + structural
annotations = annotations.filter(
Q(analysis__isnull=True) | Q(structural=True)
)
else:
# Specific analysis selected: show that analysis's annotations + structural
analysis_pk = from_global_id(analysis_id)[1]
annotations = annotations.filter(analysis_id=analysis_pk)
annotations = annotations.filter(
Q(analysis_id=analysis_pk) | Q(structural=True)
)

return annotations.distinct()
except Exception as e:
@@ -642,7 +655,7 @@

def resolve_all_relationships(self, info, corpus_id=None, analysis_id=None):
try:
# Want to limit to strucutural relationships or corpus relationships
# Want to limit to structural relationships or corpus relationships
if corpus_id is None:
relationships = self.relationships.filter(structural=True)
else:
Expand All @@ -651,11 +664,20 @@ def resolve_all_relationships(self, info, corpus_id=None, analysis_id=None):
Q(corpus_id=corpus_pk) | Q(structural=True)
)

if analysis_id == "__none__":
relationships = relationships.filter(analysis__isnull=True)
elif analysis_id is not None:
analysis_pk = from_global_id(analysis_id)[1]
relationships = relationships.filter(analysis_id=analysis_pk)
# Filter based on analysis_id
# IMPORTANT: When analysis_id is None (not provided), we should only show
# user-created relationships (analysis__isnull=True) plus structural ones
if analysis_id is None or analysis_id == "__none__":
# No analysis selected: show only user relationships (no analysis) + structural
relationships = relationships.filter(
Q(analysis__isnull=True) | Q(structural=True)
)
else:
# Specific analysis selected: show that analysis's relationships + structural
analysis_pk = from_global_id(analysis_id)[1]
relationships = relationships.filter(
Q(analysis_id=analysis_pk) | Q(structural=True)
)

return relationships.distinct()
except Exception as e:
@@ -985,6 +1007,9 @@ class Meta:


class ColumnType(AnnotatePermissionsForReadMixin, DjangoObjectType):
validation_config = GenericScalar()
default_value = GenericScalar()

class Meta:
model = Column
interfaces = [relay.Node]
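For context on the `ColumnType` change above: `GenericScalar` lets `validation_config` and `default_value` carry arbitrary JSON rather than a fixed scalar type. A minimal sketch of the pattern, using a hypothetical object type (the example values are not from this PR):

```python
import graphene
from graphene.types.generic import GenericScalar


class ExampleColumnType(graphene.ObjectType):
    # GenericScalar passes any JSON-compatible value through unchanged,
    # so a column can expose e.g. {"regex": "^\\d+$"} as its validation
    # config and a string, number, or object as its default value.
    validation_config = GenericScalar()
    default_value = GenericScalar()
```

This mirrors how the diff declares the fields on `ColumnType`; the rest of the type (the Django model binding and relay `Node` interface) is unchanged.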