Open-Source-Legal · JSv4 · Oct 23, 2025 · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/config/graphql/mutations.py b/config/graphql/mutations.py
@@ -2405,6 +2405,8 @@ class Arguments:
         preferred_embedder = graphene.String(required=False)
         slug = graphene.String(required=False)
         is_public = graphene.Boolean(required=False)
+        corpus_agent_instructions = graphene.String(required=False)
+        document_agent_instructions = graphene.String(required=False)
 
 
 class UpdateMe(graphene.Mutation):

diff --git a/config/graphql/serializers.py b/config/graphql/serializers.py
@@ -45,6 +45,8 @@ class Meta:
             "creator",
             "creator_id",
             "preferred_embedder",
+            "corpus_agent_instructions",
+            "document_agent_instructions",
         ]
         read_only_fields = ["id"]
 

diff --git a/config/settings/base.py b/config/settings/base.py
@@ -831,6 +831,130 @@
 
 LLMS_DEFAULT_AGENT_FRAMEWORK = "pydantic_ai"
 
+# Default Agent Instructions
+# ------------------------------------------------------------------------------
+DEFAULT_DOCUMENT_AGENT_INSTRUCTIONS = """━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+⚠️  ABSOLUTE REQUIREMENTS - NO EXCEPTIONS:
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+1. You have ZERO prior knowledge of this document's contents.
+2. You MUST use tools to examine the document before answering ANY question.
+3. NEVER say you don't know what document is being discussed.
+4. NEVER refuse to answer because you 'lack context' - USE THE TOOLS to get context.
+5. Every answer MUST be grounded in information retrieved via tools with specific citations.
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+📋 RECOMMENDED SEARCH STRATEGY:
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+For most questions, follow this workflow:
+
+STEP 1 - GET OVERVIEW:
+  • Use `load_document_summary` to understand the document's structure and main topics
+  • Use `get_document_text_length` to check the document size
+  • This helps you plan your detailed search strategy
+
+STEP 2 - BROAD SEARCH (Semantic Understanding):
+  • Use `similarity_search` (vector search) to find semantically relevant sections
+  • Great for: conceptual questions, themes, related ideas, paraphrased content
+  • Returns: annotated passages with page numbers and similarity scores
+
+STEP 3 - DETAILED EXAMINATION:
+  • Use `load_document_text` to read large sections (5K-50K chars) of relevant areas
+  • Identify the specific character ranges from Step 1-2, then load those sections
+  • Read enough context to thoroughly understand the relevant passages
+
+  🔴 MANDATORY CITATION STEP - DO NOT SKIP:
+  After reading ANY bulk text with `load_document_text`, you MUST:
+  1. Identify the 3-5 most relevant exact quotes/passages for your answer
+  2. Extract the EXACT text of each key passage (5-50 words each)
+  3. Call `search_exact_text` with these exact strings to create proper citations
+  4. This converts raw text into citable sources with page numbers
+
+  WHY THIS MATTERS: `load_document_text` returns raw text WITHOUT creating sources.
+  Only `search_exact_text` creates proper citations. Without this step, your answer
+  will have NO SOURCES even though you read the document!
+
+STEP 4 - PRECISE LOCATION (Exact Matching):
+  • Use `search_exact_text` to find specific terms, phrases, or quoted language
+  • Great for: finding exact wording, specific terminology, quoted passages, defined terms
+  • Returns: all occurrences with page numbers and bounding boxes (PDFs)
+  • Use this to provide precise citations with exact page locations
+  • CRITICAL: Always use this AFTER bulk text loading to create proper source citations
+
+STEP 5 - CROSS-REFERENCE:
+  • Use `get_document_notes` to check for existing analysis or annotations
+  • Combine findings from multiple tools to ensure completeness
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+🔧 TOOL SELECTION GUIDE:
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+Use `similarity_search` when:
+  → Question asks about concepts, themes, or ideas (not exact words)
+  → You need to find related content even if worded differently
+  → Looking for passages that discuss a topic
+
+Use `search_exact_text` when:
+  → User asks about specific terms, phrases, or exact wording
+  → You need to verify if specific language appears in the document
+  → Providing citations that require exact page locations
+  → Finding defined terms or quoted material
+
+Use `load_document_text` when:
+  → You need to read substantial sections for full context
+  → Initial searches identified relevant areas to examine in detail
+  → Question requires understanding flow, structure, or relationships
+  ⚠️  ALWAYS follow with `search_exact_text` on key passages to create citations!
+
+Use `load_document_summary` when:
+  → Starting your analysis (always good first step)
+  → Need high-level overview of document structure
+  → Understanding document organization before detailed search
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+✅ RESPONSE REQUIREMENTS:
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+• Provide complete, accurate answers based on document contents
+• Include specific citations (page numbers, quotes) from tool results
+• 🔴 CRITICAL: If you used `load_document_text`, you MUST use `search_exact_text`
+  on key passages to generate proper citations. Otherwise your answer will have NO SOURCES.
+• If information isn't in the document, explicitly state it was not found
+• Use multiple search strategies to ensure thoroughness
+• Present findings clearly with proper attribution to sources"""
+
+DEFAULT_CORPUS_AGENT_INSTRUCTIONS = """You are a helpful corpus analysis assistant.
+Your role is to help users understand and analyze collections of documents by coordinating across
+multiple documents and using the tools available to you.
+
+**CRITICAL RULES:**
+1. ALWAYS use tools to gather information before answering
+2. You have access to multiple documents - use them effectively
+3. ALWAYS cite sources from specific documents when making claims
+
+**Available Tools:**
+- **Document-Specific Tools**: Available via `ask_document(document_id, question)`
+- **Corpus-Level Tools**: `list_documents()` to see all available documents
+- **Cross-Document Search**: Semantic search across the entire corpus
+
+**Recommended Strategy:**
+1. If the corpus has a description, use it as context
+2. If the corpus description is empty BUT has documents:
+   - Start by using `list_documents()` to see what's available
+   - Use `ask_document()` to query specific documents
+   - Use cross-document vector search for themes across documents
+3. Synthesize information from multiple sources
+4. Always cite which document(s) your information comes from
+
+**When Corpus Has No Description:**
+Don't just say "the corpus description is empty" - that's not helpful! Instead:
+1. List available documents
+2. Ask the user which documents they want to know about
+3. OR proactively examine key documents to provide a useful summary
+
+Always prioritize being helpful and use your tools to provide value."""
+
 # LLM Client Provider Settings
 # ------------------------------------------------------------------------------
 LLM_CLIENT_PROVIDER = env.str("LLM_CLIENT_PROVIDER", default="openai")

diff --git a/config/websocket/consumers/standalone_document_conversation.py b/config/websocket/consumers/standalone_document_conversation.py
@@ -169,14 +169,17 @@ async def pick_document_embedder(self) -> str:
         """
         from opencontractserver.annotations.models import Embedding
 
+        # Extract document ID in async context before passing to sync function
+        document_id = self.document.id
+
         def get_embedder_paths():
             """
             Construct AND evaluate queryset in same DB connection to avoid
             transaction isolation issues with database_sync_to_async.
             """
             return list(
                 Embedding.objects.filter(
-                    annotation__document=self.document,
+                    annotation__document_id=document_id,
                     annotation__structural=True,
                 )
                 .values_list("embedder_path", flat=True)

diff --git a/frontend/src/components/corpuses/CorpusAgentSettings.tsx b/frontend/src/components/corpuses/CorpusAgentSettings.tsx
@@ -0,0 +1,188 @@
+import React, { useState } from "react";
+import { Form, Button, Message, TextArea, Header } from "semantic-ui-react";
+import { useMutation } from "@apollo/client";
+import { toast } from "react-toastify";
+import styled from "styled-components";
+import {
+  UPDATE_CORPUS,
+  UpdateCorpusInputs,
+  UpdateCorpusOutputs,
+} from "../../graphql/mutations";
+
+interface CorpusAgentSettingsProps {
+  corpusId: string; // sent as `id` in the UPDATE_CORPUS mutation variables
+  corpusAgentInstructions?: string | null; // seeds the corpus field; falsy -> ""
+  documentAgentInstructions?: string | null; // seeds the document field; falsy -> ""
+  canUpdate: boolean; // false -> a no-permission notice is rendered, not the form
+}
+
+const Container = styled.div`
+  padding: 1.5rem;
+  background: white;
+  border-radius: 8px;
+  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+`;
+
+const Section = styled.div`
+  margin-bottom: 2rem;
+
+  &:last-child {
+    margin-bottom: 0;
+  }
+`;
+
+const SectionHeader = styled.div`
+  margin-bottom: 1rem;
+`;
+
+const HelperText = styled.p`
+  color: #64748b;
+  font-size: 0.875rem;
+  margin: 0.5rem 0;
+  line-height: 1.5;
+`;
+
+const ButtonGroup = styled.div`
+  display: flex;
+  gap: 0.75rem;
+  margin-top: 1rem;
+`;
+
+export const CorpusAgentSettings: React.FC<CorpusAgentSettingsProps> = ({
+  corpusId,
+  corpusAgentInstructions,
+  documentAgentInstructions,
+  canUpdate,
+}) => {
+  const [corpusInstructions, setCorpusInstructions] = useState(
+    corpusAgentInstructions || ""
+  );
+  const [documentInstructions, setDocumentInstructions] = useState(
+    documentAgentInstructions || ""
+  );
+  const [hasChanges, setHasChanges] = useState(false);
+
+  const [updateCorpus, { loading }] = useMutation<
+    UpdateCorpusOutputs,
+    UpdateCorpusInputs
+  >(UPDATE_CORPUS, {
+    onCompleted: (data) => {
+      if (data.updateCorpus.ok) {
+        toast.success("Agent instructions updated successfully");
+        setHasChanges(false);
+      } else {
+        toast.error(
+          `Failed to update: ${data.updateCorpus.message || "Unknown error"}`
+        );
+      }
+    },
+    onError: (error) => {
+      toast.error(`Error: ${error.message}`);
+    },
+  });
+
+  // Record the edit; the form is dirty if either field differs from its prop.
+  const handleCorpusInstructionsChange = (value: string) => {
+    const corpusDirty = value !== (corpusAgentInstructions || "");
+    const docDirty = documentInstructions !== (documentAgentInstructions || "");
+    setCorpusInstructions(value);
+    setHasChanges(corpusDirty || docDirty);
+  };
+
+  // Record the edit; the form is dirty if either field differs from its prop.
+  const handleDocumentInstructionsChange = (value: string) => {
+    const docDirty = value !== (documentAgentInstructions || "");
+    const corpusDirty = corpusInstructions !== (corpusAgentInstructions || "");
+    setDocumentInstructions(value);
+    setHasChanges(docDirty || corpusDirty);
+  };
+
+  // Send "" as-is: `|| undefined` dropped cleared fields, so blanking never saved.
+  const handleSave = () =>
+    updateCorpus({
+      variables: {
+        id: corpusId,
+        corpusAgentInstructions: corpusInstructions,
+        documentAgentInstructions: documentInstructions,
+      },
+    });
+
+  const handleReset = () => {
+    setCorpusInstructions(corpusAgentInstructions || "");
+    setDocumentInstructions(documentAgentInstructions || "");
+    setHasChanges(false);
+  };
+
+  if (!canUpdate) {
+    return (
+      <Container>
+        <Message info>
+          You do not have permission to update agent instructions for this
+          corpus.
+        </Message>
+      </Container>
+    );
+  }
+
+  return (
+    <Container>
+      <Header as="h3" style={{ marginTop: 0 }}>
+        Agent Instructions
+      </Header>
+      <HelperText>
+        Customize how AI agents behave when analyzing this corpus and its
+        documents. Leave blank to use system defaults.
+      </HelperText>
+
+      <Form>
+        <Section>
+          <SectionHeader>
+            <Header as="h4">Corpus Agent Instructions</Header>
+            <HelperText>
+              Controls how the corpus-level agent behaves when answering
+              questions about the collection of documents. Default instructions
+              tell the agent to examine available documents when the corpus
+              description is empty.
+            </HelperText>
+          </SectionHeader>
+          <TextArea
+            placeholder="Leave blank to use default instructions..."
+            value={corpusInstructions}
+            onChange={(e) => handleCorpusInstructionsChange(e.target.value)}
+            rows={8}
+            style={{ fontFamily: "monospace", fontSize: "0.9rem" }}
+          />
+        </Section>
+
+        <Section>
+          <SectionHeader>
+            <Header as="h4">Document Agent Instructions</Header>
+            <HelperText>
+              Controls how document-level agents behave when analyzing
+              individual documents in this corpus. Default instructions
+              emphasize using tools and citing sources with page numbers.
+            </HelperText>
+          </SectionHeader>
+          <TextArea
+            placeholder="Leave blank to use default instructions..."
+            value={documentInstructions}
+            onChange={(e) => handleDocumentInstructionsChange(e.target.value)}
+            rows={8}
+            style={{ fontFamily: "monospace", fontSize: "0.9rem" }}
+          />
+        </Section>
+
+        {hasChanges && (
+          <ButtonGroup>
+            <Button primary onClick={handleSave} loading={loading}>
+              Save Changes
+            </Button>
+            <Button onClick={handleReset} disabled={loading}>
+              Reset
+            </Button>
+          </ButtonGroup>
+        )}
+      </Form>
+    </Container>
+  );
+};
diff --git a/frontend/src/components/corpuses/CorpusSettings.tsx b/frontend/src/components/corpuses/CorpusSettings.tsx
@@ -25,6 +25,7 @@ import {
 } from "../../graphql/mutations";
 import { CreateCorpusActionModal } from "./CreateCorpusActionModal";
 import { CorpusMetadataSettings } from "./CorpusMetadataSettings";
+import { CorpusAgentSettings } from "./CorpusAgentSettings";
 import {
   UPDATE_CORPUS,
   UpdateCorpusInputs,
@@ -769,6 +770,22 @@ export const CorpusSettings: React.FC<CorpusSettingsProps> = ({ corpus }) => {
           </MetadataContent>
         </InfoSection>
 
+        <InfoSection>
+          <SectionHeader>
+            <SectionTitle>Agent Instructions</SectionTitle>
+          </SectionHeader>
+          <MetadataContent>
+            <CorpusAgentSettings
+              corpusId={corpus.id}
+              corpusAgentInstructions={(corpus as any).corpusAgentInstructions}
+              documentAgentInstructions={
+                (corpus as any).documentAgentInstructions
+              }
+              canUpdate={canUpdate}
+            />
+          </MetadataContent>
+        </InfoSection>
+
         <CreateCorpusActionModal
           corpusId={corpus.id}
           open={isModalOpen}