Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/graphql/mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2405,6 +2405,8 @@ class Arguments:
preferred_embedder = graphene.String(required=False)
slug = graphene.String(required=False)
is_public = graphene.Boolean(required=False)
corpus_agent_instructions = graphene.String(required=False)
document_agent_instructions = graphene.String(required=False)


class UpdateMe(graphene.Mutation):
Expand Down
2 changes: 2 additions & 0 deletions config/graphql/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class Meta:
"creator",
"creator_id",
"preferred_embedder",
"corpus_agent_instructions",
"document_agent_instructions",
]
read_only_fields = ["id"]

Expand Down
124 changes: 124 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,6 +831,130 @@

LLMS_DEFAULT_AGENT_FRAMEWORK = "pydantic_ai"

# Default Agent Instructions
# ------------------------------------------------------------------------------
# Prompt text for a document-level agent. It tells the agent to always use its
# tools before answering, lays out a five-step search workflow
# (`load_document_summary` -> `similarity_search` -> `load_document_text` ->
# `search_exact_text` -> `get_document_notes`), and stresses that only
# `search_exact_text` produces citable sources.
# NOTE(review): presumably used as the fallback when a corpus has no custom
# `document_agent_instructions` -- confirm against the code that builds the agent.
# The string is consumed verbatim at runtime; do not re-indent or re-wrap it.
DEFAULT_DOCUMENT_AGENT_INSTRUCTIONS = """━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
⚠️ ABSOLUTE REQUIREMENTS - NO EXCEPTIONS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. You have ZERO prior knowledge of this document's contents.
2. You MUST use tools to examine the document before answering ANY question.
3. NEVER say you don't know what document is being discussed.
4. NEVER refuse to answer because you 'lack context' - USE THE TOOLS to get context.
5. Every answer MUST be grounded in information retrieved via tools with specific citations.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
📋 RECOMMENDED SEARCH STRATEGY:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

For most questions, follow this workflow:

STEP 1 - GET OVERVIEW:
• Use `load_document_summary` to understand the document's structure and main topics
• Use `get_document_text_length` to check the document size
• This helps you plan your detailed search strategy

STEP 2 - BROAD SEARCH (Semantic Understanding):
• Use `similarity_search` (vector search) to find semantically relevant sections
• Great for: conceptual questions, themes, related ideas, paraphrased content
• Returns: annotated passages with page numbers and similarity scores

STEP 3 - DETAILED EXAMINATION:
• Use `load_document_text` to read large sections (5K-50K chars) of relevant areas
• Identify the specific character ranges from Step 1-2, then load those sections
• Read enough context to thoroughly understand the relevant passages

🔴 MANDATORY CITATION STEP - DO NOT SKIP:
After reading ANY bulk text with `load_document_text`, you MUST:
1. Identify the 3-5 most relevant exact quotes/passages for your answer
2. Extract the EXACT text of each key passage (5-50 words each)
3. Call `search_exact_text` with these exact strings to create proper citations
4. This converts raw text into citable sources with page numbers

WHY THIS MATTERS: `load_document_text` returns raw text WITHOUT creating sources.
Only `search_exact_text` creates proper citations. Without this step, your answer
will have NO SOURCES even though you read the document!

STEP 4 - PRECISE LOCATION (Exact Matching):
• Use `search_exact_text` to find specific terms, phrases, or quoted language
• Great for: finding exact wording, specific terminology, quoted passages, defined terms
• Returns: all occurrences with page numbers and bounding boxes (PDFs)
• Use this to provide precise citations with exact page locations
• CRITICAL: Always use this AFTER bulk text loading to create proper source citations

STEP 5 - CROSS-REFERENCE:
• Use `get_document_notes` to check for existing analysis or annotations
• Combine findings from multiple tools to ensure completeness

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔧 TOOL SELECTION GUIDE:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Use `similarity_search` when:
→ Question asks about concepts, themes, or ideas (not exact words)
→ You need to find related content even if worded differently
→ Looking for passages that discuss a topic

Use `search_exact_text` when:
→ User asks about specific terms, phrases, or exact wording
→ You need to verify if specific language appears in the document
→ Providing citations that require exact page locations
→ Finding defined terms or quoted material

Use `load_document_text` when:
→ You need to read substantial sections for full context
→ Initial searches identified relevant areas to examine in detail
→ Question requires understanding flow, structure, or relationships
⚠️ ALWAYS follow with `search_exact_text` on key passages to create citations!

Use `load_document_summary` when:
→ Starting your analysis (always good first step)
→ Need high-level overview of document structure
→ Understanding document organization before detailed search

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ RESPONSE REQUIREMENTS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

• Provide complete, accurate answers based on document contents
• Include specific citations (page numbers, quotes) from tool results
• 🔴 CRITICAL: If you used `load_document_text`, you MUST use `search_exact_text`
on key passages to generate proper citations. Otherwise your answer will have NO SOURCES.
• If information isn't in the document, explicitly state it was not found
• Use multiple search strategies to ensure thoroughness
• Present findings clearly with proper attribution to sources"""

# Prompt text for a corpus-level agent. It instructs the agent to gather
# information with tools (`list_documents()`, `ask_document(document_id,
# question)`, cross-document semantic search), to cite the documents it draws
# on, and to inspect the documents themselves when the corpus description is
# empty instead of merely reporting that it is empty.
# NOTE(review): presumably used as the fallback when a corpus has no custom
# `corpus_agent_instructions` -- confirm against the code that builds the agent.
# The string is consumed verbatim at runtime; do not re-indent or re-wrap it.
DEFAULT_CORPUS_AGENT_INSTRUCTIONS = """You are a helpful corpus analysis assistant.
Your role is to help users understand and analyze collections of documents by coordinating across
multiple documents and using the tools available to you.

**CRITICAL RULES:**
1. ALWAYS use tools to gather information before answering
2. You have access to multiple documents - use them effectively
3. ALWAYS cite sources from specific documents when making claims

**Available Tools:**
- **Document-Specific Tools**: Available via `ask_document(document_id, question)`
- **Corpus-Level Tools**: `list_documents()` to see all available documents
- **Cross-Document Search**: Semantic search across the entire corpus

**Recommended Strategy:**
1. If the corpus has a description, use it as context
2. If the corpus description is empty BUT has documents:
- Start by using `list_documents()` to see what's available
- Use `ask_document()` to query specific documents
- Use cross-document vector search for themes across documents
3. Synthesize information from multiple sources
4. Always cite which document(s) your information comes from

**When Corpus Has No Description:**
Don't just say "the corpus description is empty" - that's not helpful! Instead:
1. List available documents
2. Ask the user which documents they want to know about
3. OR proactively examine key documents to provide a useful summary

Always prioritize being helpful and use your tools to provide value."""

# LLM Client Provider Settings
# ------------------------------------------------------------------------------
LLM_CLIENT_PROVIDER = env.str("LLM_CLIENT_PROVIDER", default="openai")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,17 @@ async def pick_document_embedder(self) -> str:
"""
from opencontractserver.annotations.models import Embedding

# Extract document ID in async context before passing to sync function
document_id = self.document.id

def get_embedder_paths():
"""
Construct AND evaluate queryset in same DB connection to avoid
transaction isolation issues with database_sync_to_async.
"""
return list(
Embedding.objects.filter(
annotation__document=self.document,
annotation__document_id=document_id,
annotation__structural=True,
)
.values_list("embedder_path", flat=True)
Expand Down
188 changes: 188 additions & 0 deletions frontend/src/components/corpuses/CorpusAgentSettings.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import React, { useState } from "react";
import { Form, Button, Message, TextArea, Header } from "semantic-ui-react";
import { useMutation } from "@apollo/client";
import { toast } from "react-toastify";
import styled from "styled-components";
import {
UPDATE_CORPUS,
UpdateCorpusInputs,
UpdateCorpusOutputs,
} from "../../graphql/mutations";

/** Props accepted by the `CorpusAgentSettings` panel. */
interface CorpusAgentSettingsProps {
/** Corpus identifier; sent as the `id` variable of the `UPDATE_CORPUS` mutation. */
corpusId: string;
/** Initial text for the corpus-level instructions field; null/undefined is shown as empty. */
corpusAgentInstructions?: string | null;
/** Initial text for the document-level instructions field; null/undefined is shown as empty. */
documentAgentInstructions?: string | null;
/** When false, the panel renders a permission notice instead of the editable form. */
canUpdate: boolean;
}

// Outer card for the panel: white background, padding, rounded corners and a light shadow.
const Container = styled.div`
padding: 1.5rem;
background: white;
border-radius: 8px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
`;

// Vertical grouping for one instructions field; the last section drops its bottom margin.
const Section = styled.div`
margin-bottom: 2rem;

&:last-child {
margin-bottom: 0;
}
`;

// Wrapper for a section's heading and helper text, spaced 1rem above the field.
const SectionHeader = styled.div`
margin-bottom: 1rem;
`;

// Small, muted explanatory paragraph.
const HelperText = styled.p`
color: #64748b;
font-size: 0.875rem;
margin: 0.5rem 0;
line-height: 1.5;
`;

// Horizontal flex row holding the action buttons with a fixed gap between them.
const ButtonGroup = styled.div`
display: flex;
gap: 0.75rem;
margin-top: 1rem;
`;

/**
 * Settings panel for editing a corpus's custom agent instructions.
 *
 * Renders two text areas (corpus-level and document-level instructions) and
 * persists edits through the `UPDATE_CORPUS` mutation. Users without update
 * permission see an informational notice instead of the form.
 *
 * Drafts and the "last saved" baseline are seeded from props on mount only.
 * The baseline is advanced locally after every successful save, so that
 * dirty-checking and Reset remain correct even when the parent does not
 * re-render with refreshed values.
 *
 * @param corpusId - ID sent as `id` to the update mutation.
 * @param corpusAgentInstructions - Current corpus-level instructions, if any.
 * @param documentAgentInstructions - Current document-level instructions, if any.
 * @param canUpdate - When false, only the permission notice is rendered.
 */
export const CorpusAgentSettings: React.FC<CorpusAgentSettingsProps> = ({
  corpusId,
  corpusAgentInstructions,
  documentAgentInstructions,
  canUpdate,
}) => {
  // Last values known to be persisted (initially whatever the parent passed in).
  const [savedCorpusInstructions, setSavedCorpusInstructions] = useState(
    corpusAgentInstructions || ""
  );
  const [savedDocumentInstructions, setSavedDocumentInstructions] = useState(
    documentAgentInstructions || ""
  );

  // Editable drafts bound to the text areas.
  const [corpusInstructions, setCorpusInstructions] = useState(
    corpusAgentInstructions || ""
  );
  const [documentInstructions, setDocumentInstructions] = useState(
    documentAgentInstructions || ""
  );

  // Snapshot of what the in-flight mutation submitted; promoted to the saved
  // baseline on success. A ref is used so that edits made while the request
  // is pending are not mistaken for saved content.
  const pendingRef = React.useRef<{ corpus: string; document: string } | null>(
    null
  );

  // Derived rather than stored, so it can never drift from the drafts.
  const corpusDirty = corpusInstructions !== savedCorpusInstructions;
  const documentDirty = documentInstructions !== savedDocumentInstructions;
  const hasChanges = corpusDirty || documentDirty;

  const [updateCorpus, { loading }] = useMutation<
    UpdateCorpusOutputs,
    UpdateCorpusInputs
  >(UPDATE_CORPUS, {
    onCompleted: (data) => {
      if (data.updateCorpus.ok) {
        toast.success("Agent instructions updated successfully");
        const submitted = pendingRef.current;
        if (submitted) {
          setSavedCorpusInstructions(submitted.corpus);
          setSavedDocumentInstructions(submitted.document);
        }
      } else {
        toast.error(
          `Failed to update: ${data.updateCorpus.message || "Unknown error"}`
        );
      }
      pendingRef.current = null;
    },
    onError: (error) => {
      pendingRef.current = null;
      toast.error(`Error: ${error.message}`);
    },
  });

  const handleSave = () => {
    pendingRef.current = {
      corpus: corpusInstructions,
      document: documentInstructions,
    };
    updateCorpus({
      variables: {
        id: corpusId,
        // Send only the fields that changed, and send them even when empty:
        // coercing "" to undefined would drop the field from the request, so
        // a user could never clear custom instructions to fall back to the
        // defaults.
        // NOTE(review): assumes the backend treats an empty string as "use
        // system defaults" -- confirm against the agent construction code.
        ...(corpusDirty ? { corpusAgentInstructions: corpusInstructions } : {}),
        ...(documentDirty
          ? { documentAgentInstructions: documentInstructions }
          : {}),
      },
    });
  };

  // Discard unsaved edits by restoring the last saved values.
  const handleReset = () => {
    setCorpusInstructions(savedCorpusInstructions);
    setDocumentInstructions(savedDocumentInstructions);
  };

  // Checked after all hooks so the hook call order is stable across renders.
  if (!canUpdate) {
    return (
      <Container>
        <Message info>
          You do not have permission to update agent instructions for this
          corpus.
        </Message>
      </Container>
    );
  }

  return (
    <Container>
      <Header as="h3" style={{ marginTop: 0 }}>
        Agent Instructions
      </Header>
      <HelperText>
        Customize how AI agents behave when analyzing this corpus and its
        documents. Leave blank to use system defaults.
      </HelperText>

      <Form>
        <Section>
          <SectionHeader>
            <Header as="h4">Corpus Agent Instructions</Header>
            <HelperText>
              Controls how the corpus-level agent behaves when answering
              questions about the collection of documents. Default instructions
              tell the agent to examine available documents when the corpus
              description is empty.
            </HelperText>
          </SectionHeader>
          <TextArea
            placeholder="Leave blank to use default instructions..."
            value={corpusInstructions}
            onChange={(e) => setCorpusInstructions(e.target.value)}
            rows={8}
            style={{ fontFamily: "monospace", fontSize: "0.9rem" }}
          />
        </Section>

        <Section>
          <SectionHeader>
            <Header as="h4">Document Agent Instructions</Header>
            <HelperText>
              Controls how document-level agents behave when analyzing
              individual documents in this corpus. Default instructions
              emphasize using tools and citing sources with page numbers.
            </HelperText>
          </SectionHeader>
          <TextArea
            placeholder="Leave blank to use default instructions..."
            value={documentInstructions}
            onChange={(e) => setDocumentInstructions(e.target.value)}
            rows={8}
            style={{ fontFamily: "monospace", fontSize: "0.9rem" }}
          />
        </Section>

        {hasChanges && (
          <ButtonGroup>
            {/* type="button" keeps these from acting as form submit buttons;
                disabling Save while loading prevents duplicate requests. */}
            <Button
              type="button"
              primary
              onClick={handleSave}
              loading={loading}
              disabled={loading}
            >
              Save Changes
            </Button>
            <Button type="button" onClick={handleReset} disabled={loading}>
              Reset
            </Button>
          </ButtonGroup>
        )}
      </Form>
    </Container>
  );
};
17 changes: 17 additions & 0 deletions frontend/src/components/corpuses/CorpusSettings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
} from "../../graphql/mutations";
import { CreateCorpusActionModal } from "./CreateCorpusActionModal";
import { CorpusMetadataSettings } from "./CorpusMetadataSettings";
import { CorpusAgentSettings } from "./CorpusAgentSettings";
import {
UPDATE_CORPUS,
UpdateCorpusInputs,
Expand Down Expand Up @@ -769,6 +770,22 @@ export const CorpusSettings: React.FC<CorpusSettingsProps> = ({ corpus }) => {
</MetadataContent>
</InfoSection>

<InfoSection>
<SectionHeader>
<SectionTitle>Agent Instructions</SectionTitle>
</SectionHeader>
<MetadataContent>
<CorpusAgentSettings
corpusId={corpus.id}
corpusAgentInstructions={(corpus as any).corpusAgentInstructions}
documentAgentInstructions={
(corpus as any).documentAgentInstructions
}
canUpdate={canUpdate}
/>
</MetadataContent>
</InfoSection>

<CreateCorpusActionModal
corpusId={corpus.id}
open={isModalOpen}
Expand Down
Loading
Loading