Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: include category #2021

Merged
merged 11 commits into from
Oct 23, 2024
2 changes: 1 addition & 1 deletion .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@
"command": "azure-dev.commands.getDotEnvFilePath"
}
]
}
}
2 changes: 1 addition & 1 deletion SECURITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ We prefer all communications to be in English.

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/msrc/cvd).

<!-- END MICROSOFT SECURITY.MD BLOCK -->
<!-- END MICROSOFT SECURITY.MD BLOCK -->
3 changes: 3 additions & 0 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,12 @@ def __init__(
self.vision_token_provider = vision_token_provider

def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
include_category = overrides.get("include_category")
exclude_category = overrides.get("exclude_category")
security_filter = self.auth_helper.build_security_filters(overrides, auth_claims)
filters = []
if include_category:
filters.append("category eq '{}'".format(include_category.replace("'", "''")))
if exclude_category:
filters.append("category ne '{}'".format(exclude_category.replace("'", "''")))
if security_filter:
Expand Down
1 change: 1 addition & 0 deletions app/frontend/src/api/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export type ChatAppRequestOverrides = {
retrieval_mode?: RetrievalMode;
semantic_ranker?: boolean;
semantic_captions?: boolean;
include_category?: string;
exclude_category?: string;
seed?: number;
top?: number;
Expand Down
6 changes: 6 additions & 0 deletions app/frontend/src/locales/en/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
"minimumSearchScore": "Minimum search score",
"minimumRerankerScore": "Minimum reranker score",
"retrieveCount": "Retrieve this many search results:",
"includeCategory": "Include category",
"includeCategoryOptions": {
"all": "All"
},
"excludeCategory": "Exclude category",
"useSemanticRanker": "Use semantic ranker for retrieval",
"useSemanticCaptions": "Use semantic captions",
Expand Down Expand Up @@ -127,6 +131,8 @@
"Sets a minimum score for search results coming back from the semantic reranker. The score always ranges between 0-4. The higher the score, the more semantically relevant the result is to the question.",
"retrieveNumber":
"Sets the number of search results to retrieve from Azure AI search. More results may increase the likelihood of finding the correct answer, but may lead to the model getting 'lost in the middle'.",
"includeCategory":
"Specifies a category to include in the search results. There are no categories used in the default data set.",
"excludeCategory":
"Specifies a category to exclude from the search results. There are no categories used in the default data set.",
"useSemanticReranker":
Expand Down
6 changes: 6 additions & 0 deletions app/frontend/src/locales/es/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
"minimumSearchScore": "Puntaje mínimo de búsqueda",
"minimumRerankerScore": "Puntaje mínimo de re-clasificación",
"retrieveCount": "Obtén este número de resultados de búsqueda:",
bnodir marked this conversation as resolved.
Show resolved Hide resolved
"includeCategory": "Incluir categoría",
bnodir marked this conversation as resolved.
Show resolved Hide resolved
bnodir marked this conversation as resolved.
Show resolved Hide resolved
"includeCategoryOptions": {
"all": "Todos"
bnodir marked this conversation as resolved.
Show resolved Hide resolved
},
"excludeCategory": "Excluir categoría",
"useSemanticRanker": "Usar clasificador semántico para la recuperación",
"useSemanticCaptions": "Usar subtítulos semánticos",
Expand Down Expand Up @@ -128,6 +132,8 @@
"Establece una puntuación mínima para los resultados de búsqueda que vuelven del re-clasificador semántico. La puntuación siempre varía entre 0-4. Cuanto mayor es la puntuación, más relevante es semánticamente el resultado a la pregunta.",
"retrieveNumber":
"Establece el número de resultados de búsqueda para recuperar de Azure AI search. Más resultados pueden aumentar la probabilidad de encontrar la respuesta correcta, pero pueden provocar que el modelo se 'pierda en el medio'.",
"includeCategory":
"Especifica una categoría para incluir en los resultados de búsqueda. No se utilizan categorías en el conjunto de datos predeterminado.",
bnodir marked this conversation as resolved.
Show resolved Hide resolved
bnodir marked this conversation as resolved.
Show resolved Hide resolved
"excludeCategory":
"Especifica una categoría para excluir de los resultados de búsqueda. No se utilizan categorías en el conjunto de datos predeterminado.",
"useSemanticReranker":
Expand Down
6 changes: 6 additions & 0 deletions app/frontend/src/locales/fr/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
"minimumSearchScore": "Score de recherche minimum",
"minimumRerankerScore": "Score minimum du reclasseur sémantique",
"retrieveCount": "Récupérer ce nombre de résultats de recherche :",
"includeCategory": "Inclure la catégorie",
"includeCategoryOptions": {
"all": "Tous"
bnodir marked this conversation as resolved.
Show resolved Hide resolved
},
"excludeCategory": "Exclure la catégorie",
"useSemanticRanker": "Utiliser le reclasseur sémantique",
"useSemanticCaptions": "Utiliser les titres sémantiques",
Expand Down Expand Up @@ -128,6 +132,8 @@
"Définit un score minimum pour les résultats de recherche provenant du reranker sémantique. Le score varie toujours entre 0 et 4. Plus le score est élevé, plus le résultat est sémantiquement pertinent par rapport à la question.",
"retrieveNumber":
"Définit le nombre de résultats de recherche à récupérer d'Azure AI Search. Plus de résultats peuvent augmenter la probabilité de trouver la bonne réponse, mais peuvent amener le modèle à se 'perdre au milieu'.",
"includeCategory":
"Spécifie une catégorie à inclure dans les résultats de recherche. Il n'y a aucune catégorie utilisée dans l'ensemble de données par défaut.",
"excludeCategory":
"Spécifie une catégorie à exclure des résultats de recherche. Il n'y a aucune catégorie utilisée dans l'ensemble de données par défaut.",
"useSemanticReranker":
Expand Down
5 changes: 5 additions & 0 deletions app/frontend/src/locales/ja/translation.json
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
"minimumSearchScore": "最小検索スコア",
"minimumRerankerScore": "最小リランキング・スコア",
"retrieveCount": "ここで指定する検索結果数を取得:",
"includeCategory": "カテゴリを指定",
"includeCategoryOptions": {
"all": "全て"
},
"excludeCategory": "カテゴリを除外",
"useSemanticRanker": "取得にセマンティック・ランカーを使用",
"useSemanticCaptions": "セマンティック・キャプションを使用",
Expand Down Expand Up @@ -127,6 +131,7 @@
"セマンティック・リランカーから返される検索結果の最小スコアを設定します。スコアの値は0から4の範囲で変更できます。スコアの値が大きいほど、質問に対する結果の意味的な関連性が高まります。",
"retrieveNumber":
"Azure AI Searchの検索結果から取得する数を設定します。結果が多ければ多いほど、正しい答えを見つける可能性は高まるかもしれませんが、モデルが「途中で迷子になる」可能性もあります。",
"includeCategory": "検索結果に含めるカテゴリを指定します。デフォルトのデータセットはカテゴリを使用していません。",
"excludeCategory": "検索結果から除外するカテゴリを指定します。デフォルトのデータセットはカテゴリを使用していません。",
"useSemanticReranker":
"Azure AI Searchのセマンティック・ランカーを有効にします(ユーザーのクエリに対するセマンティック類似性に基づいて検索結果をリランク付けするモデル)。",
Expand Down
39 changes: 38 additions & 1 deletion app/frontend/src/pages/ask/Ask.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
import { useContext, useEffect, useRef, useState } from "react";
import { useTranslation } from "react-i18next";
import { Helmet } from "react-helmet-async";
import { Checkbox, Panel, DefaultButton, Spinner, TextField, ICheckboxProps, ITextFieldProps } from "@fluentui/react";
import {
Checkbox,
Panel,
DefaultButton,
Spinner,
TextField,
ICheckboxProps,
ITextFieldProps,
Dropdown,
IDropdownOption,
IDropdownProps
} from "@fluentui/react";
import { useId } from "@fluentui/react-hooks";

import styles from "./Ask.module.css";
Expand Down Expand Up @@ -38,6 +49,7 @@ export function Component(): JSX.Element {
const [useSemanticCaptions, setUseSemanticCaptions] = useState<boolean>(false);
const [useGPT4V, setUseGPT4V] = useState<boolean>(false);
const [gpt4vInput, setGPT4VInput] = useState<GPT4VInput>(GPT4VInput.TextAndImages);
const [includeCategory, setIncludeCategory] = useState<string>("");
const [excludeCategory, setExcludeCategory] = useState<string>("");
const [question, setQuestion] = useState<string>("");
const [vectorFieldList, setVectorFieldList] = useState<VectorFieldOptions[]>([VectorFieldOptions.Embedding, VectorFieldOptions.ImageEmbedding]);
Expand Down Expand Up @@ -120,6 +132,7 @@ export function Component(): JSX.Element {
prompt_template: promptTemplate.length === 0 ? undefined : promptTemplate,
prompt_template_prefix: promptTemplatePrefix.length === 0 ? undefined : promptTemplatePrefix,
prompt_template_suffix: promptTemplateSuffix.length === 0 ? undefined : promptTemplateSuffix,
include_category: includeCategory.length === 0 ? undefined : includeCategory,
exclude_category: excludeCategory.length === 0 ? undefined : excludeCategory,
top: retrieveCount,
temperature: temperature,
Expand Down Expand Up @@ -181,6 +194,10 @@ export function Component(): JSX.Element {
setUseSemanticCaptions(!!checked);
};

// Dropdown onChange handler for the "Include category" setting: stores the selected
// option's key in state, falling back to "" (the key of the "All" option) when no
// option is supplied.
const onIncludeCategoryChanged = (_ev?: React.FormEvent<HTMLElement | HTMLInputElement>, option?: IDropdownOption) => {
    // A dropdown option key may be a string or a number; convert explicitly instead of
    // asserting `as string` so the string-typed state never ends up holding a number.
    setIncludeCategory(String(option?.key ?? ""));
};

// Text field onChange handler for the "Exclude category" setting: mirrors the typed
// value into state, using "" when the field reports no value.
const onExcludeCategoryChanged = (_ev?: React.FormEvent, newValue?: string) => {
    const nextCategory = newValue ? newValue : "";
    setExcludeCategory(nextCategory);
};
Expand Down Expand Up @@ -228,6 +245,8 @@ export function Component(): JSX.Element {
const rerankerScoreFieldId = useId("rerankerScoreField");
const retrieveCountId = useId("retrieveCount");
const retrieveCountFieldId = useId("retrieveCountField");
const includeCategoryId = useId("includeCategory");
const includeCategoryFieldId = useId("includeCategoryField");
const excludeCategoryId = useId("excludeCategory");
const excludeCategoryFieldId = useId("excludeCategoryField");
const semanticRankerId = useId("semanticRanker");
Expand Down Expand Up @@ -407,6 +426,24 @@ export function Component(): JSX.Element {
)}
/>

<Dropdown
id={includeCategoryFieldId}
className={styles.chatSettingsSeparator}
label={t("labels.includeCategory")}
selectedKey={includeCategory}
onChange={onIncludeCategoryChanged}
aria-labelledby={includeCategoryId}
options={[{ key: "", text: t("labels.includeCategoryOptions.all") }]}
onRenderLabel={(props: IDropdownProps | undefined) => (
<HelpCallout
labelId={includeCategoryId}
fieldId={includeCategoryFieldId}
helpText={t("helpTexts.includeCategory")}
label={props?.label}
/>
)}
/>

<TextField
id={excludeCategoryFieldId}
className={styles.chatSettingsSeparator}
Expand Down
34 changes: 33 additions & 1 deletion app/frontend/src/pages/chat/Chat.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { useRef, useState, useEffect, useContext } from "react";
import { useTranslation } from "react-i18next";
import { Helmet } from "react-helmet-async";
import { Checkbox, Panel, DefaultButton, TextField, ITextFieldProps, ICheckboxProps } from "@fluentui/react";
import { Checkbox, Panel, DefaultButton, TextField, ITextFieldProps, ICheckboxProps, Dropdown, IDropdownOption, IDropdownProps } from "@fluentui/react";
import { SparkleFilled } from "@fluentui/react-icons";
import { useId } from "@fluentui/react-hooks";
import readNDJSONStream from "ndjson-readablestream";
Expand Down Expand Up @@ -53,6 +53,7 @@ const Chat = () => {
const [useSemanticRanker, setUseSemanticRanker] = useState<boolean>(true);
const [shouldStream, setShouldStream] = useState<boolean>(true);
const [useSemanticCaptions, setUseSemanticCaptions] = useState<boolean>(false);
const [includeCategory, setIncludeCategory] = useState<string>("");
const [excludeCategory, setExcludeCategory] = useState<string>("");
const [useSuggestFollowupQuestions, setUseSuggestFollowupQuestions] = useState<boolean>(false);
const [vectorFieldList, setVectorFieldList] = useState<VectorFieldOptions[]>([VectorFieldOptions.Embedding]);
Expand Down Expand Up @@ -184,6 +185,7 @@ const Chat = () => {
context: {
overrides: {
prompt_template: promptTemplate.length === 0 ? undefined : promptTemplate,
include_category: includeCategory.length === 0 ? undefined : includeCategory,
exclude_category: excludeCategory.length === 0 ? undefined : excludeCategory,
top: retrieveCount,
temperature: temperature,
Expand Down Expand Up @@ -291,6 +293,10 @@ const Chat = () => {
setShouldStream(!!checked);
};

// Dropdown onChange handler for the "Include category" setting: stores the selected
// option's key in state, falling back to "" (the key of the "All" option) when no
// option is supplied.
const onIncludeCategoryChanged = (_ev?: React.FormEvent<HTMLElement | HTMLInputElement>, option?: IDropdownOption) => {
    // A dropdown option key may be a string or a number; convert explicitly instead of
    // asserting `as string` so the string-typed state never ends up holding a number.
    setIncludeCategory(String(option?.key ?? ""));
};

// Text field onChange handler for the "Exclude category" setting: mirrors the typed
// value into state, using "" when the field reports no value.
const onExcludeCategoryChanged = (_ev?: React.FormEvent, newValue?: string) => {
    const nextCategory = newValue ? newValue : "";
    setExcludeCategory(nextCategory);
};
Expand Down Expand Up @@ -345,6 +351,8 @@ const Chat = () => {
const rerankerScoreFieldId = useId("rerankerScoreField");
const retrieveCountId = useId("retrieveCount");
const retrieveCountFieldId = useId("retrieveCountField");
const includeCategoryId = useId("includeCategory");
const includeCategoryFieldId = useId("includeCategoryField");
const excludeCategoryId = useId("excludeCategory");
const excludeCategoryFieldId = useId("excludeCategoryField");
const semanticRankerId = useId("semanticRanker");
Expand Down Expand Up @@ -607,6 +615,30 @@ const Chat = () => {
)}
/>

<Dropdown
bnodir marked this conversation as resolved.
Show resolved Hide resolved
id={includeCategoryFieldId}
className={styles.chatSettingsSeparator}
label={t("labels.includeCategory")}
selectedKey={includeCategory}
onChange={onIncludeCategoryChanged}
aria-labelledby={includeCategoryId}
options={[
{ key: "", text: t("labels.includeCategoryOptions.all") }
// You can add a category key here for ingested data like below:
// { key: 'categoryName', text: 'Meaningful Category Name' }
// Alternatively, display the key to guide the user on what to type
// in the "Exclude category" field (e.g., 'Meaningful Category Name(categoryName)').
]}
onRenderLabel={(props: IDropdownProps | undefined) => (
<HelpCallout
labelId={includeCategoryId}
fieldId={includeCategoryFieldId}
helpText={t("helpTexts.includeCategory")}
label={props?.label}
/>
)}
/>

<TextField
id={excludeCategoryFieldId}
className={styles.chatSettingsSeparator}
Expand Down
7 changes: 7 additions & 0 deletions docs/data_ingestion.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ This guide provides more details for using the `prepdocs` script to index docume
- [Supported document formats](#supported-document-formats)
- [Overview of the manual indexing process](#overview-of-the-manual-indexing-process)
- [Chunking](#chunking)
- [Categorizing data for enhanced search](#enhancing-search-functionality-with-data-categorization)
- [Indexing additional documents](#indexing-additional-documents)
- [Removing documents](#removing-documents)
- [Overview of Integrated Vectorization](#overview-of-integrated-vectorization)
Expand Down Expand Up @@ -41,6 +42,12 @@ The script uses the following steps to index documents:
3. Split the PDFs into chunks of text.
4. Upload the chunks to Azure AI Search. If using vectors (the default), also compute the embeddings and upload those alongside the text.

### Enhancing search functionality with data categorization

To enhance search functionality, categorize data during the ingestion process with the `--category` argument, for example `scripts/prepdocs.ps1 --category ExampleCategoryName`. This argument specifies the category to which the data belongs, enabling you to filter search results based on these categories.

After running the script with the desired category, add that category to the options of the "Include category" dropdown in the developer settings (the `options` array of the `Dropdown` in `app/frontend/src/pages/chat/Chat.tsx` and `app/frontend/src/pages/ask/Ask.tsx`). The default option for this dropdown is "All". By including specific categories, you can refine your search results more effectively.
bnodir marked this conversation as resolved.
Show resolved Hide resolved

### Chunking

We're often asked why we need to break up the PDFs into chunks when Azure AI Search supports searching large documents.
Expand Down
4 changes: 2 additions & 2 deletions infra/core/host/container-app.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ var keyvaultIdentitySecrets = [for secret in items(keyvaultIdentities): {
name: secret.key
keyVaultUrl: secret.value.keyVaultUrl
identity: secret.value.identity
}]
}]

module containerRegistryAccess '../security/registry-access.bicep' = if (usePrivateRegistry) {
name: '${deployment().name}-registry-access'
Expand All @@ -116,7 +116,7 @@ resource app 'Microsoft.App/containerApps@2023-05-02-preview' = {
tags: tags
// It is critical that the identity is granted ACR pull access before the app is created
// otherwise the container app will throw a provision error
// This also forces us to use an user assigned managed identity since there would no way to
// This also forces us to use a user-assigned managed identity since there would be no way to
// provide the system assigned identity with the ACR pull access before the app is created
dependsOn: usePrivateRegistry ? [ containerRegistryAccess ] : []
identity: {
Expand Down
2 changes: 1 addition & 1 deletion infra/core/networking/private-dns-zones.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ resource privateDnsZoneLink 'Microsoft.Network/privateDnsZones/virtualNetworkLin
}

output privateDnsZoneName string = dnsZone.name
output id string = dnsZone.id
output id string = dnsZone.id
2 changes: 1 addition & 1 deletion infra/core/networking/private-endpoint.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ resource privateEndpoint 'Microsoft.Network/privateEndpoints@2021-02-01' = {
}

output name string = privateEndpoint.name
output id string = privateEndpoint.id
output id string = privateEndpoint.id
2 changes: 1 addition & 1 deletion infra/core/networking/vnet.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ output subnetids array = [for (name, i) in subnets: {

output id string = vnet.id
output name string = vnet.name
output vnetSubnets array = vnet.properties.subnets
output vnetSubnets array = vnet.properties.subnets
2 changes: 1 addition & 1 deletion infra/core/search/search-services.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,4 @@ resource search 'Microsoft.Search/searchServices@2023-11-01' = {
output id string = search.id
output endpoint string = 'https://${name}.search.windows.net/'
output name string = search.name
output principalId string = !empty(searchIdentityProvider) ? search.identity.principalId : ''
output principalId string = !empty(searchIdentityProvider) ? search.identity.principalId : ''
4 changes: 2 additions & 2 deletions infra/private-endpoints.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ resource monitorPrivateLinkScope 'microsoft.insights/privateLinkScopes@2021-07-0
}
}

// Private endpoint
// Private endpoint
module monitorPrivateEndpoint './core/networking/private-endpoint.bicep' = {
name: 'monitor-privatendpoint'
params: {
Expand Down Expand Up @@ -158,4 +158,4 @@ module monitorPrivateEndpoint './core/networking/private-endpoint.bicep' = {
]
}
dependsOn: [ monitorDnsZones, dnsZones ]
}
}
Loading