diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a3a063356..4f94dcdce 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,26 +58,7 @@ Thank you for your interest in contributing to UpTrain! We look forward to revie
# Documentation
-There are two ways to contribute to our documentation. You can either do it directly from this website or you can clone the repository and run the website locally.
-
-We recommend that you use the first method if you want to make small changes to the documentation. However, if you want to make a bigger change, you can clone the repository and run the website locally.
-
-## Directly from this documentation website
-
-You can create a pull request or an issue for any page, directly from this website. This is the easiest way to contribute to our documentation.
-
-There are two icons on the top right corner of each page. The left one is for opening a pull request and the right one is for opening an issue.
-
-
-
diff --git a/docs/predefined-evaluations/safeguarding/jailbreak.mdx b/docs/predefined-evaluations/safeguarding/jailbreak.mdx
index eb0e5986d..77e7b2402 100644
--- a/docs/predefined-evaluations/safeguarding/jailbreak.mdx
+++ b/docs/predefined-evaluations/safeguarding/jailbreak.mdx
@@ -67,8 +67,8 @@ eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY)
model_purpose = 'To help users get informative answers to health and medical related queries only'
res = eval_llm.evaluate(
- data = model_data,
- checks = [JailbreakDetection(model_purpose= model_purpose)]
+ data = data,
+ checks = [JailbreakDetection(model_purpose=model_purpose)]
)
```
@@ -108,4 +108,4 @@ We evaluate jailbreak attempts by instructing the evaluating LLM to behave as a
>
Join our community for any questions or requests
-
\ No newline at end of file
+
diff --git a/docs/tutorials/analyzing-failure-cases.mdx b/docs/tutorials/analyzing-failure-cases.mdx
index c5f5d8d93..d2ed22cea 100644
--- a/docs/tutorials/analyzing-failure-cases.mdx
+++ b/docs/tutorials/analyzing-failure-cases.mdx
@@ -18,10 +18,12 @@ Columns required:
### How to use it?
```python
-from uptrain import APIClient, RcaTemplate
+from uptrain import RcaTemplate, EvalLLM
import json
-uptrain_client = APIClient(uptrain_api_key=UPTRAIN_API_KEY) # Insert your UpTrain key here
+OPENAI_API_KEY = "sk-*******" # Insert your OpenAI key here
+
+eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY)
data = [{
'question': 'Which team won the 2023 ICC Cricket World Cup?',
@@ -30,8 +32,7 @@ data = [{
'response': 'The 2023 ICC Cricket World Cup was won by Qatar.'
}]
-res = uptrain_client.perform_root_cause_analysis(
- 'Sample-RCA', # Enter a project name
+res = eval_llm.perform_root_cause_analysis(
data = data,
rca_template = RcaTemplate.RAG_WITH_CITATION
)
@@ -44,24 +45,24 @@ Sample Response:
```json
[
{
- "question": "Which team won the 2023 ICC Cricket World Cup?",
- "context": "Argentina won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.",
- "cited_context": "The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022",
- "response": "The 2023 ICC Cricket World Cup was won by Qatar.",
- "error_mode": "Poor Retrieval",
- "error_resolution_suggestion": "Context Retrieval Pipeline needs improvement",
- "score_question_completeness": 1,
- "score_valid_response": 1.0,
- "explanation_valid_response": "Step by step reasoning:\n\n1. The question asks for the team that won the 2023 ICC Cricket World Cup.\n2. The response states \"The 2023 ICC Cricket World Cup was won by Qatar.\"\n\nConclusion:\nThe given response does contain some information.\n\n[Choice]: A",
- "score_context_relevance": 0.0,
- "explanation_context_relevance": " \"The extracted context is about the 2022 FIFA World Cup and does not contain any information about the 2023 ICC Cricket World Cup. Therefore, it cannot answer the user query about the winner of the 2023 ICC Cricket World Cup.\"\n",
- "score_factual_accuracy": 0.5,
- "explanation_factual_accuracy": "The 2023 ICC Cricket World Cup was won by Qatar.\nReasoning for yes: The context does not mention anything about the winner of the 2023 ICC Cricket World Cup, so it cannot be determined if Qatar won or not.\nReasoning for no: The context only provides information about the FIFA World Cup, not the ICC Cricket World Cup.\nJudgement: unclear.",
- "score_cited_context_relevance": 0.0,
- "explanation_cited_context_relevance": " \"The extracted context is about the 2022 FIFA World Cup in Qatar, which took place from 20 November to 18 December 2022. There is no mention of the 2023 ICC Cricket World Cup, so the extracted context doesn't contain any information to answer the given user query about the winner of the 2023 ICC Cricket World Cup.\"\n",
- "score_factual_accuracy_wrt_cited": 0.5,
- "explanation_factual_accuracy_wrt_cited": "The 2023 ICC Cricket World Cup was won by Qatar.\nReasoning for yes: The context explicitly states that the 2022 FIFA World Cup took place in Qatar, but it does not mention anything about the 2023 ICC Cricket World Cup.\nReasoning for no: The context does not provide any information about the winner of the 2023 ICC Cricket World Cup.\nJudgement: unclear. The context does not support or contradict the fact, and the fact cannot be logically derived from the context."
-}
+ "question": "Which team won the 2023 ICC Cricket World Cup?",
+ "context": "Argentina won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.",
+ "cited_context": "The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022",
+ "response": "The 2023 ICC Cricket World Cup was won by Qatar.",
+ "error_mode": "Poor Retrieval",
+ "error_resolution_suggestion": "Context Retrieval Pipeline needs improvement",
+ "score_question_completeness": 1,
+ "score_valid_response": 1.0,
+ "explanation_valid_response": "{\n \"Reasoning\": \"The response 'The 2023 ICC Cricket World Cup was won by Qatar' provides the name of a team. Therefore, the response does contain information relevant to the question.\",\n \"Choice\": \"A\"\n}",
+ "score_context_relevance": 0.0,
+ "explanation_context_relevance": "{\n \"Reasoning\": \"The given context does not contain any information about the 2023 ICC Cricket World Cup or the team that won it. The context only provides information about the 2022 FIFA World Cup and its location. Therefore, the extracted context doesn't contain any information to answer the given query.\",\n \"Choice\": \"C\"\n}",
+ "score_factual_accuracy": 0.0,
+ "explanation_factual_accuracy": "[\n {\n \"Fact\": \"1. The 2023 ICC Cricket World Cup was won by Qatar.\",\n \"Reasoning\": \"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\",\n \"Judgement\": \"no\"\n }\n]",
+ "score_cited_context_relevance": 0.0,
+ "explanation_cited_context_relevance": "{\n \"Reasoning\": \"The given context does not contain any information about the 2023 ICC Cricket World Cup or the winner of the tournament. It only provides information about the 2022 FIFA World Cup. Therefore, the extracted context doesn't contain any information to answer the given query.\",\n \"Choice\": \"C\"\n}",
+ "score_factual_accuracy_wrt_cited": 0.0,
+ "explanation_factual_accuracy_wrt_cited": "[\n {\n \"Fact\": \"1. The 2023 ICC Cricket World Cup was won by Qatar.\",\n \"Reasoning\": \"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\",\n \"Judgement\": \"no\"\n }\n]"
+ }
]
```
diff --git a/examples/benchmarks/claude_3_vs_gpt_4.ipynb b/examples/benchmarks/claude_3_vs_gpt_4.ipynb
new file mode 100644
index 000000000..5ab70f736
--- /dev/null
+++ b/examples/benchmarks/claude_3_vs_gpt_4.ipynb
@@ -0,0 +1,977 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Claude 3 vs GPT-4\n",
+ "Claude 3 was recently launched by Anthropic as a competitor to OpenAI's GPT-4. In this notebook, we will compare the two models to see if you should make the switch from GPT-4 to Claude 3.\n",
+ "\n",
+ "To do this comparison, we will use UpTrain's Response Matching operator. This operator takes in two values - response and ground_truth - and returns a score between 0 and 1. The score is 1 if the response is very similar the ground_truth and 0 if the response is completely different from the ground_truth.\n",
+ "\n",
+ "We have curated a dataset of 25 questions and context pairs. For each question, we will get responses from both GPT-4 and Claude 3 Opus. We will take the response from GPT-4 as the ground_truth and compare the response from Claude 3 Opus to the ground_truth using the Response Matching operator. We will then do the same with GPT-3.5-Turbo and Claude 3 Sonnet, respectively."
+ ]
+ },
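+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick preview, the evaluation call we will build up to looks roughly like this (a minimal sketch mirroring the call used later in this notebook; the `dataset` variable and the schema column names are illustrative placeholders):\n",
+    "\n",
+    "```python\n",
+    "from uptrain import EvalLLM, Settings, ResponseMatching\n",
+    "\n",
+    "eval_llm = EvalLLM(settings=Settings(evaluate_locally=False))\n",
+    "results = eval_llm.evaluate(\n",
+    "    data=dataset,  # rows with question, response and ground_truth columns\n",
+    "    checks=[ResponseMatching(method='llm')],\n",
+    "    schema={'question': 'question', 'response': 'response', 'ground_truth': 'ground_truth'},\n",
+    ")\n",
+    "```"
+   ]
+  },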
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import the required libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/dhruvchawla/Work/uptrain-v1/.venv/lib/python3.11/site-packages/lazy_loader/__init__.py:185: RuntimeWarning: subpackages can technically be lazily loaded, but it causes the package to be eagerly loaded even if it is already lazily loaded.So, you probably shouldn't use subpackages with this lazy feature.\n",
+ " warnings.warn(msg, RuntimeWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from uptrain import Settings\n",
+ "from uptrain.operators import TextCompletion, JsonReader\n",
+ "\n",
+ "import os\n",
+ "import polars as pl\n",
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Download the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "shape: (25, 3)\n",
+ "┌───────────────────────────────────┬───────────────────────────────────┬─────┐\n",
+ "│ question ┆ context ┆ idx │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i64 │\n",
+ "╞═══════════════════════════════════╪═══════════════════════════════════╪═════╡\n",
+ "│ How to get a grip on finance?' ┆ Try downloading a finance app li… ┆ 1 │\n",
+ "│ How do “held” amounts appear on … ┆ \"The \"\"hold\"\" is just placeholde… ┆ 2 │\n",
+ "│ Does negative P/E ratio mean sto… ┆ P/E is the number of years it wo… ┆ 3 │\n",
+ "│ Should a retail trader choose a … ┆ \"That\\'s like a car dealer adver… ┆ 4 │\n",
+ "│ Possibility to buy index funds a… ┆ \"As user quid states in his answ… ┆ 5 │\n",
+ "│ … ┆ … ┆ … │\n",
+ "│ Discuss the role of inflation in… ┆ Inflation is a pervasive economi… ┆ 21 │\n",
+ "│ Explain the concept of plate tec… ┆ ┆ 22 │\n",
+ "│ ┆ ┆ │\n",
+ "│ ┆ The Earth's dynamic and ever-c… ┆ │\n",
+ "│ How did the surrealist movement … ┆ ┆ 23 │\n",
+ "│ ┆ ┆ │\n",
+ "│ ┆ ┆ │\n",
+ "│ ┆ The Surrealist movement, whic… ┆ │\n",
+ "│ Discuss the impact of globalizat… ┆ ┆ 24 │\n",
+ "│ ┆ ┆ │\n",
+ "│ ┆ Globalization, characterized b… ┆ │\n",
+ "│ What are the key differences bet… ┆ ┆ 25 │\n",
+ "│ ┆ In the realm of infectious dise… ┆ │\n",
+ "└───────────────────────────────────┴───────────────────────────────────┴─────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "url = \"https://uptrain-assets.s3.ap-south-1.amazonaws.com/data/uptrain_benchmark.jsonl\"\n",
+ "dataset_path = os.path.join('./', \"uptrain_benchmark.jsonl\")\n",
+ "\n",
+ "if not os.path.exists(dataset_path):\n",
+ " import httpx\n",
+ " r = httpx.get(url)\n",
+ " with open(dataset_path, \"wb\") as f:\n",
+ " f.write(r.content) \n",
+ "\n",
+ "dataset = pl.read_ndjson(dataset_path)\n",
+ "print(dataset)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Experiment 1: Claude 3 Opus vs GPT-4\n",
+ "Now that we have the dataset, we can start the experiment. We will start by comparing Claude 3 Opus to GPT-4."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get responses from Claude 3 Opus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/25 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 25/25 [05:33<00:00, 13.32s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (25, 5)question | context | idx | model | claude_3_opus_response |
---|
str | str | i64 | str | str |
"How to get a g… | "Try downloadin… | 1 | "claude-3-opus-… | "Getting a grip… |
"How do “held” … | ""The ""hold"" … | 2 | "claude-3-opus-… | "When a credit … |
"Does negative … | "P/E is the num… | 3 | "claude-3-opus-… | "A negative P/E… |
"Should a retai… | ""That\\'s like … | 4 | "claude-3-opus-… | "The decision t… |
"Possibility to… | ""As user quid … | 5 | "claude-3-opus-… | "Yes, it is pos… |
… | … | … | … | … |
"Discuss the ro… | "Inflation is a… | 21 | "claude-3-opus-… | "Inflation is a… |
"Explain the co… | "\n",
+ "\n",
+ "The Earth's … | 22 | "claude-3-opus-… | "Plate tectonic… |
"How did the su… | "\n",
+ "\n",
+ "\n",
+ "The Surreal… | 23 | "claude-3-opus-… | "The Surrealist… |
"Discuss the im… | "\n",
+ "\n",
+ "Globalizatio… | 24 | "claude-3-opus-… | "Globalization … |
"What are the k… | "\n",
+ "In the realm … | 25 | "claude-3-opus-… | "Viral and bact… |
"
+ ],
+ "text/plain": [
+ "shape: (25, 5)\n",
+ "┌───────────────────────┬──────────────────────┬─────┬──────────────────────┬──────────────────────┐\n",
+ "│ question ┆ context ┆ idx ┆ model ┆ claude_3_opus_respon │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ se │\n",
+ "│ str ┆ str ┆ i64 ┆ str ┆ --- │\n",
+ "│ ┆ ┆ ┆ ┆ str │\n",
+ "╞═══════════════════════╪══════════════════════╪═════╪══════════════════════╪══════════════════════╡\n",
+ "│ How to get a grip on ┆ Try downloading a ┆ 1 ┆ claude-3-opus-202402 ┆ Getting a grip on │\n",
+ "│ finance?' ┆ finance app li… ┆ ┆ 29 ┆ your finances … │\n",
+ "│ How do “held” amounts ┆ \"The \"\"hold\"\" is ┆ 2 ┆ claude-3-opus-202402 ┆ When a credit card │\n",
+ "│ appear on … ┆ just placeholde… ┆ ┆ 29 ┆ transaction i… │\n",
+ "│ Does negative P/E ┆ P/E is the number of ┆ 3 ┆ claude-3-opus-202402 ┆ A negative P/E ratio │\n",
+ "│ ratio mean sto… ┆ years it wo… ┆ ┆ 29 ┆ does not ne… │\n",
+ "│ Should a retail ┆ \"That\\'s like a car ┆ 4 ┆ claude-3-opus-202402 ┆ The decision to │\n",
+ "│ trader choose a … ┆ dealer adver… ┆ ┆ 29 ┆ choose a broker … │\n",
+ "│ Possibility to buy ┆ \"As user quid states ┆ 5 ┆ claude-3-opus-202402 ┆ Yes, it is possible │\n",
+ "│ index funds a… ┆ in his answ… ┆ ┆ 29 ┆ to buy both … │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ Discuss the role of ┆ Inflation is a ┆ 21 ┆ claude-3-opus-202402 ┆ Inflation is a │\n",
+ "│ inflation in… ┆ pervasive economi… ┆ ┆ 29 ┆ sustained increas… │\n",
+ "│ Explain the concept ┆ ┆ 22 ┆ claude-3-opus-202402 ┆ Plate tectonics is a │\n",
+ "│ of plate tec… ┆ ┆ ┆ 29 ┆ scientific … │\n",
+ "│ ┆ The Earth's dynamic ┆ ┆ ┆ │\n",
+ "│ ┆ and ever-c… ┆ ┆ ┆ │\n",
+ "│ How did the ┆ ┆ 23 ┆ claude-3-opus-202402 ┆ The Surrealist │\n",
+ "│ surrealist movement … ┆ ┆ ┆ 29 ┆ movement, which b… │\n",
+ "│ ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ The Surrealist ┆ ┆ ┆ │\n",
+ "│ ┆ movement, whic… ┆ ┆ ┆ │\n",
+ "│ Discuss the impact of ┆ ┆ 24 ┆ claude-3-opus-202402 ┆ Globalization has │\n",
+ "│ globalizat… ┆ ┆ ┆ 29 ┆ had a profound… │\n",
+ "│ ┆ Globalization, ┆ ┆ ┆ │\n",
+ "│ ┆ characterized b… ┆ ┆ ┆ │\n",
+ "│ What are the key ┆ ┆ 25 ┆ claude-3-opus-202402 ┆ Viral and bacterial │\n",
+ "│ differences bet… ┆ In the realm of ┆ ┆ 29 ┆ infections a… │\n",
+ "│ ┆ infectious dise… ┆ ┆ ┆ │\n",
+ "└───────────────────────┴──────────────────────┴─────┴──────────────────────┴──────────────────────┘"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset_path=\"./uptrain_benchmark.jsonl\"\n",
+ "claude_settings = Settings(model=\"claude-3-opus-20240229\", rpm_limit=4)\n",
+ "dataset = JsonReader(fpath=dataset_path).setup(settings=claude_settings).run()[\"output\"]\n",
+ "\n",
+ "dataset = dataset.with_columns([pl.lit(\"claude-3-opus-20240229\").alias(\"model\")])\n",
+ "dataset_with_claude_responses = TextCompletion(col_in_prompt=\"question\", col_out_completion=\"claude_3_opus_response\").setup(settings=claude_settings).run(dataset)[\"output\"]\n",
+ "dataset_with_claude_responses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get Responses from GPT-4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/25 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 25/25 [00:32<00:00, 1.32s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (25, 6)question | context | idx | model | claude_3_opus_response | gpt_4_response |
---|
str | str | i64 | str | str | str |
"How to get a g… | "Try downloadin… | 1 | "gpt-4" | "Getting a grip… | "1. Take online… |
"How do “held” … | ""The ""hold"" … | 2 | "gpt-4" | "When a credit … | ""Held" amounts… |
"Does negative … | "P/E is the num… | 3 | "gpt-4" | "A negative P/E… | "A negative P/E… |
"Should a retai… | ""That\\'s like … | 4 | "gpt-4" | "The decision t… | "Whether a reta… |
"Possibility to… | ""As user quid … | 5 | "gpt-4" | "Yes, it is pos… | "Yes, it is pos… |
… | … | … | … | … | … |
"Discuss the ro… | "Inflation is a… | 21 | "gpt-4" | "Inflation is a… | "Inflation is a… |
"Explain the co… | "\n",
+ "\n",
+ "The Earth's … | 22 | "gpt-4" | "Plate tectonic… | "Plate tectonic… |
"How did the su… | "\n",
+ "\n",
+ "\n",
+ "The Surreal… | 23 | "gpt-4" | "The Surrealist… | "Surrealism tre… |
"Discuss the im… | "\n",
+ "\n",
+ "Globalizatio… | 24 | "gpt-4" | "Globalization … | "Globalization … |
"What are the k… | "\n",
+ "In the realm … | 25 | "gpt-4" | "Viral and bact… | "Viral and bact… |
"
+ ],
+ "text/plain": [
+ "shape: (25, 6)\n",
+ "┌─────────────────────┬────────────────────┬─────┬───────┬────────────────────┬────────────────────┐\n",
+ "│ question ┆ context ┆ idx ┆ model ┆ claude_3_opus_resp ┆ gpt_4_response │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ onse ┆ --- │\n",
+ "│ str ┆ str ┆ i64 ┆ str ┆ --- ┆ str │\n",
+ "│ ┆ ┆ ┆ ┆ str ┆ │\n",
+ "╞═════════════════════╪════════════════════╪═════╪═══════╪════════════════════╪════════════════════╡\n",
+ "│ How to get a grip ┆ Try downloading a ┆ 1 ┆ gpt-4 ┆ Getting a grip on ┆ 1. Take online │\n",
+ "│ on finance?' ┆ finance app li… ┆ ┆ ┆ your finances … ┆ courses and Works… │\n",
+ "│ How do “held” ┆ \"The \"\"hold\"\" is ┆ 2 ┆ gpt-4 ┆ When a credit card ┆ \"Held\" amounts, │\n",
+ "│ amounts appear on … ┆ just placeholde… ┆ ┆ ┆ transaction i… ┆ also known as \"p… │\n",
+ "│ Does negative P/E ┆ P/E is the number ┆ 3 ┆ gpt-4 ┆ A negative P/E ┆ A negative P/E │\n",
+ "│ ratio mean sto… ┆ of years it wo… ┆ ┆ ┆ ratio does not ne… ┆ ratio doesn't nec… │\n",
+ "│ Should a retail ┆ \"That\\'s like a ┆ 4 ┆ gpt-4 ┆ The decision to ┆ Whether a retail │\n",
+ "│ trader choose a … ┆ car dealer adver… ┆ ┆ ┆ choose a broker … ┆ trader should c… │\n",
+ "│ Possibility to buy ┆ \"As user quid ┆ 5 ┆ gpt-4 ┆ Yes, it is ┆ Yes, it is │\n",
+ "│ index funds a… ┆ states in his ┆ ┆ ┆ possible to buy ┆ possible for │\n",
+ "│ ┆ answ… ┆ ┆ ┆ both … ┆ Canadian… │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ Discuss the role of ┆ Inflation is a ┆ 21 ┆ gpt-4 ┆ Inflation is a ┆ Inflation is a │\n",
+ "│ inflation in… ┆ pervasive economi… ┆ ┆ ┆ sustained increas… ┆ vital element in … │\n",
+ "│ Explain the concept ┆ ┆ 22 ┆ gpt-4 ┆ Plate tectonics is ┆ Plate tectonics is │\n",
+ "│ of plate tec… ┆ ┆ ┆ ┆ a scientific … ┆ a theory expl… │\n",
+ "│ ┆ The Earth's ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ dynamic and ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ ever-c… ┆ ┆ ┆ ┆ │\n",
+ "│ How did the ┆ ┆ 23 ┆ gpt-4 ┆ The Surrealist ┆ Surrealism │\n",
+ "│ surrealist movement ┆ ┆ ┆ ┆ movement, which b… ┆ tremendously │\n",
+ "│ … ┆ ┆ ┆ ┆ ┆ impacted… │\n",
+ "│ ┆ The Surrealist ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ movement, whic… ┆ ┆ ┆ ┆ │\n",
+ "│ Discuss the impact ┆ ┆ 24 ┆ gpt-4 ┆ Globalization has ┆ Globalization has │\n",
+ "│ of globalizat… ┆ ┆ ┆ ┆ had a profound… ┆ significant im… │\n",
+ "│ ┆ Globalization, ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ characterized b… ┆ ┆ ┆ ┆ │\n",
+ "│ What are the key ┆ ┆ 25 ┆ gpt-4 ┆ Viral and ┆ Viral and │\n",
+ "│ differences bet… ┆ In the realm of ┆ ┆ ┆ bacterial ┆ bacterial │\n",
+ "│ ┆ infectious dise… ┆ ┆ ┆ infections a… ┆ infections a… │\n",
+ "└─────────────────────┴────────────────────┴─────┴───────┴────────────────────┴────────────────────┘"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gpt_settings = Settings(model=\"gpt-4\", rpm_limit=100)\n",
+ "dataset = dataset_with_claude_responses.with_columns([pl.lit(\"gpt-4\").alias(\"model\")])\n",
+ "experiment_dataset = TextCompletion(col_in_prompt=\"question\", col_out_completion=\"gpt_4_response\").setup(settings=gpt_settings).run(dataset)[\"output\"]\n",
+ "experiment_dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use the Response Matching operator to get the scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[32m2024-03-07 10:44:05.173\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate_on_server\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[32m2024-03-07 10:44:30.283\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m330\u001b[0m - \u001b[1mServer is not running!\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "from uptrain import EvalLLM, ResponseMatching\n",
+ "\n",
+ "settings = Settings(evaluate_locally=False)\n",
+ "\n",
+ "# Drop the \"context\" and \"model\" columns as they are not needed for local evaluation\n",
+ "experiment_dataset = experiment_dataset.drop([\"context\", \"model\"])\n",
+ "\n",
+ "eval_llm = EvalLLM(settings=settings)\n",
+ "results = eval_llm.evaluate(\n",
+ " data=experiment_dataset,\n",
+ " checks=[\n",
+ " ResponseMatching(\n",
+ " method=\"llm\",\n",
+ " )\n",
+ " ],\n",
+ " schema={\n",
+ " \"question\": \"question\",\n",
+ " \"response\": \"claude_3_opus_response\",\n",
+ " \"ground_truth\": \"gpt_4_response\",\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's analyze the results. First, we will take the mean of the scores to get an overall idea of how well Claude 3 Opus performs compared to GPT-4."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9274770685464783"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "avg_score = pl.DataFrame(results)[\"score_response_match\"].mean()\n",
+ "avg_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is a good score and shows that Claude 3 Opus is a good competitor to GPT-4. However, we need to look at the individual scores to see if Claude 3 Opus is better than GPT-4 in some cases."
+ ]
+ },
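+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One way to do this (a minimal sketch, reusing the `results` list and the polars import from above) is to sort the per-question scores and inspect the lowest-scoring rows first:\n",
+    "\n",
+    "```python\n",
+    "# Questions where Claude 3 Opus diverges most from GPT-4 appear at the top\n",
+    "scores_df = pl.DataFrame(results).select(['question', 'score_response_match'])\n",
+    "scores_df.sort('score_response_match').head(5)\n",
+    "```"
+   ]
+  },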
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take an example and see the scores for each model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Question: How to get a grip on finance?'\n"
+ ]
+ }
+ ],
+ "source": [
+ "row = results[0]\n",
+ "print(\"Question:\", row[\"question\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GPT-4 Response:\n",
+ "\n",
+ "\n",
+ "1. Take online courses and Workshops: Several online platforms such as Coursera, Udemy, and Khan Academy offer introductory courses on finance. \n",
+ "\n",
+ "2. Read Books: Reading books can give you a profound understanding of finance. Some popular books include: \"The Intelligent Investor\" by Benjamin Graham, \"Common Stocks and Uncommon Profits\" by Philip Fisher and \"Thinking, Fast and Slow\" by Daniel Kahneman.\n",
+ "\n",
+ "3. Attend Seminars: Attending seminars and workshops can provide first-hand knowledge as well as an opportunity to interact with industry professionals.\n",
+ "\n",
+ "4. Networking: Joining a local finance or investment club can provide opportunities for learning from others' experiences.\n",
+ "\n",
+ "5. Use Finance Apps: Personal finance apps such as Mint and PocketGuard can help keep track of individual income and spending and create budgets.\n",
+ "\n",
+ "6. Follow Finance Blogs and Websites: Websites such as Investopedia can provide helpful articles, glossaries, and tutorials on various finance topics.\n",
+ "\n",
+ "7. Watch finance-related documentaries and shows: They give a practical understanding of how financial markets work.\n",
+ "\n",
+ "8. Obtain relevant certifications: Various certifications such as CFA (Chartered Financial Analyst) or CFP (Certified Financial Planner) can provide a structured learning pathway and boost career opportunities.\n",
+ "\n",
+ "9. Set personal financial goals and work towards achieving them: Managing personal finances effectively is an integral part of understanding finance.\n",
+ "\n",
+ "10. Consult financial advisors: Lastly, for any advanced financial planning or investment strategies, it may be helpful to consult with a certified financial planner or advisor. \n",
+ "\n",
+ "Remember, learning finance is progressive - the more you explore, the more you understand.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"GPT-4 Response:\\n\\n\")\n",
+ "print(row[\"gpt_4_response\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "GPT-4 gave us a nice and detailed anwer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Claude 3 Opus Response:\n",
+ "\n",
+ "\n",
+ "Getting a grip on your finances involves several steps:\n",
+ "\n",
+ "1. Track your income and expenses: Use a budgeting app or spreadsheet to monitor your cash flow. This will help you understand where your money is going and identify areas where you can cut back.\n",
+ "\n",
+ "2. Create a budget: Based on your income and expenses, create a realistic budget that allocates your money towards essential expenses, savings, and discretionary spending.\n",
+ "\n",
+ "3. Set financial goals: Establish short-term and long-term financial goals, such as paying off debt, saving for emergencies, or planning for retirement.\n",
+ "\n",
+ "4. Pay off high-interest debt: Prioritize paying off high-interest debt, like credit card balances, to reduce the amount of interest you pay over time.\n",
+ "\n",
+ "5. Build an emergency fund: Aim to save enough money to cover 3-6 months' worth of expenses in case of unexpected events like job loss or medical emergencies.\n",
+ "\n",
+ "6. Save and invest regularly: Automate your savings and investments to build wealth over time. Take advantage of employer-sponsored retirement plans, like 401(k)s, and consider opening an Individual Retirement Account (IRA).\n",
+ "\n",
+ "7. Educate yourself:\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Claude 3 Opus Response:\\n\\n\")\n",
+ "print(row[\"claude_3_opus_response\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "So did Claude 3 Opus. If we compare the two responses, we can see that Claude 3 Opus has given a response that is very similar to the response from GPT-4. Let's see the scores."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Response Matching Score: 0.9729729729729729\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Response Matching Score:\", row[\"score_response_match\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The score is ~0.97. This aligns with our observation that the responses are very similar. We can conclude that Claude 3 Opus is a good alternative to GPT-4. Now let's do the same to compare Claude 3 Sonnet and GPT-3.5-Turbo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Experiment 2: Claude 3 Sonnet vs GPT-3.5-Turbo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get responses from Claude 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/25 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 25/25 [05:21<00:00, 12.86s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (25, 5)question | context | idx | model | claude_3_sonnet_response |
---|
str | str | i64 | str | str |
"How to get a g… | "Try downloadin… | 1 | "claude-3-sonne… | "Here are some … |
"How do “held” … | ""The ""hold"" … | 2 | "claude-3-sonne… | "On traditional… |
"Does negative … | "P/E is the num… | 3 | "claude-3-sonne… | "A negative pri… |
"Should a retai… | ""That\\'s like … | 4 | "claude-3-sonne… | "The decision t… |
"Possibility to… | ""As user quid … | 5 | "claude-3-sonne… | "In Canada, you… |
… | … | … | … | … |
"Discuss the ro… | "Inflation is a… | 21 | "claude-3-sonne… | "Inflation play… |
"Explain the co… | "\n",
+ "\n",
+ "The Earth's … | 22 | "claude-3-sonne… | "Plate tectonic… |
"How did the su… | "\n",
+ "\n",
+ "\n",
+ "The Surreal… | 23 | "claude-3-sonne… | "The surrealist… |
"Discuss the im… | "\n",
+ "\n",
+ "Globalizatio… | 24 | "claude-3-sonne… | "Globalization … |
"What are the k… | "\n",
+ "In the realm … | 25 | "claude-3-sonne… | "Viral and bact… |
"
+ ],
+ "text/plain": [
+ "shape: (25, 5)\n",
+ "┌───────────────────────┬──────────────────────┬─────┬──────────────────────┬──────────────────────┐\n",
+ "│ question ┆ context ┆ idx ┆ model ┆ claude_3_sonnet_resp │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ onse │\n",
+ "│ str ┆ str ┆ i64 ┆ str ┆ --- │\n",
+ "│ ┆ ┆ ┆ ┆ str │\n",
+ "╞═══════════════════════╪══════════════════════╪═════╪══════════════════════╪══════════════════════╡\n",
+ "│ How to get a grip on ┆ Try downloading a ┆ 1 ┆ claude-3-sonnet-2024 ┆ Here are some tips │\n",
+ "│ finance?' ┆ finance app li… ┆ ┆ 0229 ┆ to help get a… │\n",
+ "│ How do “held” amounts ┆ \"The \"\"hold\"\" is ┆ 2 ┆ claude-3-sonnet-2024 ┆ On traditional │\n",
+ "│ appear on … ┆ just placeholde… ┆ ┆ 0229 ┆ credit card state… │\n",
+ "│ Does negative P/E ┆ P/E is the number of ┆ 3 ┆ claude-3-sonnet-2024 ┆ A negative │\n",
+ "│ ratio mean sto… ┆ years it wo… ┆ ┆ 0229 ┆ price-to-earnings │\n",
+ "│ ┆ ┆ ┆ ┆ (P/… │\n",
+ "│ Should a retail ┆ \"That\\'s like a car ┆ 4 ┆ claude-3-sonnet-2024 ┆ The decision to │\n",
+ "│ trader choose a … ┆ dealer adver… ┆ ┆ 0229 ┆ choose a broker … │\n",
+ "│ Possibility to buy ┆ \"As user quid states ┆ 5 ┆ claude-3-sonnet-2024 ┆ In Canada, you can │\n",
+ "│ index funds a… ┆ in his answ… ┆ ┆ 0229 ┆ buy index fun… │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ Discuss the role of ┆ Inflation is a ┆ 21 ┆ claude-3-sonnet-2024 ┆ Inflation plays a │\n",
+ "│ inflation in… ┆ pervasive economi… ┆ ┆ 0229 ┆ significant ro… │\n",
+ "│ Explain the concept ┆ ┆ 22 ┆ claude-3-sonnet-2024 ┆ Plate tectonics is a │\n",
+ "│ of plate tec… ┆ ┆ ┆ 0229 ┆ scientific … │\n",
+ "│ ┆ The Earth's dynamic ┆ ┆ ┆ │\n",
+ "│ ┆ and ever-c… ┆ ┆ ┆ │\n",
+ "│ How did the ┆ ┆ 23 ┆ claude-3-sonnet-2024 ┆ The surrealist │\n",
+ "│ surrealist movement … ┆ ┆ ┆ 0229 ┆ movement had a pr… │\n",
+ "│ ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ The Surrealist ┆ ┆ ┆ │\n",
+ "│ ┆ movement, whic… ┆ ┆ ┆ │\n",
+ "│ Discuss the impact of ┆ ┆ 24 ┆ claude-3-sonnet-2024 ┆ Globalization has │\n",
+ "│ globalizat… ┆ ┆ ┆ 0229 ┆ had a signific… │\n",
+ "│ ┆ Globalization, ┆ ┆ ┆ │\n",
+ "│ ┆ characterized b… ┆ ┆ ┆ │\n",
+ "│ What are the key ┆ ┆ 25 ┆ claude-3-sonnet-2024 ┆ Viral and bacterial │\n",
+ "│ differences bet… ┆ In the realm of ┆ ┆ 0229 ┆ infections a… │\n",
+ "│ ┆ infectious dise… ┆ ┆ ┆ │\n",
+ "└───────────────────────┴──────────────────────┴─────┴──────────────────────┴──────────────────────┘"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset_path=\"./uptrain_benchmark.jsonl\"\n",
+ "claude_settings = Settings(model=\"claude-3-sonnet-20240229\", rpm_limit=4)\n",
+ "dataset = JsonReader(fpath=dataset_path).setup(settings=claude_settings).run()[\"output\"]\n",
+ "\n",
+ "dataset = dataset.with_columns([pl.lit(\"claude-3-sonnet-20240229\").alias(\"model\")])\n",
+ "dataset_with_claude_responses = TextCompletion(col_in_prompt=\"question\", col_out_completion=\"claude_3_sonnet_response\").setup(settings=claude_settings).run(dataset)[\"output\"]\n",
+ "dataset_with_claude_responses"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get Responses from GPT-4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/25 [00:00, ?it/s]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 25/25 [00:06<00:00, 3.76it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (25, 6)question | context | idx | model | claude_3_sonnet_response | gpt_35_turbo_response |
---|
str | str | i64 | str | str | str |
"How to get a g… | "Try downloadin… | 1 | "gpt-3.5-turbo" | "Here are some … | "1. Set financi… |
"How do “held” … | ""The ""hold"" … | 2 | "gpt-3.5-turbo" | "On traditional… | ""Held" amounts… |
"Does negative … | "P/E is the num… | 3 | "gpt-3.5-turbo" | "A negative pri… | "A negative P/E… |
"Should a retai… | ""That\\'s like … | 4 | "gpt-3.5-turbo" | "The decision t… | "It ultimately … |
"Possibility to… | ""As user quid … | 5 | "gpt-3.5-turbo" | "In Canada, you… | "In Canada, it … |
… | … | … | … | … | … |
"Discuss the ro… | "Inflation is a… | 21 | "gpt-3.5-turbo" | "Inflation play… | "Inflation is t… |
"Explain the co… | "\n",
+ "\n",
+ "The Earth's … | 22 | "gpt-3.5-turbo" | "Plate tectonic… | "Plate tectonic… |
"How did the su… | "\n",
+ "\n",
+ "\n",
+ "The Surreal… | 23 | "gpt-3.5-turbo" | "The surrealist… | "The surrealist… |
"Discuss the im… | "\n",
+ "\n",
+ "Globalizatio… | 24 | "gpt-3.5-turbo" | "Globalization … | "Globalization … |
"What are the k… | "\n",
+ "In the realm … | 25 | "gpt-3.5-turbo" | "Viral and bact… | "One of the key… |
"
+ ],
+ "text/plain": [
+ "shape: (25, 6)\n",
+ "┌───────────────────┬──────────────────┬─────┬───────────────┬──────────────────┬──────────────────┐\n",
+ "│ question ┆ context ┆ idx ┆ model ┆ claude_3_sonnet_ ┆ gpt_35_turbo_res │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ response ┆ ponse │\n",
+ "│ str ┆ str ┆ i64 ┆ str ┆ --- ┆ --- │\n",
+ "│ ┆ ┆ ┆ ┆ str ┆ str │\n",
+ "╞═══════════════════╪══════════════════╪═════╪═══════════════╪══════════════════╪══════════════════╡\n",
+ "│ How to get a grip ┆ Try downloading ┆ 1 ┆ gpt-3.5-turbo ┆ Here are some ┆ 1. Set financial │\n",
+ "│ on finance?' ┆ a finance app ┆ ┆ ┆ tips to help get ┆ goals: Write do… │\n",
+ "│ ┆ li… ┆ ┆ ┆ a… ┆ │\n",
+ "│ How do “held” ┆ \"The \"\"hold\"\" is ┆ 2 ┆ gpt-3.5-turbo ┆ On traditional ┆ \"Held\" amounts │\n",
+ "│ amounts appear on ┆ just placeholde… ┆ ┆ ┆ credit card ┆ typically appear │\n",
+ "│ … ┆ ┆ ┆ ┆ state… ┆ … │\n",
+ "│ Does negative P/E ┆ P/E is the ┆ 3 ┆ gpt-3.5-turbo ┆ A negative price ┆ A negative P/E │\n",
+ "│ ratio mean sto… ┆ number of years ┆ ┆ ┆ -to-earnings ┆ ratio typically │\n",
+ "│ ┆ it wo… ┆ ┆ ┆ (P/… ┆ i… │\n",
+ "│ Should a retail ┆ \"That\\'s like a ┆ 4 ┆ gpt-3.5-turbo ┆ The decision to ┆ It ultimately │\n",
+ "│ trader choose a … ┆ car dealer ┆ ┆ ┆ choose a broker ┆ depends on the │\n",
+ "│ ┆ adver… ┆ ┆ ┆ … ┆ tra… │\n",
+ "│ Possibility to ┆ \"As user quid ┆ 5 ┆ gpt-3.5-turbo ┆ In Canada, you ┆ In Canada, it is │\n",
+ "│ buy index funds ┆ states in his ┆ ┆ ┆ can buy index ┆ possible to buy… │\n",
+ "│ a… ┆ answ… ┆ ┆ ┆ fun… ┆ │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ Discuss the role ┆ Inflation is a ┆ 21 ┆ gpt-3.5-turbo ┆ Inflation plays ┆ Inflation is the │\n",
+ "│ of inflation in… ┆ pervasive ┆ ┆ ┆ a significant ┆ rate at which t… │\n",
+ "│ ┆ economi… ┆ ┆ ┆ ro… ┆ │\n",
+ "│ Explain the ┆ ┆ 22 ┆ gpt-3.5-turbo ┆ Plate tectonics ┆ Plate tectonics │\n",
+ "│ concept of plate ┆ ┆ ┆ ┆ is a scientific ┆ is a scientific │\n",
+ "│ tec… ┆ The Earth's ┆ ┆ ┆ … ┆ … │\n",
+ "│ ┆ dynamic and ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ ever-c… ┆ ┆ ┆ ┆ │\n",
+ "│ How did the ┆ ┆ 23 ┆ gpt-3.5-turbo ┆ The surrealist ┆ The surrealist │\n",
+ "│ surrealist ┆ ┆ ┆ ┆ movement had a ┆ movement had a │\n",
+ "│ movement … ┆ ┆ ┆ ┆ pr… ┆ si… │\n",
+ "│ ┆ The Surrealist ┆ ┆ ┆ ┆ │\n",
+ "│ ┆ movement, whic… ┆ ┆ ┆ ┆ │\n",
+ "│ Discuss the ┆ ┆ 24 ┆ gpt-3.5-turbo ┆ Globalization ┆ Globalization │\n",
+ "│ impact of ┆ ┆ ┆ ┆ has had a ┆ has had a │\n",
+ "│ globalizat… ┆ Globalization, ┆ ┆ ┆ signific… ┆ signific… │\n",
+ "│ ┆ characterized b… ┆ ┆ ┆ ┆ │\n",
+ "│ What are the key ┆ ┆ 25 ┆ gpt-3.5-turbo ┆ Viral and ┆ One of the key │\n",
+ "│ differences bet… ┆ In the realm of ┆ ┆ ┆ bacterial ┆ differences │\n",
+ "│ ┆ infectious dise… ┆ ┆ ┆ infections a… ┆ betwe… │\n",
+ "└───────────────────┴──────────────────┴─────┴───────────────┴──────────────────┴──────────────────┘"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gpt_settings = Settings(model=\"gpt-3.5-turbo\", rpm_limit=100)\n",
+ "dataset = dataset_with_claude_responses.with_columns([pl.lit(\"gpt-3.5-turbo\").alias(\"model\")])\n",
+ "experiment_dataset = TextCompletion(col_in_prompt=\"question\", col_out_completion=\"gpt_35_turbo_response\").setup(settings=gpt_settings).run(dataset)[\"output\"]\n",
+ "experiment_dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use the Response Matching operator to get the scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[32m2024-03-07 10:49:58.773\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate_on_server\u001b[0m:\u001b[36m341\u001b[0m - \u001b[1mSending evaluation request for rows 0 to <50 to the Uptrain\u001b[0m\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[32m2024-03-07 10:50:24.652\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.evalllm\u001b[0m:\u001b[36mevaluate\u001b[0m:\u001b[36m330\u001b[0m - \u001b[1mServer is not running!\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "from uptrain import EvalLLM, ResponseMatching\n",
+ "\n",
+ "settings = Settings(evaluate_locally=False)\n",
+ "\n",
+ "# Drop the \"context\" and \"model\" columns as they are not needed for local evaluation\n",
+ "experiment_dataset = experiment_dataset.drop([\"context\", \"model\"])\n",
+ "\n",
+ "eval_llm = EvalLLM(settings=settings)\n",
+ "results = eval_llm.evaluate(\n",
+ " data=experiment_dataset,\n",
+ " checks=[\n",
+ " ResponseMatching(\n",
+ " method=\"llm\",\n",
+ " )\n",
+ " ],\n",
+ " schema={\n",
+ " \"question\": \"question\",\n",
+ " \"response\": \"claude_3_sonnet_response\",\n",
+ " \"ground_truth\": \"gpt_35_turbo_response\",\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Analysis"
+ ]
+ },
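+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As in Experiment 1, we can first look at the average score (a minimal sketch, assuming the `results` list returned by the evaluation above):\n",
+    "\n",
+    "```python\n",
+    "avg_score = pl.DataFrame(results)['score_response_match'].mean()\n",
+    "avg_score\n",
+    "```"
+   ]
+  },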
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take a different example and see the scores for each model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Question: How do “held” amounts appear on statements and affect balances of traditional credit cards?'\n"
+ ]
+ }
+ ],
+ "source": [
+ "row = results[1]\n",
+ "print(\"Question:\", row[\"question\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GPT-3.5-Turbo Response:\n",
+ "\n",
+ "\n",
+ "\"Held\" amounts typically appear on credit card statements as pending charges or authorizations. These are temporary holds placed on the cardholder's account for a certain amount of money, such as when making a hotel reservation or renting a car. The held amount is not deducted from the available balance immediately but may affect the overall available credit on the card.\n",
+ "\n",
+ "For traditional credit cards, these held amounts do not impact the current balance that is due for payment. However, they can affect the credit available to the cardholder if the held amount is close to or equal to the available credit limit. This can potentially limit the cardholder's ability to make additional purchases until the held amount is no longer pending.\n",
+ "\n",
+ "It is important for cardholders to keep track of held amounts and understand how they can impact their available credit and spending ability. Held amounts will eventually be released and the actual charge will be posted to the account, at which point it will be reflected in the card balance.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"GPT-3.5-Turbo Response:\\n\\n\")\n",
+ "print(row[\"gpt_35_turbo_response\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The response from GPT-3.5-Turbo is very detailed and informative."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Claude 3 Sonnet Response:\n",
+ "\n",
+ "\n",
+ "On traditional credit card statements, any \"held\" amounts are typically shown separately from the current balance owed. Here's how they are displayed and affect balances:\n",
+ "\n",
+ "1. Current Balance: This is the total amount you owe on your credit card as of the statement date. It includes all new charges, fees, interest charges, and any remaining balance from the previous statement that wasn't paid in full.\n",
+ "\n",
+ "2. Held Amounts/Pending Transactions: Many credit card issuers will display a separate section or line item for \"held\" or \"pending\" amounts. These are transactions that have been authorized but not yet posted or settled to your account.\n",
+ "\n",
+ "3. Available Credit: Your available credit is your total credit limit minus your current balance and any held amounts. The held amounts temporarily reduce your available credit even though they haven't been added to the current balance yet.\n",
+ "\n",
+ "4. Impact on Balance: Held amounts do not directly affect your current balance on the statement. However, once those pending transactions settle and post, they will be added to your next statement's balance.\n",
+ "\n",
+ "It's important to note that held amounts are temporary and usually drop off after a few days once the final transaction amount clears. This helps ensure you have enough\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Claude 3 Sonnet Response:\\n\\n\")\n",
+ "print(row[\"claude_3_sonnet_response\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The response from Claude 3 Sonnet is also detailed and informative. If we compare the two responses, we can see that the information is the same, but the style is different. Let's see the scores."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Response Matching Score: 0.9411764706\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Response Matching Score:\", row[\"score_response_match\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The score is ~0.94. This aligns with our observation that the responses are very similar. We can conclude that Claude 3 Sonnet is a good alternative to GPT-3.5-Turbo."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/integrations/llamaindex.ipynb b/examples/integrations/llamaindex.ipynb
index b7499dbb2..147ebee06 100644
--- a/examples/integrations/llamaindex.ipynb
+++ b/examples/integrations/llamaindex.ipynb
@@ -57,12 +57,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Install UpTrain and LlamaIndex\n",
- "\n",
- "\n",
- "```bash\n",
- "pip install uptrain llama_index\n",
- "```"
+ "## Install UpTrain and LlamaIndex"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "276331d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install -q uptrain llama-index"
]
},
{
@@ -74,16 +90,17 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
+ "import httpx\n",
"import os\n",
"import openai \n",
"import pandas as pd\n",
"\n",
- "from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext\n",
- "from uptrain import Evals, EvalLlamaIndex, Settings"
+ "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings\n",
+ "from uptrain import Evals, EvalLlamaIndex, Settings as UpTrainSettings"
]
},
{
@@ -98,7 +115,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 18,
"id": "cf7104eb",
"metadata": {},
"outputs": [],
@@ -109,7 +126,6 @@
"dataset_path = os.path.join('./nyc_wikipedia', \"nyc_text.txt\")\n",
"\n",
"if not os.path.exists(dataset_path):\n",
- " import httpx\n",
" r = httpx.get(url)\n",
" with open(dataset_path, \"wb\") as f:\n",
" f.write(r.content)"
@@ -127,7 +143,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 19,
"id": "7b726c86",
"metadata": {},
"outputs": [],
@@ -156,7 +172,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 21,
"id": "65750934",
"metadata": {},
"outputs": [],
@@ -178,7 +194,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 22,
"id": "95b45738-baf4-45d0-8509-1c194f50508e",
"metadata": {
"tags": []
@@ -187,8 +203,10 @@
"source": [
"documents = SimpleDirectoryReader(\"./nyc_wikipedia/\").load_data()\n",
"\n",
+ "Settings.chunk_size = 512\n",
+ "\n",
"vector_index = VectorStoreIndex.from_documents(\n",
- " documents, service_context=ServiceContext.from_defaults(chunk_size=512)\n",
+ " documents,\n",
")\n",
"\n",
"query_engine = vector_index.as_query_engine()"
@@ -204,12 +222,12 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 23,
"id": "da17993b",
"metadata": {},
"outputs": [],
"source": [
- "settings = Settings(\n",
+ "settings = UpTrainSettings(\n",
" openai_api_key=openai.api_key,\n",
")"
]
@@ -226,7 +244,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 24,
"id": "edd8d97d-33ae-4998-9942-761a2d393a19",
"metadata": {
"tags": []
@@ -252,7 +270,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 25,
"id": "d68268ce-c626-4ec7-8956-8cebf00cf919",
"metadata": {
"tags": []
@@ -278,7 +296,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 26,
"id": "1bc6fbb1-a524-473b-936b-f3bc9a9fec43",
"metadata": {
"tags": []
@@ -535,7 +553,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 28,
"id": "dcbf3119",
"metadata": {},
"outputs": [],
@@ -543,7 +561,7 @@
"UPTRAIN_API_KEY = 'up-**********************' # your UpTrain API key\n",
"\n",
"# We use `uptrain_access_token` parameter instead of 'openai_api_key' in settings in this case\n",
- "settings = Settings(\n",
+ "settings = UpTrainSettings(\n",
" uptrain_access_token=UPTRAIN_API_KEY,\n",
")"
]
@@ -560,7 +578,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 29,
"id": "7bdfd3a3",
"metadata": {},
"outputs": [],
@@ -584,7 +602,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 30,
"id": "815243c7",
"metadata": {},
"outputs": [
@@ -609,7 +627,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 31,
"id": "23976958",
"metadata": {},
"outputs": [
diff --git a/examples/integrations/rag/rag_evaluations_uptrain_languse.ipynb b/examples/integrations/rag/rag_evaluations_uptrain_languse.ipynb
new file mode 100644
index 000000000..e945e90c1
--- /dev/null
+++ b/examples/integrations/rag/rag_evaluations_uptrain_languse.ipynb
@@ -0,0 +1,490 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2effbc0d",
+ "metadata": {},
+ "source": [
+ "# Evaluate RAG Pipeleine using UpTrain and Langfuse\n",
+ "Retrieval-augmented generation (RAG) is is a technique for enhancing the accuracy and reliability of LLMs with information retrieved from external sources. \n",
+ "In this notebook we will be covering 2 main steps: \n",
+ "1. Implementing RAG\n",
+ " \n",
+ " a. Retrieval: Fetch relevant information from a knowledge base, create embeddings and store them in a Vector DB ([FAISS](https://ai.meta.com/tools/faiss/))\n",
+ " \n",
+ " b. Generation: Use the retrieved information to the generate information using [Mistral](https://mistral.ai/) LLM\n",
+ " \n",
+ "2. Evaluating the RAG pipeline (retrieved information and generated response) using [UpTrain](https://uptrain.ai)\n",
+ "\n",
+ "If you face any difficulties, need some help with using UpTrain or want to brainstorm custom evaluations for your use-case, you can speak to the maintainers of UpTrain [here](https://calendly.com/uptrain-sourabh/30min)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02148aff",
+ "metadata": {},
+ "source": [
+ "### Step 1: Install Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "b880d1ed-3db0-45a1-807e-1b47e9ce1320",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install faiss-cpu mistralai datasets uptrain -q"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34c48f43",
+ "metadata": {},
+ "source": [
+ "### Step 2: Import Required Libraries "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "851612c3-ee93-42e3-a1fb-481f89c9410f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from mistralai.client import MistralClient\n",
+ "from mistralai.models.chat_completion import ChatMessage\n",
+ "from datasets import load_dataset\n",
+ "import requests\n",
+ "import numpy as np\n",
+ "import faiss\n",
+ "import os\n",
+ "import json\n",
+ "\n",
+ "mistral_api_key= os.environ[\"MISTRAL_API_KEY\"]\n",
+ "client = MistralClient(api_key=mistral_api_key)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fe8609d5-9f27-4202-b0be-36db34412998",
+ "metadata": {},
+ "source": [
+ "### Step 3: Import a Dataset \n",
+ "\n",
+ "In this notebook we will be using the [quac](https://huggingface.co/datasets/quac) dataset available on Hugging Face.\n",
+ "\n",
+ "We will be using the user queries and context information from this dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ffa49b0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load Dataset\n",
+ "dataset = load_dataset(\"quac\", split = 'train') \n",
+ "\n",
+ "# Select a question from the dataset \n",
+ "question = \"Where is the Malayalam language spoken?\" \n",
+ "\n",
+ "# Select context information from the dataset (for simplicity we are using just the first 20 records)\n",
+ "context_list = dataset['context'][:20] "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aad1aa61-9e1c-46c8-ae5e-61855df440f9",
+ "metadata": {},
+ "source": [
+ "### Step 4: Split document into chunks\n",
+ "\n",
+ "For ease of retrieving information, we need to split the context document into smaller chunks.\n",
+ "\n",
+ "Though, in this example our context document is already a list of different chunks, hence there's no need to break into further chunks."
+ ]
+ },
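+  {
+   "cell_type": "markdown",
+   "id": "split-chunks-sketch",
+   "metadata": {},
+   "source": [
+    "For reference, here is a minimal sketch of how a single long document could be split into fixed-size character chunks (the `chunk_size` value is an arbitrary illustration, not tuned for this dataset; production pipelines usually split on sentences or tokens with some overlap):\n",
+    "\n",
+    "```python\n",
+    "def split_into_chunks(text, chunk_size=2048):\n",
+    "    # Naive fixed-size character splitting\n",
+    "    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]\n",
+    "```"
+   ]
+  },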
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "8494655e-bd87-49de-8f1d-69ffbc1c256e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. There were a further 701,673 (2.1% of the total number) in Karnataka, 557,705 (1.7%) in Tamil Nadu and 406,358 (1.2%) in Maharashtra. The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages. Large numbers of Malayalis have settled in Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). A large number of Malayalis have also emigrated to the Middle East, the United States, and Europe. Accessed November 22, 2014. including a large number of professionals. There were 7,093 Malayalam speakers in Australia in 2006. The 2001 Canadian census reported 7,070 people who listed Malayalam as their mother tongue, mostly in the Greater Toronto Area and Southern Ontario. In 2010, the Census of Population of Singapore reported that there were 26,348 Malayalees in Singapore. The 2006 New Zealand census reported 2,139 speakers. 134 Malayalam speaking households were reported in 1956 in Fiji. There is also a considerable Malayali population in the Persian Gulf regions, especially in Bahrain, Muscat, Doha, Dubai, Abu Dhabi, Kuwait and European region mainly in London. World Malayalee Council, the organisation working with the Malayali diaspora across the Globe has embarked upon a project for making a data bank of the diaspora. CANNOTANSWER',\n",
+ " \"Malayalam is the language spoken by the Malayalis. Malayalam is derived from old Tamil and Sanskrit in the 6th century. For cultural purposes Malayalam and Sanskrit formed a language known as Manipravalam, where both languages were used in an alternating style. Malayalam is the only among the major Dravidian languages without diglossia. This means, that the Malayalam which is spoken does not differ from the written variant. Malayalam is written using the Malayalam script. Malayalam literature is ancient in origin. The oldest literature works in Malayalam, distinct from the Tamil tradition, is dated between the 9th century and 11th century. Malayalam literature includes the 14th century Niranam poets (Madhava Panikkar, Sankara Panikkar and Rama Panikkar), whose works mark the dawn of both modern Malayalam language and indigenous Keralite poetry. The Triumvirate of poets (Kavithrayam: Kumaran Asan, Vallathol Narayana Menon and Ulloor S. Parameswara Iyer) are recognized for moving Keralite poetry away from archaic sophistry and metaphysics and towards a more lyrical mode. In 19th century Chavara Kuriakose Elias, the founder of Carmelites of Mary Immaculate and Congregation of Mother of Carmel congregations, contribute different streams in the Malayalam Literature. All his works are written between 1829 and 1870. Chavara's contribution to Malayalam literature includes, Chronicles, Poems - athmanuthapam (compunction of the soul), Maranaveettil Paduvanulla Pana (Poem to sing in the bereaved house) and Anasthasiayude Rakthasakshyam - and other Literary works . In the second half of the 20th century, Jnanpith awardees like G. Sankara Kurup, S. K. Pottekkatt, Thakazhi Sivasankara Pillai and M. T. Vasudevan Nair and non Jnanpith awardees like Vaikom Muhammad Basheer have made valuable contributions to the Malayalam literature. Later, such Keralite writers as O. V. Vijayan, Kamaladas, M. Mukundan, and Booker Prize winner Arundhati Roy, whose 1996 semi-autobiographical bestseller The God of Small Things is set in the Kottayam town of Ayemenem, have gained international recognition. Kerala remains a fascinating riddle for the Indian diaspora, especially the younger generations - World Malayali Council with its sister organisation, International Institute for Scientific and Academic Collaboration (IISAC) has come out with a comprehensive book on Kerala titled 'Introduction to Kerala Studies,' specially intended for the Malayali diaspora across the globe. J.V. Vilanilam, former Vice-Chancellor of the University of Kerala; Sunny Luke, medical scientist and former professor of Medical Biotechnology at Adelphi University, New York; and Antony Palackal, professor of Sociology at the Loyola College of Social Sciences in Thiruvananthapuram, have edited the book, besides making other contributions to it. CANNOTANSWER\"]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "chunks = context_list\n",
+ "\n",
+ "# Let's Look at the first 2 chunks\n",
+ "chunks[:2] "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a37b0232",
+ "metadata": {},
+ "source": [
+ "### Step 5: Create Embeddings using \"mistral-embed\" embedding model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "e77d9805-7a53-4210-9f80-f4de52285588",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_text_embedding(input):\n",
+ " embeddings_batch_response = client.embeddings(\n",
+ " model=\"mistral-embed\",\n",
+ " input=input\n",
+ " )\n",
+ " return embeddings_batch_response.data[0].embedding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "46503830-6ad5-493e-a629-152721e2d88e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create embeddings for context chunk\n",
+ "\n",
+ "context_embeddings = np.array([get_text_embedding(chunk) for chunk in chunks])"
+ ]
+ },
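+  {
+   "cell_type": "markdown",
+   "id": "batched-embeddings-note",
+   "metadata": {},
+   "source": [
+    "Note: the loop above makes one API call per chunk. A minimal batched sketch is shown below, assuming the embeddings endpoint accepts a list of inputs and returns one embedding per input in the same order; the helper name `get_text_embeddings_batch` is purely illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "batched-embeddings-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical batched helper: embeds all chunks in a single API call.\n",
+    "# Assumes `input` may be a list and that response.data preserves input order.\n",
+    "def get_text_embeddings_batch(texts):\n",
+    "    embeddings_batch_response = client.embeddings(\n",
+    "        model=\"mistral-embed\",\n",
+    "        input=texts\n",
+    "    )\n",
+    "    return [item.embedding for item in embeddings_batch_response.data]\n",
+    "\n",
+    "# Equivalent to the per-chunk loop above:\n",
+    "# context_embeddings = np.array(get_text_embeddings_batch(chunks))"
+   ]
+  },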
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "d0bd04e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create embeddings for question\n",
+ "\n",
+ "question_embeddings = np.array([get_text_embedding(question)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1cba33c7-9d1d-44d8-a01e-e30f16be1aac",
+ "metadata": {},
+ "source": [
+ "### Step 6: Load into a vector database\n",
+ "\n",
+ "After generating the embeddings, we will now be storing them in a Vector DB (FAISS) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6a5b1877-b113-4527-9055-cae9049fef08",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d = context_embeddings.shape[1]\n",
+ "index = faiss.IndexFlatL2(d)\n",
+ "index.add(context_embeddings)"
+ ]
+ },
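+  {
+   "cell_type": "markdown",
+   "id": "cosine-index-note",
+   "metadata": {},
+   "source": [
+    "`IndexFlatL2` ranks chunks by Euclidean (L2) distance. If you prefer cosine similarity, a minimal sketch below normalises the embeddings and uses an inner-product index (`faiss.IndexFlatIP`) instead; either index type works for this tutorial."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cosine-index-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Alternative sketch: cosine similarity via inner product on L2-normalised vectors.\n",
+    "# FAISS expects float32, so cast explicitly before normalising.\n",
+    "emb = np.ascontiguousarray(context_embeddings, dtype=\"float32\")\n",
+    "faiss.normalize_L2(emb)                  # normalise rows in place\n",
+    "ip_index = faiss.IndexFlatIP(emb.shape[1])\n",
+    "ip_index.add(emb)\n",
+    "# Query vectors must be normalised the same way before calling ip_index.search(...)"
+   ]
+  },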
+ {
+ "cell_type": "markdown",
+ "id": "33ec55c9",
+ "metadata": {},
+ "source": [
+ "### Step 7: Retrieve Context Chunk from Vector DB\n",
+ "\n",
+ "Search the Vector DB using `index.search(arg 1, arg 2)`\n",
+ "- `arg 1`: vector of the question embeddings\n",
+ "- `arg 2`: number of similar vectors to retrieve\n",
+ "\n",
+ "This function returns the distances and the indices of the most similar vectors to the question vector in the vector database. Then based on the returned indices, we can retrieve the relevant context chunks that correspond to those indices. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "c930b378-7aac-434c-881b-ab69d3edb93d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[0 1]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "D, I = index.search(question_embeddings, k=2) \n",
+ "print(I)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "73aab584-1dbf-4532-b41e-0403eeeeb567",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. There were a further 701,673 (2.1% of the total number) in Karnataka, 557,705 (1.7%) in Tamil Nadu and 406,358 (1.2%) in Maharashtra. The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages. Large numbers of Malayalis have settled in Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). A large number of Malayalis have also emigrated to the Middle East, the United States, and Europe. Accessed November 22, 2014. including a large number of professionals. There were 7,093 Malayalam speakers in Australia in 2006. The 2001 Canadian census reported 7,070 people who listed Malayalam as their mother tongue, mostly in the Greater Toronto Area and Southern Ontario. In 2010, the Census of Population of Singapore reported that there were 26,348 Malayalees in Singapore. The 2006 New Zealand census reported 2,139 speakers. 134 Malayalam speaking households were reported in 1956 in Fiji. There is also a considerable Malayali population in the Persian Gulf regions, especially in Bahrain, Muscat, Doha, Dubai, Abu Dhabi, Kuwait and European region mainly in London. World Malayalee Council, the organisation working with the Malayali diaspora across the Globe has embarked upon a project for making a data bank of the diaspora. CANNOTANSWER Malayalam is the language spoken by the Malayalis. Malayalam is derived from old Tamil and Sanskrit in the 6th century. For cultural purposes Malayalam and Sanskrit formed a language known as Manipravalam, where both languages were used in an alternating style. Malayalam is the only among the major Dravidian languages without diglossia. This means, that the Malayalam which is spoken does not differ from the written variant. Malayalam is written using the Malayalam script. Malayalam literature is ancient in origin. The oldest literature works in Malayalam, distinct from the Tamil tradition, is dated between the 9th century and 11th century. Malayalam literature includes the 14th century Niranam poets (Madhava Panikkar, Sankara Panikkar and Rama Panikkar), whose works mark the dawn of both modern Malayalam language and indigenous Keralite poetry. The Triumvirate of poets (Kavithrayam: Kumaran Asan, Vallathol Narayana Menon and Ulloor S. Parameswara Iyer) are recognized for moving Keralite poetry away from archaic sophistry and metaphysics and towards a more lyrical mode. In 19th century Chavara Kuriakose Elias, the founder of Carmelites of Mary Immaculate and Congregation of Mother of Carmel congregations, contribute different streams in the Malayalam Literature. All his works are written between 1829 and 1870. 
Chavara's contribution to Malayalam literature includes, Chronicles, Poems - athmanuthapam (compunction of the soul), Maranaveettil Paduvanulla Pana (Poem to sing in the bereaved house) and Anasthasiayude Rakthasakshyam - and other Literary works . In the second half of the 20th century, Jnanpith awardees like G. Sankara Kurup, S. K. Pottekkatt, Thakazhi Sivasankara Pillai and M. T. Vasudevan Nair and non Jnanpith awardees like Vaikom Muhammad Basheer have made valuable contributions to the Malayalam literature. Later, such Keralite writers as O. V. Vijayan, Kamaladas, M. Mukundan, and Booker Prize winner Arundhati Roy, whose 1996 semi-autobiographical bestseller The God of Small Things is set in the Kottayam town of Ayemenem, have gained international recognition. Kerala remains a fascinating riddle for the Indian diaspora, especially the younger generations - World Malayali Council with its sister organisation, International Institute for Scientific and Academic Collaboration (IISAC) has come out with a comprehensive book on Kerala titled 'Introduction to Kerala Studies,' specially intended for the Malayali diaspora across the globe. J.V. Vilanilam, former Vice-Chancellor of the University of Kerala; Sunny Luke, medical scientist and former professor of Medical Biotechnology at Adelphi University, New York; and Antony Palackal, professor of Sociology at the Loyola College of Social Sciences in Thiruvananthapuram, have edited the book, besides making other contributions to it. CANNOTANSWER\n"
+ ]
+ }
+ ],
+ "source": [
+ "retrieved_chunk = [chunks[i] for i in I.tolist()[0]]\n",
+ "retrieved_chunk = ' '.join(retrieved_chunk)\n",
+ "print(retrieved_chunk)"
+ ]
+ },
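+  {
+   "cell_type": "markdown",
+   "id": "retrieval-distances-note",
+   "metadata": {},
+   "source": [
+    "The distances returned in `D` can serve as a quick sanity check on retrieval quality. The short sketch below simply prints each retrieved chunk's index, its L2 distance, and a preview of the chunk text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "retrieval-distances-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pair each retrieved chunk index with its L2 distance (smaller = closer)\n",
+    "for dist, idx in zip(D.tolist()[0], I.tolist()[0]):\n",
+    "    print(f\"chunk {idx}  L2 distance {dist:.3f}  preview: {chunks[idx][:80]}...\")"
+   ]
+  },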
+ {
+ "cell_type": "markdown",
+ "id": "41d8b829",
+ "metadata": {},
+ "source": [
+ "### Step 8: Generate Response using Mistral"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "da042a53-4564-4057-9a60-9b57dffff6a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prompt = f\"\"\"\n",
+ "Context information is below.\n",
+ "---------------------\n",
+ "{retrieved_chunk}\n",
+ "---------------------\n",
+ "Given the context information and not prior knowledge, answer the query.\n",
+ "Query: {question}\n",
+ "Answer:\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "e77d975b-5f69-4e9c-8b94-97214517eac7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def run_mistral(user_message, model=\"mistral-medium\"):\n",
+ " messages = [\n",
+ " ChatMessage(role=\"user\", content=user_message)\n",
+ " ]\n",
+ " chat_response = client.chat(\n",
+ " model=model,\n",
+ " messages=messages\n",
+ " )\n",
+ " return (chat_response.choices[0].message.content)"
+ ]
+ },
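+  {
+   "cell_type": "markdown",
+   "id": "deterministic-chat-note",
+   "metadata": {},
+   "source": [
+    "By default the chat endpoint samples with a non-zero temperature, so the generated answer can vary between runs. For more repeatable outputs during evaluation, a small variant is sketched below; it assumes the client's `chat` method accepts a `temperature` argument, and the helper name `run_mistral_deterministic` is purely illustrative."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "deterministic-chat-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Variant of run_mistral with low-temperature decoding for more repeatable answers.\n",
+    "# Assumes the chat endpoint accepts a `temperature` parameter.\n",
+    "def run_mistral_deterministic(user_message, model=\"mistral-medium\", temperature=0.0):\n",
+    "    messages = [\n",
+    "        ChatMessage(role=\"user\", content=user_message)\n",
+    "    ]\n",
+    "    chat_response = client.chat(\n",
+    "        model=model,\n",
+    "        messages=messages,\n",
+    "        temperature=temperature\n",
+    "    )\n",
+    "    return chat_response.choices[0].message.content\n",
+    "\n",
+    "# response = run_mistral_deterministic(prompt)"
+   ]
+  },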
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "1c5c20aa-6673-4105-9c10-886a1e18da8a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The Malayalam language is primarily spoken in the Indian state of Kerala, where it is the official language. According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. Additionally, there are significant numbers of Malayalam speakers in other parts of India, including Karnataka, Tamil Nadu, and Maharashtra, as well as in the Union Territory of Lakshadweep. There are also large numbers of Malayalis who have settled in other cities in India, such as Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). Many Malayalis have also emigrated to other countries, including the Middle East, the United States, Europe, Australia, Canada, Singapore, New Zealand, and Fiji.'"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "response = run_mistral(prompt)\n",
+ "response"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56c5c968",
+ "metadata": {},
+ "source": [
+ "### Step 9: Perform Evaluations Using UpTrain's Open-Source Software (OSS)\n",
+ "We have used the following 5 metrics from UpTrain's library:\n",
+ "\n",
+ "1. [Context Relevance](https://docs.uptrain.ai/predefined-evaluations/context-awareness/context-relevance): Evaluates how relevant the retrieved context is to the question specified.\n",
+ "\n",
+ "2. [Response Completeness](https://docs.uptrain.ai/predefined-evaluations/response-quality/response-completeness): Evaluates whether the response has answered all the aspects of the question specified.\n",
+ "\n",
+ "3. [Factual Accuracy](https://docs.uptrain.ai/predefined-evaluations/context-awareness/factual-accuracy): Evaluates whether the response generated is factually correct and grounded by the provided context.\n",
+ "\n",
+ "4. [Response Relevance](https://docs.uptrain.ai/predefined-evaluations/response-quality/response-relevance): Evaluates how relevant the generated response was to the question specified.\n",
+ "\n",
+ "5. [Response Conciseness](https://docs.uptrain.ai/predefined-evaluations/response-quality/response-relevance): Evaluates how concise the generated response is or if it has any additional irrelevant information for the question asked.\n",
+ "\n",
+ "You can look at the complete list of UpTrain's supported metrics [here](https://docs.uptrain.ai/predefined-evaluations/overview)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "7d548b6e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'question': 'Where is the Malayalam language spoken?',\n",
+ " 'context': \"According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. There were a further 701,673 (2.1% of the total number) in Karnataka, 557,705 (1.7%) in Tamil Nadu and 406,358 (1.2%) in Maharashtra. The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages. Large numbers of Malayalis have settled in Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). A large number of Malayalis have also emigrated to the Middle East, the United States, and Europe. Accessed November 22, 2014. including a large number of professionals. There were 7,093 Malayalam speakers in Australia in 2006. The 2001 Canadian census reported 7,070 people who listed Malayalam as their mother tongue, mostly in the Greater Toronto Area and Southern Ontario. In 2010, the Census of Population of Singapore reported that there were 26,348 Malayalees in Singapore. The 2006 New Zealand census reported 2,139 speakers. 134 Malayalam speaking households were reported in 1956 in Fiji. There is also a considerable Malayali population in the Persian Gulf regions, especially in Bahrain, Muscat, Doha, Dubai, Abu Dhabi, Kuwait and European region mainly in London. World Malayalee Council, the organisation working with the Malayali diaspora across the Globe has embarked upon a project for making a data bank of the diaspora. CANNOTANSWER Malayalam is the language spoken by the Malayalis. Malayalam is derived from old Tamil and Sanskrit in the 6th century. For cultural purposes Malayalam and Sanskrit formed a language known as Manipravalam, where both languages were used in an alternating style. Malayalam is the only among the major Dravidian languages without diglossia. This means, that the Malayalam which is spoken does not differ from the written variant. Malayalam is written using the Malayalam script. Malayalam literature is ancient in origin. The oldest literature works in Malayalam, distinct from the Tamil tradition, is dated between the 9th century and 11th century. Malayalam literature includes the 14th century Niranam poets (Madhava Panikkar, Sankara Panikkar and Rama Panikkar), whose works mark the dawn of both modern Malayalam language and indigenous Keralite poetry. The Triumvirate of poets (Kavithrayam: Kumaran Asan, Vallathol Narayana Menon and Ulloor S. Parameswara Iyer) are recognized for moving Keralite poetry away from archaic sophistry and metaphysics and towards a more lyrical mode. In 19th century Chavara Kuriakose Elias, the founder of Carmelites of Mary Immaculate and Congregation of Mother of Carmel congregations, contribute different streams in the Malayalam Literature. All his works are written between 1829 and 1870. 
Chavara's contribution to Malayalam literature includes, Chronicles, Poems - athmanuthapam (compunction of the soul), Maranaveettil Paduvanulla Pana (Poem to sing in the bereaved house) and Anasthasiayude Rakthasakshyam - and other Literary works . In the second half of the 20th century, Jnanpith awardees like G. Sankara Kurup, S. K. Pottekkatt, Thakazhi Sivasankara Pillai and M. T. Vasudevan Nair and non Jnanpith awardees like Vaikom Muhammad Basheer have made valuable contributions to the Malayalam literature. Later, such Keralite writers as O. V. Vijayan, Kamaladas, M. Mukundan, and Booker Prize winner Arundhati Roy, whose 1996 semi-autobiographical bestseller The God of Small Things is set in the Kottayam town of Ayemenem, have gained international recognition. Kerala remains a fascinating riddle for the Indian diaspora, especially the younger generations - World Malayali Council with its sister organisation, International Institute for Scientific and Academic Collaboration (IISAC) has come out with a comprehensive book on Kerala titled 'Introduction to Kerala Studies,' specially intended for the Malayali diaspora across the globe. J.V. Vilanilam, former Vice-Chancellor of the University of Kerala; Sunny Luke, medical scientist and former professor of Medical Biotechnology at Adelphi University, New York; and Antony Palackal, professor of Sociology at the Loyola College of Social Sciences in Thiruvananthapuram, have edited the book, besides making other contributions to it. CANNOTANSWER\",\n",
+ " 'response': 'The Malayalam language is primarily spoken in the Indian state of Kerala, where it is the official language. According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. Additionally, there are significant numbers of Malayalam speakers in other parts of India, including Karnataka, Tamil Nadu, and Maharashtra, as well as in the Union Territory of Lakshadweep. There are also large numbers of Malayalis who have settled in other cities in India, such as Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). Many Malayalis have also emigrated to other countries, including the Middle East, the United States, Europe, Australia, Canada, Singapore, New Zealand, and Fiji.'}]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = [\n",
+ " {\n",
+ " 'question': question,\n",
+ " 'context': retrieved_chunk,\n",
+ " 'response': response\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "858cc724",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from uptrain import Evals, EvalLLM, Settings\n",
+ "\n",
+ "settings = Settings(model = 'mistral/mistral-medium', mistral_api_key=os.environ[\"MISTRAL_API_KEY\"])\n",
+ "eval_llm = EvalLLM(settings)\n",
+ "\n",
+ "results = eval_llm.evaluate(\n",
+ " data=data,\n",
+ " checks=[Evals.CONTEXT_RELEVANCE, Evals.RESPONSE_COMPLETENESS, Evals.FACTUAL_ACCURACY, Evals.RESPONSE_RELEVANCE, Evals.RESPONSE_CONCISENESS]\n",
+ ")"
+ ]
+ },
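+  {
+   "cell_type": "markdown",
+   "id": "score-summary-note",
+   "metadata": {},
+   "source": [
+    "Each entry in `results` carries the original columns plus `score_*` and `explanation_*` fields. Before printing the full JSON, a small sketch below pulls out just the scores for a quick summary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "score-summary-sketch",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect only the score_* fields from each evaluated row for a quick overview\n",
+    "score_summary = [\n",
+    "    {k: v for k, v in row.items() if k.startswith(\"score_\")}\n",
+    "    for row in results\n",
+    "]\n",
+    "score_summary"
+   ]
+  },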
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "72855b52",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[\n",
+ " {\n",
+ " \"question\": \"Where is the Malayalam language spoken?\",\n",
+ " \"context\": \"According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. There were a further 701,673 (2.1% of the total number) in Karnataka, 557,705 (1.7%) in Tamil Nadu and 406,358 (1.2%) in Maharashtra. The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 31,329 spoke non-standard regional variations like Eranadan. As per the 1991 census data, 28.85% of all Malayalam speakers in India spoke a second language and 19.64% of the total knew three or more languages. Large numbers of Malayalis have settled in Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). A large number of Malayalis have also emigrated to the Middle East, the United States, and Europe. Accessed November 22, 2014. including a large number of professionals. There were 7,093 Malayalam speakers in Australia in 2006. The 2001 Canadian census reported 7,070 people who listed Malayalam as their mother tongue, mostly in the Greater Toronto Area and Southern Ontario. In 2010, the Census of Population of Singapore reported that there were 26,348 Malayalees in Singapore. The 2006 New Zealand census reported 2,139 speakers. 134 Malayalam speaking households were reported in 1956 in Fiji. There is also a considerable Malayali population in the Persian Gulf regions, especially in Bahrain, Muscat, Doha, Dubai, Abu Dhabi, Kuwait and European region mainly in London. World Malayalee Council, the organisation working with the Malayali diaspora across the Globe has embarked upon a project for making a data bank of the diaspora. CANNOTANSWER Malayalam is the language spoken by the Malayalis. Malayalam is derived from old Tamil and Sanskrit in the 6th century. For cultural purposes Malayalam and Sanskrit formed a language known as Manipravalam, where both languages were used in an alternating style. Malayalam is the only among the major Dravidian languages without diglossia. This means, that the Malayalam which is spoken does not differ from the written variant. Malayalam is written using the Malayalam script. Malayalam literature is ancient in origin. The oldest literature works in Malayalam, distinct from the Tamil tradition, is dated between the 9th century and 11th century. Malayalam literature includes the 14th century Niranam poets (Madhava Panikkar, Sankara Panikkar and Rama Panikkar), whose works mark the dawn of both modern Malayalam language and indigenous Keralite poetry. The Triumvirate of poets (Kavithrayam: Kumaran Asan, Vallathol Narayana Menon and Ulloor S. Parameswara Iyer) are recognized for moving Keralite poetry away from archaic sophistry and metaphysics and towards a more lyrical mode. In 19th century Chavara Kuriakose Elias, the founder of Carmelites of Mary Immaculate and Congregation of Mother of Carmel congregations, contribute different streams in the Malayalam Literature. All his works are written between 1829 and 1870. 
Chavara's contribution to Malayalam literature includes, Chronicles, Poems - athmanuthapam (compunction of the soul), Maranaveettil Paduvanulla Pana (Poem to sing in the bereaved house) and Anasthasiayude Rakthasakshyam - and other Literary works . In the second half of the 20th century, Jnanpith awardees like G. Sankara Kurup, S. K. Pottekkatt, Thakazhi Sivasankara Pillai and M. T. Vasudevan Nair and non Jnanpith awardees like Vaikom Muhammad Basheer have made valuable contributions to the Malayalam literature. Later, such Keralite writers as O. V. Vijayan, Kamaladas, M. Mukundan, and Booker Prize winner Arundhati Roy, whose 1996 semi-autobiographical bestseller The God of Small Things is set in the Kottayam town of Ayemenem, have gained international recognition. Kerala remains a fascinating riddle for the Indian diaspora, especially the younger generations - World Malayali Council with its sister organisation, International Institute for Scientific and Academic Collaboration (IISAC) has come out with a comprehensive book on Kerala titled 'Introduction to Kerala Studies,' specially intended for the Malayali diaspora across the globe. J.V. Vilanilam, former Vice-Chancellor of the University of Kerala; Sunny Luke, medical scientist and former professor of Medical Biotechnology at Adelphi University, New York; and Antony Palackal, professor of Sociology at the Loyola College of Social Sciences in Thiruvananthapuram, have edited the book, besides making other contributions to it. CANNOTANSWER\",\n",
+ " \"response\": \"The Malayalam language is primarily spoken in the Indian state of Kerala, where it is the official language. According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. Additionally, there are significant numbers of Malayalam speakers in other parts of India, including Karnataka, Tamil Nadu, and Maharashtra, as well as in the Union Territory of Lakshadweep. There are also large numbers of Malayalis who have settled in other cities in India, such as Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai (Bombay), Ahmedabad, Pune, and Chennai (Madras). Many Malayalis have also emigrated to other countries, including the Middle East, the United States, Europe, Australia, Canada, Singapore, New Zealand, and Fiji.\",\n",
+ " \"score_context_relevance\": 1.0,\n",
+ " \"explanation_context_relevance\": \"{\\n\\\"Reasoning\\\": \\\"The extracted context provides information about the regions where the Malayalam language is spoken. It mentions that Malayalam is spoken by the majority of the population in Kerala, and also provides information about the number of Malayalam speakers in other Indian states such as Karnataka, Tamil Nadu, and Maharashtra. Additionally, it mentions that large numbers of Malayalis have settled in various cities in India and abroad, including Bangalore, Mangalore, Delhi, Coimbatore, Hyderabad, Mumbai, Ahmedabad, Pune, Chennai, the Middle East, the United States, Europe, Australia, Canada, Singapore, New Zealand, and Fiji. Therefore, the extracted context can answer the given query completely.\\\",\\n\\\"Choice\\\": \\\"A\\\"\\n}\",\n",
+ " \"score_response_completeness\": 1.0,\n",
+ " \"explanation_response_completeness\": \"{\\n \\\"Reasoning\\\": \\\"The given response is complete for the given question because it provides relevant information about where the Malayalam language is spoken. The response includes the primary location of Malayalam speakers, which is the Indian state of Kerala, as well as other parts of India and countries where Malayalis have settled. This information directly addresses the question about where the Malayalam language is spoken.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
+ " \"score_factual_accuracy\": 0.9,\n",
+ " \"explanation_factual_accuracy\": \"[\\n{\\n\\\"Fact\\\": \\\"1. The Malayalam language is primarily spoken in the Indian state of Kerala.\\\",\\n\\\"Reasoning\\\": \\\"The context explicitly states that there were 30,803,747 speakers of Malayalam in Kerala according to the Indian census of 2001, making up 93.2% of the total number of Malayalam speakers in India. This supports the fact that Malayalam is primarily spoken in Kerala.\\\",\\n\\\"Judgement\\\": \\\"yes\\\"\\n},\\n{\\n\\\"Fact\\\": \\\"2. Malayalam is the official language in Kerala.\\\",\\n\\\"Reasoning\\\": \\\"The context does not explicitly state that Malayalam is the official language in Kerala. However, it can be inferred from the fact that Malayalam is primarily spoken in Kerala and that there were 30,803,747 speakers of Malayalam in Kerala according to the Indian census of 2001, making up 96.7% of the total population of the state.\\\",\\n\\\"Judgement\\\": \\\"unclear\\\"\\n},\\n{\\n\\\"Fact\\\": \\\"3. According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala.\\\",\\n\\\"Reasoning\\\": \\\"The context explicitly states that there were 30,803,747 speakers of Malayalam in Kerala according to the Indian census of 2001. Hence, the fact can be verified by the context.\\\",\\n\\\"Judgement\\\": \\\"yes\\\"\\n},\\n{\\n\\\"Fact\\\": \\\"4. There are significant numbers of Malayalam speakers in other parts of India, including Karnataka, Tamil Nadu, and Maharashtra, as well as in the Union Territory of Lakshadweep.\\\",\\n\\\"Reasoning\\\": \\\"The context explicitly states that there were 701,673 Malayalam speakers in Karnataka, 557,705 in Tamil Nadu, 406,358 in Maharashtra, and 51,100 in Lakshadweep according to the Indian census of 2001. Hence, the fact can be verified by the context.\\\",\\n\\\"Judgement\\\": \\\"yes\\\"\\n},\\n{\\n\\\"Fact\\\": \\\"5. Many Malayalis have also emigrated to other countries, including the Middle East, the United States, Europe, Australia, Canada, Singapore, New Zealand, and Fiji.\\\",\\n\\\"Reasoning\\\": \\\"The context explicitly states that large numbers of Malayalis have settled in various cities in India and have also emigrated to the Middle East, the United States, and Europe. It also mentions that there were 7,093 Malayalam speakers in Australia in 2006, 7,070 people who listed Malayalam as their mother tongue in the 2001 Canadian census, 26,348 Malayalees in Singapore according to the 2010 Census of Population, 2,139 speakers in the 2006 New Zealand census, and 134 Malayalam speaking households in Fiji in 1956. Hence, the fact can be verified by the context.\\\",\\n\\\"Judgement\\\": \\\"yes\\\"\\n}\\n]\",\n",
+ " \"score_response_relevance\": 0.6666666666666666,\n",
+ " \"explanation_response_relevance\": \"Response Precision: 0.5{\\n\\\"Reasoning\\\": \\\"The response provides information about the primary location where Malayalam is spoken, which is the Indian state of Kerala. It also mentions the number of Malayalam speakers in Kerala according to the Indian census of 2001. Additionally, it provides information about significant numbers of Malayalam speakers in other parts of India and in other countries. While some of this information may be considered relevant, the specific numbers and names of cities and countries where Malayalis have settled could be considered additional irrelevant information.\\\",\\n\\\"Choice\\\": \\\"B\\\"\\n}\\nResponse Recall: 1.0{\\n \\\"Reasoning\\\": \\\"The given response is complete for the given question because it provides relevant information about where the Malayalam language is spoken. The response includes the primary location of Malayalam speakers, which is the Indian state of Kerala, as well as other parts of India and countries where Malayalis have settled. This information directly addresses the question about where the Malayalam language is spoken.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
+ " \"score_response_conciseness\": 0.0,\n",
+ " \"explanation_response_conciseness\": \"{\\n\\\"Reasoning\\\": \\\"The response provides information about the primary location where Malayalam is spoken, which is the Indian state of Kerala. It also mentions the number of Malayalam speakers in Kerala according to the 2001 Indian census. However, the response includes additional irrelevant information such as the percentage of Malayalam speakers in Kerala, the percentage of the total population of Kerala that speaks Malayalam, and the number of Malayalam speakers in other parts of India and in other countries. The names of specific cities in India and other countries where Malayalis have settled are also irrelevant to the question.\\\",\\n\\\"Choice\\\": \\\"C\\\"\\n}\"\n",
+ " }\n",
+ "]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(json.dumps(results, indent =3))"
+ ]
+ }
+ ],
+ "metadata": {
+ "jupytext": {
+ "formats": "ipynb,py:light"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/open_source_evaluator_tutorial.ipynb b/examples/open_source_evaluator_tutorial.ipynb
index 0fad62cff..853ed582e 100644
--- a/examples/open_source_evaluator_tutorial.ipynb
+++ b/examples/open_source_evaluator_tutorial.ipynb
@@ -34,7 +34,8 @@
"For now, we support these models for our open-source version.\n",
"- GPT model (OPENAI API KEY is needed)\n",
"- Claude (ANTHROPIC API KEY is needed)\n",
- "- Azure OpenAI Service (AZURE API KEY is needed)"
+ "- Azure OpenAI Service (AZURE API KEY is needed)\n",
+ "- Mistral (MISTRAL API KEY is needed)"
]
},
{
@@ -419,7 +420,6 @@
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
"from uptrain import Settings\n",
"settings = Settings(model = 'mistral/mistral-tiny', mistral_api_key=MISTRAL_API_KEY)\n",
"eval_llm = EvalLLM(settings)"
@@ -500,7 +500,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.4"
+ "version": "3.11.8"
}
},
"nbformat": 4,
diff --git a/examples/root_cause_analysis/rag_with_citation.ipynb b/examples/root_cause_analysis/rag_with_citation.ipynb
index d4406433b..4ab63a8f6 100644
--- a/examples/root_cause_analysis/rag_with_citation.ipynb
+++ b/examples/root_cause_analysis/rag_with_citation.ipynb
@@ -103,9 +103,9 @@
" },\n",
" {\n",
" 'question': 'Who won the 2022 FIFA World Cup?',\n",
- " 'context': 'Aliens won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.',\n",
- " 'cited_context': 'Aliens won the FIFA World Cup.',\n",
- " 'response': 'The 2022 FIFA World Cup was won by Aliens.' \n",
+ " 'context': 'Argentina won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.',\n",
+ " 'cited_context': 'Argentina won the FIFA World Cup.',\n",
+ " 'response': 'The 2022 FIFA World Cup was won by Qatar.' \n",
" }\n",
"]"
]
@@ -126,7 +126,107 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2024-02-19 00:50:02.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36muptrain.framework.remote\u001b[0m:\u001b[36mperform_root_cause_analysis\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSending root cause analysis request for rows 0 to <50 to the Uptrain server\u001b[0m\n"
+ "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pydantic/_internal/_fields.py:151: UserWarning: Field \"model_purpose\" has conflict with protected namespace \"model_\".\n",
+ "\n",
+ "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n",
+ " warnings.warn(\n",
+ "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/lazy_loader/__init__.py:185: RuntimeWarning: subpackages can technically be lazily loaded, but it causes the package to be eagerly loaded even if it is already lazily loaded.So, you probably shouldn't use subpackages with this lazy feature.\n",
+ " warnings.warn(msg, RuntimeWarning)\n",
+ "\u001b[32m2024-03-07 02:47:20.244\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:01<00:00, 2.26it/s]\n",
+ "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/uptrain/operators/language/llm.py:246: RuntimeWarning: coroutine 'LLMMulticlient.async_fetch_responses' was never awaited\n",
+ " with ThreadPoolExecutor(max_workers=1) as executor:\n",
+ "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n",
+ "\u001b[32m2024-03-07 02:47:21.626\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:01<00:00, 1.78it/s]\n",
+ "\u001b[32m2024-03-07 02:47:23.345\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:01<00:00, 2.73it/s]\n",
+ "\u001b[32m2024-03-07 02:47:24.450\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:02<00:00, 1.17it/s]\n",
+ "\u001b[32m2024-03-07 02:47:27.056\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:01<00:00, 2.00it/s]\n",
+ "\u001b[32m2024-03-07 02:47:28.593\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ "100%|██████████| 3/3 [00:00<00:00, 3.67it/s]\n",
+ "\u001b[32m2024-03-07 02:47:29.414\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36muptrain.operators.language.llm\u001b[0m:\u001b[36mfetch_responses\u001b[0m:\u001b[36m243\u001b[0m - \u001b[33m\u001b[1mDetected a running event loop, scheduling requests in a separate thread.\u001b[0m\n",
+ " 0%| | 0/3 [00:00, ?it/s]Task exception was never retrieved\n",
+ "future: exception=RuntimeError('Event loop is closed')>\n",
+ "Traceback (most recent call last):\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_client.py\", line 1974, in aclose\n",
+ " await self._transport.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_transports/default.py\", line 378, in aclose\n",
+ " await self._pool.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection_pool.py\", line 324, in aclose\n",
+ " await connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection.py\", line 173, in aclose\n",
+ " await self._connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/http11.py\", line 253, in aclose\n",
+ " await self._network_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_backends/anyio.py\", line 54, in aclose\n",
+ " await self._stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/streams/tls.py\", line 202, in aclose\n",
+ " await self.transport_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/_backends/_asyncio.py\", line 1181, in aclose\n",
+ " self._transport.close()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py\", line 864, in close\n",
+ " self._loop.call_soon(self._call_connection_lost, None)\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n",
+ " self._check_closed()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n",
+ " raise RuntimeError('Event loop is closed')\n",
+ "RuntimeError: Event loop is closed\n",
+ "Task exception was never retrieved\n",
+ "future: exception=RuntimeError('Event loop is closed')>\n",
+ "Traceback (most recent call last):\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_client.py\", line 1974, in aclose\n",
+ " await self._transport.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_transports/default.py\", line 378, in aclose\n",
+ " await self._pool.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection_pool.py\", line 324, in aclose\n",
+ " await connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection.py\", line 173, in aclose\n",
+ " await self._connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/http11.py\", line 253, in aclose\n",
+ " await self._network_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_backends/anyio.py\", line 54, in aclose\n",
+ " await self._stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/streams/tls.py\", line 202, in aclose\n",
+ " await self.transport_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/_backends/_asyncio.py\", line 1181, in aclose\n",
+ " self._transport.close()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py\", line 864, in close\n",
+ " self._loop.call_soon(self._call_connection_lost, None)\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n",
+ " self._check_closed()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n",
+ " raise RuntimeError('Event loop is closed')\n",
+ "RuntimeError: Event loop is closed\n",
+ "Task exception was never retrieved\n",
+ "future: exception=RuntimeError('Event loop is closed')>\n",
+ "Traceback (most recent call last):\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_client.py\", line 1974, in aclose\n",
+ " await self._transport.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpx/_transports/default.py\", line 378, in aclose\n",
+ " await self._pool.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection_pool.py\", line 324, in aclose\n",
+ " await connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/connection.py\", line 173, in aclose\n",
+ " await self._connection.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_async/http11.py\", line 253, in aclose\n",
+ " await self._network_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/httpcore/_backends/anyio.py\", line 54, in aclose\n",
+ " await self._stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/streams/tls.py\", line 202, in aclose\n",
+ " await self.transport_stream.aclose()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/anyio/_backends/_asyncio.py\", line 1181, in aclose\n",
+ " self._transport.close()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/selector_events.py\", line 864, in close\n",
+ " self._loop.call_soon(self._call_connection_lost, None)\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n",
+ " self._check_closed()\n",
+ " File \"/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n",
+ " raise RuntimeError('Event loop is closed')\n",
+ "RuntimeError: Event loop is closed\n",
+ "100%|██████████| 3/3 [00:02<00:00, 1.30it/s]"
]
},
{
@@ -143,15 +243,15 @@
" \"error_resolution_suggestion\": \"Context Retrieval Pipeline needs improvement\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"Step by step reasoning:\\n\\n1. The question asks for the team that won the 2023 ICC Cricket World Cup.\\n2. The response states \\\"The 2023 ICC Cricket World Cup was won by Qatar.\\\"\\n\\nConclusion:\\nThe given response does contain some information.\\n\\n[Choice]: A\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The 2023 ICC Cricket World Cup was won by Qatar' provides the name of a team. Therefore, the response does contain information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 0.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context is about the 2022 FIFA World Cup and does not contain any information about the 2023 ICC Cricket World Cup. Therefore, it cannot answer the user query about the winner of the 2023 ICC Cricket World Cup.\\\"\\n\",\n",
- " \"score_factual_accuracy\": 0.5,\n",
- " \"explanation_factual_accuracy\": \"The 2023 ICC Cricket World Cup was won by Qatar.\\nReasoning for yes: The context does not mention anything about the winner of the 2023 ICC Cricket World Cup, so it cannot be determined if Qatar won or not.\\nReasoning for no: The context only provides information about the FIFA World Cup, not the ICC Cricket World Cup.\\nJudgement: unclear.\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context does not contain any information about the 2023 ICC Cricket World Cup or the team that won it. The context only provides information about the 2022 FIFA World Cup and its location. Therefore, the extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
+ " \"score_factual_accuracy\": 0.0,\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2023 ICC Cricket World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\",\n",
" \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \" \\\"The extracted context is about the 2022 FIFA World Cup in Qatar, which took place from 20 November to 18 December 2022. There is no mention of the 2023 ICC Cricket World Cup, so the extracted context doesn't contain any information to answer the given user query about the winner of the 2023 ICC Cricket World Cup.\\\"\\n\",\n",
- " \"score_factual_accuracy_wrt_cited\": 0.5,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"The 2023 ICC Cricket World Cup was won by Qatar.\\nReasoning for yes: The context explicitly states that the 2022 FIFA World Cup took place in Qatar, but it does not mention anything about the 2023 ICC Cricket World Cup.\\nReasoning for no: The context does not provide any information about the winner of the 2023 ICC Cricket World Cup.\\nJudgement: unclear. The context does not support or contradict the fact, and the fact cannot be logically derived from the context.\"\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context does not contain any information about the 2023 ICC Cricket World Cup or the winner of the tournament. It only provides information about the 2022 FIFA World Cup. Therefore, the extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
+ " \"score_factual_accuracy_wrt_cited\": 0.0,\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2023 ICC Cricket World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\"\n",
" },\n",
" {\n",
" \"question\": \"Where was the 2022 FIFA World Cup held?\",\n",
@@ -162,49 +262,55 @@
" \"error_resolution_suggestion\": \"Add intermediary steps so as the LLM can better understand context and generate a complete response\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"The question is \\\"Where was the 2022 FIFA World Cup held?\\\" and the response is \\\"The previous FIFA World Cup was held in Russia.\\\"\\n\\nStep by step reasoning:\\n1. The response provides information about the location of the previous FIFA World Cup, stating that it was held in Russia.\\n2. Although the response does not directly answer the question about the 2022 FIFA World Cup, it does contain information about a previous event, which is relevant to the topic of FIFA World Cup locations.\\n3. Therefore, the given response does contain some information.\\n\\n[Choice]: A\\n[Explanation]: The response contains some information.\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The previous FIFA World Cup was held in Russia' does not provide the location of the 2022 FIFA World Cup. Therefore, the response does not contain any information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 1.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context clearly states that the 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. It also mentions that Argentina won the tournament. Therefore, the extracted context can answer the given user query completely by providing the location of the 2022 FIFA World Cup.\\\" \\n\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it provides the relevant information about the location and the dates of the 2022 FIFA World Cup. The context clearly states that the 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022, which completely answers the given query. Hence, selected choice is A. The extracted context can answer the given query completely.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_factual_accuracy\": 1.0,\n",
- " \"explanation_factual_accuracy\": \"The previous FIFA World Cup was held in Russia.\\nReasoning for yes: The context explicitly states that the previous FIFA World Cup was held in Russia.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\",\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The previous FIFA World Cup was held in Russia.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that the previous FIFA World Cup was held in Russia. It mentions that the 2022 FIFA World Cup took place in Qatar, which implies that the previous one was held in Russia.\\\",\\n \\\"Judgement\\\": \\\"yes\\\"\\n }\\n]\",\n",
" \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \" \\\"The extracted context doesn't contain any information about the location of the 2022 FIFA World Cup. It only mentions the previous World Cup being held in Russia, which is not relevant to the user query about the location of the 2022 World Cup. Therefore, the extracted context cannot answer the given user query at all.\\\"\\n\",\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can't answer the given question completely because it only provides information about the previous FIFA World Cup being held in Russia. It does not mention the location of the 2022 FIFA World Cup. Hence, selected choice is C. The extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
" \"score_factual_accuracy_wrt_cited\": 1.0,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"The previous FIFA World Cup was held in Russia.\\nReasoning for yes: The context explicitly states that the previous FIFA World Cup was held in Russia.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\"\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The previous FIFA World Cup was held in Russia.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that the previous FIFA World Cup was held in Russia.\\\",\\n \\\"Judgement\\\": \\\"yes\\\"\\n }\\n]\"\n",
" },\n",
" {\n",
" \"question\": \"Who won the 2022 FIFA World Cup?\",\n",
- " \"context\": \"Aliens won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.\",\n",
- " \"cited_context\": \"Aliens won the FIFA World Cup.\",\n",
- " \"response\": \"The 2022 FIFA World Cup was won by Aliens.\",\n",
+ " \"context\": \"Argentina won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.\",\n",
+ " \"cited_context\": \"Argentina won the FIFA World Cup.\",\n",
+ " \"response\": \"The 2022 FIFA World Cup was won by Qatar.\",\n",
" \"error_mode\": \"Hallucinations\",\n",
" \"error_resolution_suggestion\": \"Add instructions to your LLM to adher to the context provide - Try tipping\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"The given response to the question \\\"Who won the 2022 FIFA World Cup?\\\" is \\\"The 2022 FIFA World Cup was won by Aliens.\\\"\\n\\nStep by step reasoning:\\n1. The response provides information about the winner of the 2022 FIFA World Cup, stating that it was won by \\\"Aliens.\\\"\\n2. Although the information provided is not factually accurate, it does contain a specific response to the question asked.\\n\\nTherefore, the selected choice is A.\\n\\n[Choice]: A\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The 2022 FIFA World Cup was won by Qatar' provides the name of the winning team. Therefore, the response does contain information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 1.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context clearly states that aliens won the 2022 FIFA World Cup in Qatar. This information completely answers the given user query about the winner of the tournament.\\\"\\n\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it provides the relevant information that Argentina won the 2022 FIFA World Cup. The context also includes the location and dates of the event, which further confirms the accuracy of the information. Hence, selected choice is A. The extracted context can answer the given question completely.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_factual_accuracy\": 0.0,\n",
- " \"explanation_factual_accuracy\": \"The 2022 FIFA World Cup was won by Aliens.\\nReasoning for yes: The context explicitly states that Aliens won the 2022 FIFA World Cup.\\nReasoning for no: The context does not mention anything about Aliens winning the 2022 FIFA World Cup. It only provides information about the location and timing of the event.\\nJudgement: no. as the context does not verify the fact nor the fact can be logically derived from the context.\",\n",
- " \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \"The extracted context doesn't contain any information related to the 2022 FIFA World Cup winner. The statement about aliens winning the FIFA World Cup is not relevant to the user query and does not provide any useful information to answer the question.\",\n",
- " \"score_factual_accuracy_wrt_cited\": 1.0,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"1. The 2022 FIFA World Cup took place.\\nReasoning for yes: The context explicitly states that the 2022 FIFA World Cup took place.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\\n\\n2. Aliens won the 2022 FIFA World Cup.\\nReasoning for yes: The context explicitly states that Aliens won the FIFA World Cup.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\\n\\n\"\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2022 FIFA World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that Argentina won the 2022 FIFA World Cup, not Qatar. Hence, the fact cannot be verified by the context.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\",\n",
+ " \"score_cited_context_relevance\": 1.0,\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it directly provides the information about the winner of the 2022 FIFA World Cup, which is Argentina.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
+ " \"score_factual_accuracy_wrt_cited\": 0.0,\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2022 FIFA World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that Argentina won the FIFA World Cup, not Qatar. Hence, the fact cannot be verified by the context.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\"\n",
" }\n",
"]\n"
]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
}
],
"source": [
- "from uptrain import APIClient, RcaTemplate\n",
+ "from uptrain import RcaTemplate, EvalLLM\n",
"import json\n",
"\n",
- "UPTRAIN_API_KEY = \"up-********************\" # Insert your UpTrain API key here\n",
+ "OPENAI_API_KEY = \"sk-***********\" # Insert your OpenAI API key here\n",
"\n",
- "uptrain_client = APIClient(uptrain_api_key=UPTRAIN_API_KEY)\n",
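+    "# EvalLLM runs the analysis with your OpenAI key directly; no UpTrain API key is needed\n",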
+ "eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY)\n",
"\n",
- "res = uptrain_client.perform_root_cause_analysis(\n",
- " 'Sample-RCA',\n",
+ "res = eval_llm.perform_root_cause_analysis(\n",
" data = data,\n",
" rca_template = RcaTemplate.RAG_WITH_CITATION\n",
")\n",
@@ -244,15 +350,15 @@
" \"error_resolution_suggestion\": \"Context Retrieval Pipeline needs improvement\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"Step by step reasoning:\\n\\n1. The question asks for the team that won the 2023 ICC Cricket World Cup.\\n2. The response states \\\"The 2023 ICC Cricket World Cup was won by Qatar.\\\"\\n\\nConclusion:\\nThe given response does contain some information.\\n\\n[Choice]: A\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The 2023 ICC Cricket World Cup was won by Qatar' provides the name of a team. Therefore, the response does contain information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 0.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context is about the 2022 FIFA World Cup and does not contain any information about the 2023 ICC Cricket World Cup. Therefore, it cannot answer the user query about the winner of the 2023 ICC Cricket World Cup.\\\"\\n\",\n",
- " \"score_factual_accuracy\": 0.5,\n",
- " \"explanation_factual_accuracy\": \"The 2023 ICC Cricket World Cup was won by Qatar.\\nReasoning for yes: The context does not mention anything about the winner of the 2023 ICC Cricket World Cup, so it cannot be determined if Qatar won or not.\\nReasoning for no: The context only provides information about the FIFA World Cup, not the ICC Cricket World Cup.\\nJudgement: unclear.\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context does not contain any information about the 2023 ICC Cricket World Cup or the team that won it. The context only provides information about the 2022 FIFA World Cup and its location. Therefore, the extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
+ " \"score_factual_accuracy\": 0.0,\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2023 ICC Cricket World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\",\n",
" \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \" \\\"The extracted context is about the 2022 FIFA World Cup in Qatar, which took place from 20 November to 18 December 2022. There is no mention of the 2023 ICC Cricket World Cup, so the extracted context doesn't contain any information to answer the given user query about the winner of the 2023 ICC Cricket World Cup.\\\"\\n\",\n",
- " \"score_factual_accuracy_wrt_cited\": 0.5,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"The 2023 ICC Cricket World Cup was won by Qatar.\\nReasoning for yes: The context explicitly states that the 2022 FIFA World Cup took place in Qatar, but it does not mention anything about the 2023 ICC Cricket World Cup.\\nReasoning for no: The context does not provide any information about the winner of the 2023 ICC Cricket World Cup.\\nJudgement: unclear. The context does not support or contradict the fact, and the fact cannot be logically derived from the context.\"\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context does not contain any information about the 2023 ICC Cricket World Cup or the winner of the tournament. It only provides information about the 2022 FIFA World Cup. Therefore, the extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
+ " \"score_factual_accuracy_wrt_cited\": 0.0,\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2023 ICC Cricket World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context only mentions the 2022 FIFA World Cup taking place in Qatar, but it does not provide any information about the 2023 ICC Cricket World Cup.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\"\n",
"}\n"
]
}
@@ -286,15 +392,15 @@
" \"error_resolution_suggestion\": \"Add intermediary steps so as the LLM can better understand context and generate a complete response\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"The question is \\\"Where was the 2022 FIFA World Cup held?\\\" and the response is \\\"The previous FIFA World Cup was held in Russia.\\\"\\n\\nStep by step reasoning:\\n1. The response provides information about the location of the previous FIFA World Cup, stating that it was held in Russia.\\n2. Although the response does not directly answer the question about the 2022 FIFA World Cup, it does contain information about a previous event, which is relevant to the topic of FIFA World Cup locations.\\n3. Therefore, the given response does contain some information.\\n\\n[Choice]: A\\n[Explanation]: The response contains some information.\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The previous FIFA World Cup was held in Russia' does not provide the location of the 2022 FIFA World Cup. Therefore, the response does not contain any information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 1.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context clearly states that the 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. It also mentions that Argentina won the tournament. Therefore, the extracted context can answer the given user query completely by providing the location of the 2022 FIFA World Cup.\\\" \\n\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it provides the relevant information about the location and the dates of the 2022 FIFA World Cup. The context clearly states that the 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022, which completely answers the given query. Hence, selected choice is A. The extracted context can answer the given query completely.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_factual_accuracy\": 1.0,\n",
- " \"explanation_factual_accuracy\": \"The previous FIFA World Cup was held in Russia.\\nReasoning for yes: The context explicitly states that the previous FIFA World Cup was held in Russia.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\",\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The previous FIFA World Cup was held in Russia.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that the previous FIFA World Cup was held in Russia. It mentions that the 2022 FIFA World Cup took place in Qatar, which implies that the previous one was held in Russia.\\\",\\n \\\"Judgement\\\": \\\"yes\\\"\\n }\\n]\",\n",
" \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \" \\\"The extracted context doesn't contain any information about the location of the 2022 FIFA World Cup. It only mentions the previous World Cup being held in Russia, which is not relevant to the user query about the location of the 2022 World Cup. Therefore, the extracted context cannot answer the given user query at all.\\\"\\n\",\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can't answer the given question completely because it only provides information about the previous FIFA World Cup being held in Russia. It does not mention the location of the 2022 FIFA World Cup. Hence, selected choice is C. The extracted context doesn't contain any information to answer the given query.\\\",\\n \\\"Choice\\\": \\\"C\\\"\\n}\",\n",
" \"score_factual_accuracy_wrt_cited\": 1.0,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"The previous FIFA World Cup was held in Russia.\\nReasoning for yes: The context explicitly states that the previous FIFA World Cup was held in Russia.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\"\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The previous FIFA World Cup was held in Russia.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that the previous FIFA World Cup was held in Russia.\\\",\\n \\\"Judgement\\\": \\\"yes\\\"\\n }\\n]\"\n",
"}\n"
]
}
@@ -321,22 +427,22 @@
"text": [
"{\n",
" \"question\": \"Who won the 2022 FIFA World Cup?\",\n",
- " \"context\": \"Aliens won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.\",\n",
- " \"cited_context\": \"Aliens won the FIFA World Cup.\",\n",
- " \"response\": \"The 2022 FIFA World Cup was won by Aliens.\",\n",
+ " \"context\": \"Argentina won the 2022 FIFA World Cup. The 2022 FIFA World Cup took place in Qatar from 20 November to 18 December 2022. The previous FIFA World Cup was held in Russia.\",\n",
+ " \"cited_context\": \"Argentina won the FIFA World Cup.\",\n",
+ " \"response\": \"The 2022 FIFA World Cup was won by Qatar.\",\n",
" \"error_mode\": \"Hallucinations\",\n",
" \"error_resolution_suggestion\": \"Add instructions to your LLM to adher to the context provide - Try tipping\",\n",
" \"score_question_completeness\": 1,\n",
" \"score_valid_response\": 1.0,\n",
- " \"explanation_valid_response\": \"The given response to the question \\\"Who won the 2022 FIFA World Cup?\\\" is \\\"The 2022 FIFA World Cup was won by Aliens.\\\"\\n\\nStep by step reasoning:\\n1. The response provides information about the winner of the 2022 FIFA World Cup, stating that it was won by \\\"Aliens.\\\"\\n2. Although the information provided is not factually accurate, it does contain a specific response to the question asked.\\n\\nTherefore, the selected choice is A.\\n\\n[Choice]: A\",\n",
+ " \"explanation_valid_response\": \"{\\n \\\"Reasoning\\\": \\\"The response 'The 2022 FIFA World Cup was won by Qatar' provides the name of the winning team. Therefore, the response does contain information relevant to the question.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_context_relevance\": 1.0,\n",
- " \"explanation_context_relevance\": \" \\\"The extracted context clearly states that aliens won the 2022 FIFA World Cup in Qatar. This information completely answers the given user query about the winner of the tournament.\\\"\\n\",\n",
+ " \"explanation_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it provides the relevant information that Argentina won the 2022 FIFA World Cup. The context also includes the location and dates of the event, which further confirms the accuracy of the information. Hence, selected choice is A. The extracted context can answer the given question completely.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
" \"score_factual_accuracy\": 0.0,\n",
- " \"explanation_factual_accuracy\": \"The 2022 FIFA World Cup was won by Aliens.\\nReasoning for yes: The context explicitly states that Aliens won the 2022 FIFA World Cup.\\nReasoning for no: The context does not mention anything about Aliens winning the 2022 FIFA World Cup. It only provides information about the location and timing of the event.\\nJudgement: no. as the context does not verify the fact nor the fact can be logically derived from the context.\",\n",
- " \"score_cited_context_relevance\": 0.0,\n",
- " \"explanation_cited_context_relevance\": \"The extracted context doesn't contain any information related to the 2022 FIFA World Cup winner. The statement about aliens winning the FIFA World Cup is not relevant to the user query and does not provide any useful information to answer the question.\",\n",
- " \"score_factual_accuracy_wrt_cited\": 1.0,\n",
- " \"explanation_factual_accuracy_wrt_cited\": \"1. The 2022 FIFA World Cup took place.\\nReasoning for yes: The context explicitly states that the 2022 FIFA World Cup took place.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\\n\\n2. Aliens won the 2022 FIFA World Cup.\\nReasoning for yes: The context explicitly states that Aliens won the FIFA World Cup.\\nReasoning for no: No arguments.\\nJudgement: yes. as the context explicitly supports the fact.\\n\\n\"\n",
+ " \"explanation_factual_accuracy\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2022 FIFA World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that Argentina won the 2022 FIFA World Cup, not Qatar. Hence, the fact cannot be verified by the context.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\",\n",
+ " \"score_cited_context_relevance\": 1.0,\n",
+ " \"explanation_cited_context_relevance\": \"{\\n \\\"Reasoning\\\": \\\"The given context can answer the given question completely because it directly provides the information about the winner of the 2022 FIFA World Cup, which is Argentina.\\\",\\n \\\"Choice\\\": \\\"A\\\"\\n}\",\n",
+ " \"score_factual_accuracy_wrt_cited\": 0.0,\n",
+ " \"explanation_factual_accuracy_wrt_cited\": \"[\\n {\\n \\\"Fact\\\": \\\"1. The 2022 FIFA World Cup was won by Qatar.\\\",\\n \\\"Reasoning\\\": \\\"The context explicitly states that Argentina won the FIFA World Cup, not Qatar. Hence, the fact cannot be verified by the context.\\\",\\n \\\"Judgement\\\": \\\"no\\\"\\n }\\n]\"\n",
"}\n"
]
}
@@ -362,7 +468,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.7"
+ "version": "3.11.8"
}
},
"nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 9ddb34b09..9dc6a848e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
[project]
name = "uptrain"
-version = "0.6.5.post2"
+version = "0.6.6.post3"
description = "UpTrain - tool to evaluate LLM applications on aspects like factual accuracy, response quality, retrieval quality, tonality, etc."
readme = "README.md"
maintainers = [{ name = "UpTrain AI Team", email = "oss@uptrain.ai" }]
@@ -20,7 +20,8 @@ classifiers = [
]
keywords = ["uptrain", "ai", "LLM", "evaluation", "hallucinations", "observability", "response quality"]
dependencies = [
- "pydantic<1.10.10",
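+    # pydantic-settings provides BaseSettings, which moved out of pydantic core in v2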
+ "pydantic",
+ "pydantic-settings",
"loguru",
"lazy_loader",
"networkx",
@@ -31,6 +32,7 @@ dependencies = [
"plotly>=5.0.0",
"aiolimiter>=1.1",
"openai>=1.6.1",
+ "fsspec",
]
[project.urls]
@@ -52,3 +54,6 @@ include-package-data = true
[tool.setuptools.package-data]
"*" = ["*.pyi"]
+
+[tool.setuptools.packages.find]
+include = ["uptrain"]
diff --git a/tests/test_builtins.py b/tests/test_builtins.py
index 7ccc1b865..4c52f25fa 100644
--- a/tests/test_builtins.py
+++ b/tests/test_builtins.py
@@ -8,6 +8,7 @@
"""
import polars as pl
+import os
from uptrain.framework import Settings
from uptrain.framework.builtins import (
@@ -32,8 +33,11 @@
CheckSubQueryCompleteness,
)
-# settings = Settings(openai_api_key="sk-************************")
-settings = Settings()
+# Enter your OpenAI API key here if it is not already set as an environment variable
+openai_api_key = os.environ.get("OPENAI_API_KEY")
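+# If the variable is unset, os.environ.get returns None and the checks below will not be able to call the OpenAI API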
+
+settings = Settings(openai_api_key=openai_api_key)
+
dataset = pl.DataFrame(
{
"response": [
@@ -70,45 +74,110 @@ def test_check_response_completeness():
check = CheckResponseCompleteness()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_completeness" in output.columns and "explanation_response_completeness" in output.columns
- assert output["score_response_completeness"].dtype == pl.Float64 and len(output["score_response_completeness"]) - output["score_response_completeness"].null_count() > 0
- assert output["explanation_response_completeness"].dtype == pl.Utf8 and len(output["explanation_response_completeness"]) - output["explanation_response_completeness"].null_count() > 0
+ assert (
+ "score_response_completeness" in output.columns
+ and "explanation_response_completeness" in output.columns
+ )
+ assert (
+ output["score_response_completeness"].dtype == pl.Float64
+ and len(output["score_response_completeness"])
+ - output["score_response_completeness"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_completeness"].dtype == pl.Utf8
+ and len(output["explanation_response_completeness"])
+ - output["explanation_response_completeness"].null_count()
+ > 0
+ )
def test_check_response_conciseness():
check = CheckResponseConciseness()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_conciseness" in output.columns and "explanation_response_conciseness" in output.columns
- assert output["score_response_conciseness"].dtype == pl.Float64 and len(output["score_response_conciseness"]) - output["score_response_conciseness"].null_count() > 0
- assert output["explanation_response_conciseness"].dtype == pl.Utf8 and len(output["explanation_response_conciseness"]) - output["explanation_response_conciseness"].null_count() > 0
+ assert (
+ "score_response_conciseness" in output.columns
+ and "explanation_response_conciseness" in output.columns
+ )
+ assert (
+ output["score_response_conciseness"].dtype == pl.Float64
+ and len(output["score_response_conciseness"])
+ - output["score_response_conciseness"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_conciseness"].dtype == pl.Utf8
+ and len(output["explanation_response_conciseness"])
+ - output["explanation_response_conciseness"].null_count()
+ > 0
+ )
def test_check_response_relevance():
check = CheckResponseRelevance()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_relevance" in output.columns and "explanation_response_relevance" in output.columns
- assert output["score_response_relevance"].dtype == pl.Float64 and len(output["score_response_relevance"]) - output["score_response_relevance"].null_count() > 0
- assert output["explanation_response_relevance"].dtype == pl.Utf8 and len(output["explanation_response_relevance"]) - output["explanation_response_relevance"].null_count() > 0
+ assert (
+ "score_response_relevance" in output.columns
+ and "explanation_response_relevance" in output.columns
+ )
+ assert (
+ output["score_response_relevance"].dtype == pl.Float64
+ and len(output["score_response_relevance"])
+ - output["score_response_relevance"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_relevance"].dtype == pl.Utf8
+ and len(output["explanation_response_relevance"])
+ - output["explanation_response_relevance"].null_count()
+ > 0
+ )
def test_check_valid_response():
check = CheckValidResponse()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_valid_response" in output.columns and "explanation_valid_response" in output.columns
- assert output["score_valid_response"].dtype == pl.Float64 and len(output["score_valid_response"]) - output["score_valid_response"].null_count() > 0
- assert output["explanation_valid_response"].dtype == pl.Utf8 and len(output["explanation_valid_response"]) - output["explanation_valid_response"].null_count() > 0
+ assert (
+ "score_valid_response" in output.columns
+ and "explanation_valid_response" in output.columns
+ )
+ assert (
+ output["score_valid_response"].dtype == pl.Float64
+ and len(output["score_valid_response"])
+ - output["score_valid_response"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_valid_response"].dtype == pl.Utf8
+ and len(output["explanation_valid_response"])
+ - output["explanation_valid_response"].null_count()
+ > 0
+ )
def test_check_response_consistency():
check = CheckResponseConsistency()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_consistency" in output.columns and "explanation_response_consistency" in output.columns
- assert output["score_response_consistency"].dtype == pl.Float64 and len(output["score_response_consistency"]) - output["score_response_consistency"].null_count() > 0
- assert output["explanation_response_consistency"].dtype == pl.Utf8 and len(output["explanation_response_consistency"]) - output["explanation_response_consistency"].null_count() > 0
+ assert (
+ "score_response_consistency" in output.columns
+ and "explanation_response_consistency" in output.columns
+ )
+ assert (
+ output["score_response_consistency"].dtype == pl.Float64
+ and len(output["score_response_consistency"])
+ - output["score_response_consistency"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_consistency"].dtype == pl.Utf8
+ and len(output["explanation_response_consistency"])
+ - output["explanation_response_consistency"].null_count()
+ > 0
+ )
response_matching_dataset = pl.DataFrame(
@@ -125,13 +194,27 @@ def test_check_response_consistency():
}
)
+
def test_check_response_matching():
check = CheckResponseMatching()
output = check.setup(settings).run(response_matching_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_matching" in output.columns and "explanation_response_matching" in output.columns
- assert output["score_response_matching"].dtype == pl.Float64 and len(output["score_response_matching"]) - output["score_response_matching"].null_count() > 0
- assert output["explanation_response_matching"].dtype == pl.Utf8 and len(output["explanation_response_matching"]) - output["explanation_response_matching"].null_count() > 0
+ assert (
+ "score_response_matching" in output.columns
+ and "explanation_response_matching" in output.columns
+ )
+ assert (
+ output["score_response_matching"].dtype == pl.Float64
+ and len(output["score_response_matching"])
+ - output["score_response_matching"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_matching"].dtype == pl.Utf8
+ and len(output["explanation_response_matching"])
+ - output["explanation_response_matching"].null_count()
+ > 0
+ )
# -----------------------------------------------------------
@@ -143,15 +226,27 @@ def test_check_context_relevance():
check = CheckContextRelevance()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_context_relevance" in output.columns and "explanation_context_relevance" in output.columns
- assert output["score_context_relevance"].dtype == pl.Float64 and len(output["score_context_relevance"]) - output["score_context_relevance"].null_count() > 0
- assert output["explanation_context_relevance"].dtype == pl.Utf8 and len(output["explanation_context_relevance"]) - output["explanation_context_relevance"].null_count() > 0
+ assert (
+ "score_context_relevance" in output.columns
+ and "explanation_context_relevance" in output.columns
+ )
+ assert (
+ output["score_context_relevance"].dtype == pl.Float64
+ and len(output["score_context_relevance"])
+ - output["score_context_relevance"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_context_relevance"].dtype == pl.Utf8
+ and len(output["explanation_context_relevance"])
+ - output["explanation_context_relevance"].null_count()
+ > 0
+ )
+
context_reranking_dataset = pl.DataFrame(
{
- "question": [
- "What are the main causes of climate change?"
- ],
+ "question": ["What are the main causes of climate change?"],
"context": [
"""
1. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes.
@@ -169,7 +264,7 @@ def test_check_context_relevance():
4. Other factors that contribute to climate change include methane emissions from livestock and rice paddies, as well as nitrous oxide emissions from agricultural fertilizers.
5. Changes in land use, such as urbanization and deforestation, also play a role in altering local climates and contributing to global climate change.
""",
- ]
+ ],
}
)
@@ -178,16 +273,27 @@ def test_check_context_reranking():
check = CheckContextReranking()
output = check.setup(settings).run(context_reranking_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_context_reranking" in output.columns and "explanation_context_reranking" in output.columns
- assert output["score_context_reranking"].dtype == pl.Float64 and len(output["score_context_reranking"]) - output["score_context_reranking"].null_count() > 0
- assert output["explanation_context_reranking"].dtype == pl.Utf8 and len(output["explanation_context_reranking"]) - output["explanation_context_reranking"].null_count() > 0
+ assert (
+ "score_context_reranking" in output.columns
+ and "explanation_context_reranking" in output.columns
+ )
+ assert (
+ output["score_context_reranking"].dtype == pl.Float64
+ and len(output["score_context_reranking"])
+ - output["score_context_reranking"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_context_reranking"].dtype == pl.Utf8
+ and len(output["explanation_context_reranking"])
+ - output["explanation_context_reranking"].null_count()
+ > 0
+ )
context_conciseness_dataset = pl.DataFrame(
{
- "question": [
- "What are the main causes of climate change?"
- ],
+ "question": ["What are the main causes of climate change?"],
"context": [
"""
1. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes.
@@ -202,62 +308,130 @@ def test_check_context_reranking():
1. Climate change is primarily driven by human-induced factors, including the release of carbon dioxide and other greenhouse gases into the atmosphere.
2. The main causes of climate change include greenhouse gas emissions from human activities such as burning fossil fuels, deforestation, and industrial processes.
""",
- ]
+ ],
}
)
+
def test_check_context_conciseness():
check = CheckContextConciseness()
output = check.setup(settings).run(context_conciseness_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_context_conciseness" in output.columns and "explanation_context_conciseness" in output.columns
- assert output["score_context_conciseness"].dtype == pl.Float64 and len(output["score_context_conciseness"]) - output["score_context_conciseness"].null_count() > 0
- assert output["explanation_context_conciseness"].dtype == pl.Utf8 and len(output["explanation_context_conciseness"]) - output["explanation_context_conciseness"].null_count() > 0
+ assert (
+ "score_context_conciseness" in output.columns
+ and "explanation_context_conciseness" in output.columns
+ )
+ assert (
+ output["score_context_conciseness"].dtype == pl.Float64
+ and len(output["score_context_conciseness"])
+ - output["score_context_conciseness"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_context_conciseness"].dtype == pl.Utf8
+ and len(output["explanation_context_conciseness"])
+ - output["explanation_context_conciseness"].null_count()
+ > 0
+ )
+
def test_check_response_completeness_wrt_context():
check = CheckResponseCompletenessWrtContext()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_response_completeness_wrt_context" in output.columns and "explanation_response_completeness_wrt_context" in output.columns
- assert output["score_response_completeness_wrt_context"].dtype == pl.Float64 and len(output["score_response_completeness_wrt_context"]) - output["score_response_completeness_wrt_context"].null_count() > 0
- assert output["explanation_response_completeness_wrt_context"].dtype == pl.Utf8 and len(output["explanation_response_completeness_wrt_context"]) - output["explanation_response_completeness_wrt_context"].null_count() > 0
+ assert (
+ "score_response_completeness_wrt_context" in output.columns
+ and "explanation_response_completeness_wrt_context" in output.columns
+ )
+ assert (
+ output["score_response_completeness_wrt_context"].dtype == pl.Float64
+ and len(output["score_response_completeness_wrt_context"])
+ - output["score_response_completeness_wrt_context"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_response_completeness_wrt_context"].dtype == pl.Utf8
+ and len(output["explanation_response_completeness_wrt_context"])
+ - output["explanation_response_completeness_wrt_context"].null_count()
+ > 0
+ )
def test_check_response_facts():
check = CheckResponseFacts()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_factual_accuracy" in output.columns and "explanation_factual_accuracy" in output.columns
- assert output["score_factual_accuracy"].dtype == pl.Float64 and len(output["score_factual_accuracy"]) - output["score_factual_accuracy"].null_count() > 0
- assert output["explanation_factual_accuracy"].dtype == pl.Utf8 and len(output["explanation_factual_accuracy"]) - output["explanation_factual_accuracy"].null_count() > 0
+ assert (
+ "score_factual_accuracy" in output.columns
+ and "explanation_factual_accuracy" in output.columns
+ )
+ assert (
+ output["score_factual_accuracy"].dtype == pl.Float64
+ and len(output["score_factual_accuracy"])
+ - output["score_factual_accuracy"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_factual_accuracy"].dtype == pl.Utf8
+ and len(output["explanation_factual_accuracy"])
+ - output["explanation_factual_accuracy"].null_count()
+ > 0
+ )
# -----------------------------------------------------------
# Language Proficiency
# -----------------------------------------------------------
+
def test_check_language_quality():
check = CheckLanguageQuality()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_critique_language" in output.columns and "explanation_critique_language" in output.columns
- assert output["score_critique_language"].dtype == pl.Float64 and len(output["score_critique_language"]) - output["score_critique_language"].null_count() > 0
- assert output["explanation_critique_language"].dtype == pl.Utf8 and len(output["explanation_critique_language"]) - output["explanation_critique_language"].null_count() > 0
+ assert (
+ "score_critique_language" in output.columns
+ and "explanation_critique_language" in output.columns
+ )
+ assert (
+ output["score_critique_language"].dtype == pl.Float64
+ and len(output["score_critique_language"])
+ - output["score_critique_language"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_critique_language"].dtype == pl.Utf8
+ and len(output["explanation_critique_language"])
+ - output["explanation_critique_language"].null_count()
+ > 0
+ )
def test_check_tone_quality():
check = CheckToneQuality(llm_persona="wikipedia-bot")
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_critique_tone" in output.columns and "explanation_critique_tone" in output.columns
- assert output["score_critique_tone"].dtype == pl.Float64 and len(output["score_critique_tone"]) - output["score_critique_tone"].null_count() > 0
- assert output["explanation_critique_tone"].dtype == pl.Utf8 and len(output["explanation_critique_tone"]) - output["explanation_critique_tone"].null_count() > 0
+ assert (
+ "score_critique_tone" in output.columns
+ and "explanation_critique_tone" in output.columns
+ )
+ assert (
+ output["score_critique_tone"].dtype == pl.Float64
+ and len(output["score_critique_tone"])
+ - output["score_critique_tone"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_critique_tone"].dtype == pl.Utf8
+ and len(output["explanation_critique_tone"])
+ - output["explanation_critique_tone"].null_count()
+ > 0
+ )
# # -----------------------------------------------------------
# # Code Hallucinations
# # -----------------------------------------------------------
-
+
code_hallucination_dataset = pl.DataFrame(
{
"question": [
@@ -265,23 +439,37 @@ def test_check_tone_quality():
"Can I create histograms with different bucket colors in Streamlit",
],
"context": [
- "This property lets you store Python primitives such as integers, floating-point numbers, complex numbers and booleans, dataframes, and even [lambdas](https://docs.python.org/3/reference/expressions.html#lambda) returned by functions. However, some execution environments may require serializing all data in Session State, so it may be useful to detect incompatibility during development, or when the execution environment will stop supporting it in the future.\n\nTo that end, Streamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State. To enable the option, either create a global or project config file with the following or use it as a command-line flag:\n\n\n```\n# .streamlit/config.toml\n[runner]\nenforceSerializableSessionState = true\n\n```\nBy \"*pickle-serializable*\", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception. When the config option is enabled, adding unserializable data to session state should result in an exception. E.g.,\n\n\n```\nimport streamlit as st\n\ndef unserializable_data():\n return lambda x: x\n\n#👇 results in an exception when enforceSerializableSessionState is on\nst.session_state.unserializable = unserializable_data()\n\n```\n",
- "eader(\"Define a custom colorscale\")\ndf = px.data.iris()\nfig = px.scatter(\n df,\n x=\"sepal_width\",\n y=\"sepal_length\",\n color=\"sepal_length\",\n color_continuous_scale=\"reds\",\n)\n\ntab1, tab2 = st.tabs([\"Streamlit theme (default)\", \"Plotly native theme\"])\nwith tab1:\n st.plotly_chart(fig, theme=\"streamlit\", use_container_width=True)\nwith tab2:\n st.plotly_chart(fig, theme=None, use_container_width=True)\n\n```\nNotice how the custom color scale is still reflected in the chart, even when the Streamlit theme is enabled 👇\n\nFor many more examples of Plotly charts with and without the Streamlit theme, check out the [plotly.streamlit.app](https://plotly.streamlit.app).\n\n"
+ 'This property lets you store Python primitives such as integers, floating-point numbers, complex numbers and booleans, dataframes, and even [lambdas](https://docs.python.org/3/reference/expressions.html#lambda) returned by functions. However, some execution environments may require serializing all data in Session State, so it may be useful to detect incompatibility during development, or when the execution environment will stop supporting it in the future.\n\nTo that end, Streamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State. To enable the option, either create a global or project config file with the following or use it as a command-line flag:\n\n\n```\n# .streamlit/config.toml\n[runner]\nenforceSerializableSessionState = true\n\n```\nBy "*pickle-serializable*", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception. When the config option is enabled, adding unserializable data to session state should result in an exception. E.g.,\n\n\n```\nimport streamlit as st\n\ndef unserializable_data():\n return lambda x: x\n\n#👇 results in an exception when enforceSerializableSessionState is on\nst.session_state.unserializable = unserializable_data()\n\n```\n',
+ 'eader("Define a custom colorscale")\ndf = px.data.iris()\nfig = px.scatter(\n df,\n x="sepal_width",\n y="sepal_length",\n color="sepal_length",\n color_continuous_scale="reds",\n)\n\ntab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])\nwith tab1:\n st.plotly_chart(fig, theme="streamlit", use_container_width=True)\nwith tab2:\n st.plotly_chart(fig, theme=None, use_container_width=True)\n\n```\nNotice how the custom color scale is still reflected in the chart, even when the Streamlit theme is enabled 👇\n\nFor many more examples of Plotly charts with and without the Streamlit theme, check out the [plotly.streamlit.app](https://plotly.streamlit.app).\n\n',
],
"response": [
- "Serialization refers to the process of converting an object or data structure into a format that can be persisted and shared, and allowing you to recover the data’s original structure.\n\nBy default, Streamlit’s [Session State](https://docs.streamlit.io/library/advanced-features/session-state) allows you to persist any Python object for the duration of the session, irrespective of the object’s pickle-serializability. \n\nStreamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State.\n\nBy \"*pickle-serializable*\", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception",
- "The `color_continuous_scale` parameter in the `px.scatter` function allows you to define a custom color scale for the plot. You can use any of the built-in color scales provided by Plotly, or you can define your own custom color scale using a list of colors. The `color_continuous_scale` parameter accepts a string that specifies the name of the color scale to use. You can also pass a list of colors to the `color_continuous_scale` parameter to define a custom color scale. The list of colors can be any length, and the colors can be specified using any valid CSS color format, such as hex, RGB, or HSL. The `color_continuous_scale` parameter allows you to create a custom color scale for the plot that is tailored to your specific needs."
+ 'Serialization refers to the process of converting an object or data structure into a format that can be persisted and shared, and allowing you to recover the data’s original structure.\n\nBy default, Streamlit’s [Session State](https://docs.streamlit.io/library/advanced-features/session-state) allows you to persist any Python object for the duration of the session, irrespective of the object’s pickle-serializability. \n\nStreamlit provides a `runner.enforceSerializableSessionState` [configuration option](https://docs.streamlit.io/library/advanced-features/configuration) that, when set to `true`, only allows pickle-serializable objects in Session State.\n\nBy "*pickle-serializable*", we mean calling `pickle.dumps(obj)` should not raise a [`PicklingError`](https://docs.python.org/3/library/pickle.html#pickle.PicklingError) exception',
+ "The `color_continuous_scale` parameter in the `px.scatter` function allows you to define a custom color scale for the plot. You can use any of the built-in color scales provided by Plotly, or you can define your own custom color scale using a list of colors. The `color_continuous_scale` parameter accepts a string that specifies the name of the color scale to use. You can also pass a list of colors to the `color_continuous_scale` parameter to define a custom color scale. The list of colors can be any length, and the colors can be specified using any valid CSS color format, such as hex, RGB, or HSL. The `color_continuous_scale` parameter allows you to create a custom color scale for the plot that is tailored to your specific needs.",
],
}
)
+
def test_check_code_hallucination():
check = CheckCodeHallucination()
output = check.setup(settings).run(code_hallucination_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_code_hallucination" in output.columns and "explanation_code_hallucination" in output.columns
- assert output["score_code_hallucination"].dtype == pl.Float64 and len(output["score_code_hallucination"]) - output["score_code_hallucination"].null_count() > 0
- assert output["explanation_code_hallucination"].dtype == pl.Utf8 and len(output["explanation_code_hallucination"]) - output["explanation_code_hallucination"].null_count() > 0
+ assert (
+ "score_code_hallucination" in output.columns
+ and "explanation_code_hallucination" in output.columns
+ )
+ assert (
+ output["score_code_hallucination"].dtype == pl.Float64
+ and len(output["score_code_hallucination"])
+ - output["score_code_hallucination"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_code_hallucination"].dtype == pl.Utf8
+ and len(output["explanation_code_hallucination"])
+ - output["explanation_code_hallucination"].null_count()
+ > 0
+ )
# # -----------------------------------------------------------
@@ -308,37 +496,69 @@ def test_check_code_hallucination():
Doctor: You should try to rest your knee.
Patient: I have been resting it for a few days now.
Doctor: I don't know what else to suggest.
- """
+ """,
]
}
)
+
def test_check_conversation_satisfaction():
check = CheckConversationSatisfaction(user_role="Patient", llm_role="Doctor")
output = check.setup(settings).run(conversation_satisfaction_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_conversation_satisfaction" in output.columns and "explanation_conversation_satisfaction" in output.columns
- assert output["score_conversation_satisfaction"].dtype == pl.Float64 and len(output["score_conversation_satisfaction"]) - output["score_conversation_satisfaction"].null_count() > 0
- assert output["explanation_conversation_satisfaction"].dtype == pl.Utf8 and len(output["explanation_conversation_satisfaction"]) - output["explanation_conversation_satisfaction"].null_count() > 0
+ assert (
+ "score_conversation_satisfaction" in output.columns
+ and "explanation_conversation_satisfaction" in output.columns
+ )
+ assert (
+ output["score_conversation_satisfaction"].dtype == pl.Float64
+ and len(output["score_conversation_satisfaction"])
+ - output["score_conversation_satisfaction"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_conversation_satisfaction"].dtype == pl.Utf8
+ and len(output["explanation_conversation_satisfaction"])
+ - output["explanation_conversation_satisfaction"].null_count()
+ > 0
+ )
# -----------------------------------------------------------
# Custom Evaluations
# -----------------------------------------------------------
-
+
+
def test_check_guideline_adherence():
- check = CheckGuidelineAdherence(guideline="The response should not contain any numbers or statistic", guideline_name="guideline", response_schema=None)
+ check = CheckGuidelineAdherence(
+        guideline="The response should not contain any numbers or statistics",
+ guideline_name="guideline",
+ response_schema=None,
+ )
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_guideline_adherence" in output.columns and "explanation_guideline_adherence" in output.columns
- assert output["score_guideline_adherence"].dtype == pl.Float64 and len(output["score_guideline_adherence"]) - output["score_guideline_adherence"].null_count() > 0
- assert output["explanation_guideline_adherence"].dtype == pl.Utf8 and len(output["explanation_guideline_adherence"]) - output["explanation_guideline_adherence"].null_count() > 0
+ assert (
+ "score_guideline_adherence" in output.columns
+ and "explanation_guideline_adherence" in output.columns
+ )
+ assert (
+ output["score_guideline_adherence"].dtype == pl.Float64
+ and len(output["score_guideline_adherence"])
+ - output["score_guideline_adherence"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_guideline_adherence"].dtype == pl.Utf8
+ and len(output["explanation_guideline_adherence"])
+ - output["explanation_guideline_adherence"].null_count()
+ > 0
+ )
# # -----------------------------------------------------------
# # Compare response with ground truth
# # -----------------------------------------------------------
-
+
# def test_check_response_matching():
# check = CheckResponseMatching()
# output = check.setup(settings).run(dataset)
@@ -351,29 +571,56 @@ def test_check_guideline_adherence():
# -----------------------------------------------------------
# Security
# -----------------------------------------------------------
-
+
+
def test_check_prompt_injection():
check = CheckPromptInjection()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_prompt_injection" in output.columns and "explanation_prompt_injection" in output.columns
- assert output["score_prompt_injection"].dtype == pl.Float64 and len(output["score_prompt_injection"]) - output["score_prompt_injection"].null_count() > 0
- assert output["explanation_prompt_injection"].dtype == pl.Utf8 and len(output["explanation_prompt_injection"]) - output["explanation_prompt_injection"].null_count() > 0
+ assert (
+ "score_prompt_injection" in output.columns
+ and "explanation_prompt_injection" in output.columns
+ )
+ assert (
+ output["score_prompt_injection"].dtype == pl.Float64
+ and len(output["score_prompt_injection"])
+ - output["score_prompt_injection"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_prompt_injection"].dtype == pl.Utf8
+ and len(output["explanation_prompt_injection"])
+ - output["explanation_prompt_injection"].null_count()
+ > 0
+ )
def test_check_jailbreak_detection():
check = CheckJailbreakDetection()
output = check.setup(settings).run(dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_jailbreak_attempted" in output.columns and "explanation_jailbreak_attempted" in output.columns
- assert output["score_jailbreak_attempted"].dtype == pl.Float64 and len(output["score_jailbreak_attempted"]) - output["score_jailbreak_attempted"].null_count() > 0
- assert output["explanation_jailbreak_attempted"].dtype == pl.Utf8 and len(output["explanation_jailbreak_attempted"]) - output["explanation_jailbreak_attempted"].null_count() > 0
+ assert (
+ "score_jailbreak_attempted" in output.columns
+ and "explanation_jailbreak_attempted" in output.columns
+ )
+ assert (
+ output["score_jailbreak_attempted"].dtype == pl.Float64
+ and len(output["score_jailbreak_attempted"])
+ - output["score_jailbreak_attempted"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_jailbreak_attempted"].dtype == pl.Utf8
+ and len(output["explanation_jailbreak_attempted"])
+ - output["explanation_jailbreak_attempted"].null_count()
+ > 0
+ )
# -----------------------------------------------------------
# Sub Query
# -----------------------------------------------------------
-
+
sub_query_dataset = pl.DataFrame(
{
"question": [
@@ -411,11 +658,25 @@ def test_check_jailbreak_detection():
],
}
)
-
+
+
def test_check_sub_query_completeness():
check = CheckSubQueryCompleteness()
output = check.setup(settings).run(sub_query_dataset)
assert isinstance(output, pl.DataFrame)
- assert "score_sub_query_completeness" in output.columns and "explanation_sub_query_completeness" in output.columns
- assert output["score_sub_query_completeness"].dtype == pl.Float64 and len(output["score_sub_query_completeness"]) - output["score_sub_query_completeness"].null_count() > 0
- assert output["explanation_sub_query_completeness"].dtype == pl.Utf8 and len(output["explanation_sub_query_completeness"]) - output["explanation_sub_query_completeness"].null_count() > 0
+ assert (
+ "score_sub_query_completeness" in output.columns
+ and "explanation_sub_query_completeness" in output.columns
+ )
+ assert (
+ output["score_sub_query_completeness"].dtype == pl.Float64
+ and len(output["score_sub_query_completeness"])
+ - output["score_sub_query_completeness"].null_count()
+ > 0
+ )
+ assert (
+ output["explanation_sub_query_completeness"].dtype == pl.Utf8
+ and len(output["explanation_sub_query_completeness"])
+ - output["explanation_sub_query_completeness"].null_count()
+ > 0
+ )
diff --git a/uptrain/cli.py b/uptrain/cli.py
index 37ae889b4..d1db0751a 100644
--- a/uptrain/cli.py
+++ b/uptrain/cli.py
@@ -18,7 +18,7 @@
"tqdm>=4.0",
],
"st_classic": [
- "plotly>=5.0.0",
+ "plotly>=5.0.0",
"streamlit>=1.23",
"pyarrow>=10.0.0",
],
diff --git a/uptrain/dashboard/backend/app.py b/uptrain/dashboard/backend/app.py
index 3b6556459..16c747fac 100644
--- a/uptrain/dashboard/backend/app.py
+++ b/uptrain/dashboard/backend/app.py
@@ -41,12 +41,7 @@
from loguru import logger
from sqlalchemy.orm import Session
-from uptrain.utilities.db import (
- create_database,
- ModelDataset,
- ModelUser,
- ModelPrompt
-)
+from uptrain.utilities.db import create_database, ModelDataset, ModelUser, ModelPrompt
from uptrain.utilities.utils import (
get_sqlite_utils_db,
_get_fsspec_filesystem,
@@ -54,7 +49,7 @@
convert_project_to_dicts,
checks_mapping,
create_dirs,
- get_current_datetime
+ get_current_datetime,
)
from uptrain.utilities import polars_to_pandas
@@ -74,9 +69,10 @@ def _row_to_dict(row):
ACCESS_TOKEN = APIKeyHeader(name="uptrain-access-token", auto_error=False)
# database
-#/data/uptrain-server.db"
+# /data/uptrain-server.db
create_dirs(DATABASE_PATH)
-SessionLocal = create_database("sqlite:///" + DATABASE_PATH + 'uptrain-local-server.db')
+SessionLocal = create_database("sqlite:///" + DATABASE_PATH + "uptrain-local-server.db")
+
def _create_user(db: Session, name: str):
"""Create a new user."""
@@ -90,6 +86,7 @@ def _create_user(db: Session, name: str):
db.rollback()
raise exc
+
def get_db():
"""Get the database session."""
db = SessionLocal()
@@ -98,9 +95,10 @@ def get_db():
finally:
SessionLocal.remove()
+
try:
_create_user(SessionLocal(), "default_key")
-except:
+except Exception:
pass
# some methods need a context manager to get the db
@@ -115,9 +113,7 @@ def get_fsspec_fs():
pass
-async def validate_api_key_public(
- key_header: str = Security(ACCESS_TOKEN)
-) -> str:
+async def validate_api_key_public(key_header: str = Security(ACCESS_TOKEN)) -> str:
"""Validate API key and return the user id.
For public API, the API key is the access token provided to them by uptrain and we
@@ -128,16 +124,13 @@ async def validate_api_key_public(
raise HTTPException(status_code=403, detail="Unspecified API key")
else:
with get_db_context() as db:
- db_item = (
- db.query(ModelUser).filter_by(name=key_header).first()
- )
+ db_item = db.query(ModelUser).filter_by(name=key_header).first()
if db_item is not None:
return db_item.id
else:
raise HTTPException(status_code=403, detail="Invalid API key")
-
# -----------------------------------------------------------
# Routers
# -----------------------------------------------------------
@@ -146,9 +139,10 @@ async def validate_api_key_public(
router_internal = APIRouter()
# -----------------------------------------------------------
-# Internal API
+# Internal API
# -----------------------------------------------------------
+
@router_internal.post("/user")
def add_user(user: app_schema.UserCreate, db: Session = Depends(get_db)):
"""Add a new user."""
@@ -173,15 +167,17 @@ def add_user(user: app_schema.UserCreate, db: Session = Depends(get_db)):
# Request to get user name, API key, user credits used and total using api key
@router_public.post("/user")
def get_user(
- user_id: str = Depends(validate_api_key_public),
- db: Session = Depends(get_db)
+ user_id: str = Depends(validate_api_key_public), db: Session = Depends(get_db)
):
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=404, detail="User not found")
else:
- return {"id" : user_id, "user_name" : "open-source user", "api_key" : "default_key"}
-
+ return {
+ "id": user_id,
+ "user_name": "open-source user",
+ "api_key": "default_key",
+ }
@router_public.get("/get_project_data", response_model=app_schema.ProjectData)
@@ -191,15 +187,14 @@ def get_project_data(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the data for a particular project_name for the given user.
- """
+ """Get all the data for a particular project_name for the given user."""
projects = get_projects_list(num_days=num_days, db=db, user_id=user_id)
for project in projects.data:
if project["project"] == project_name:
run_via = project["run_via"]
if run_via == "project" or run_via == "experiment":
- if run_via == 'project':
+ if run_via == "project":
query = f"""
SELECT *
FROM results
@@ -211,13 +206,15 @@ def get_project_data(
FROM results
WHERE project = '{project_name}' AND metadata LIKE '%uptrain_experiment_columns%' AND timestamp > datetime('now', '-{num_days} days')
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(
+ DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db"
+ )
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
-
+
buffer = io.StringIO()
for row in DB.query(query):
buffer.write(json.dumps(row) + "\n")
@@ -228,27 +225,65 @@ def get_project_data(
for key in details:
try:
details[key] = json.loads(details[key])
- except:
+ except Exception:
pass
data.append(details)
- scores = [col[6:] for col in data[0]['checks'].keys() if col.startswith("score_")]
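+                    # Keep only the score_* columns, dropping the 6-character "score_" prefix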
+ scores = [
+ col[6:]
+ for col in data[0]["checks"].keys()
+ if col.startswith("score_")
+ ]
if run_via == "project":
- return app_schema.ProjectData(data = [data, None, project["latest_timestamp"][:10], None, scores], project_name = project_name)
+ return app_schema.ProjectData(
+ data=[
+ data,
+ None,
+ project["latest_timestamp"][:10],
+ None,
+ scores,
+ ],
+ project_name=project_name,
+ )
else:
exp_data = convert_project_to_polars(data)
exp_column = str(exp_data["uptrain_experiment_columns"][0][0])
plot_data = {}
for col in scores:
- col_name = 'score_' + col
- plot_data.update({col : exp_data.group_by([exp_column], maintain_order=True).agg(pl.col(col_name)).to_dicts()})
+ col_name = "score_" + col
+ plot_data.update(
+ {
+ col: exp_data.group_by(
+ [exp_column], maintain_order=True
+ )
+ .agg(pl.col(col_name))
+ .to_dicts()
+ }
+ )
columns = exp_data.columns
- columns.remove('question')
- display_data = exp_data.group_by(["question"], maintain_order=True).agg(pl.col(col) for col in columns).to_dicts()
- unqiue_values = list(set(exp_data[exp_column].to_list()))
- return app_schema.ProjectData(data = [display_data, None, project["latest_timestamp"][:10], None, scores, unqiue_values, exp_column, plot_data], project_name = project_name)
-
+ columns.remove("question")
+ display_data = (
+ exp_data.group_by(["question"], maintain_order=True)
+ .agg(pl.col(col) for col in columns)
+ .to_dicts()
+ )
+                    unique_values = list(set(exp_data[exp_column].to_list()))
+ return app_schema.ProjectData(
+ data=[
+ display_data,
+ None,
+ project["latest_timestamp"][:10],
+ None,
+ scores,
+                            unique_values,
+ exp_column,
+ plot_data,
+ ],
+ project_name=project_name,
+ )
+
+
@router_public.get("/get_prompt_data", response_model=app_schema.ProjectData)
def get_prompt_data(
project_name: str,
@@ -256,8 +291,7 @@ def get_prompt_data(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the data for a particular project_name for the given user.
- """
+ """Get all the data for a particular project_name for the given user."""
projects = get_projects_list(num_days=num_days, db=db, user_id=user_id)
for project in projects.data:
@@ -269,13 +303,15 @@ def get_prompt_data(
FROM results
WHERE project = '{project_name}' AND metadata like '%prompt_version%' AND metadata NOT LIKE '%uptrain_experiment_columns%' AND timestamp > datetime('now', '-{num_days} days')
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(
+ DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db"
+ )
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
-
+
buffer = io.StringIO()
for row in DB.query(query):
buffer.write(json.dumps(row) + "\n")
@@ -286,34 +322,40 @@ def get_prompt_data(
for key in details:
try:
details[key] = json.loads(details[key])
- except:
+ except Exception:
pass
data.append(details)
exp_data, checks_mapping = convert_project_to_dicts(data)
-
+
columns = exp_data.columns
- columns.remove('prompt_name')
- columns.remove('prompt_version')
- data = exp_data.group_by(['prompt_name', 'prompt_version'], maintain_order=True).agg(pl.col(col) for col in columns)
+ columns.remove("prompt_name")
+ columns.remove("prompt_version")
+ data = exp_data.group_by(
+ ["prompt_name", "prompt_version"], maintain_order=True
+ ).agg(pl.col(col) for col in columns)
columns = data.columns
- columns.remove('prompt_name')
- data = data.group_by(['prompt_name'], maintain_order=True).agg(pl.col(col) for col in columns).to_dicts()
-
+ columns.remove("prompt_name")
+ data = (
+ data.group_by(["prompt_name"], maintain_order=True)
+ .agg(pl.col(col) for col in columns)
+ .to_dicts()
+ )
+
for row in data:
- row['scores'] = []
- uuid_tags_version = row['uuid_tag']
+ row["scores"] = []
+ uuid_tags_version = row["uuid_tag"]
for uuid_tags in uuid_tags_version:
scores = []
for uuid in uuid_tags:
score = checks_mapping[uuid]
scores.append(score)
- row['scores'].append(pl.DataFrame(scores).mean().to_dicts()[0])
+ row["scores"].append(pl.DataFrame(scores).mean().to_dicts()[0])
res = []
for prompt in data:
prompt_data = []
- num_versions = len(prompt['prompt_version'])
+ num_versions = len(prompt["prompt_version"])
for i in range(num_versions):
prompt_v = {}
for key, value in prompt.items():
@@ -323,16 +365,32 @@ def get_prompt_data(
# Remove the explanations from the scores
elif key == "scores":
try:
- value = [{k: round(float(v), 3) for k, v in score.items() if not k.startswith("explanation")} for score in value]
- except:
- value = [{k: v for k, v in score.items() if not k.startswith("explanation")} for score in value]
+ value = [
+ {
+ k: round(float(v), 3)
+ for k, v in score.items()
+ if not k.startswith("explanation")
+ }
+ for score in value
+ ]
+ except Exception:
+ value = [
+ {
+ k: v
+ for k, v in score.items()
+ if not k.startswith("explanation")
+ }
+ for score in value
+ ]
# Handle cases where the value is a list or a string
if isinstance(value, list):
prompt_v[key] = value[i]
else:
prompt_v[key] = value
prompt_data.append(prompt_v)
- res.append({"prompt_name": prompt["prompt_name"], "prompts": prompt_data})
+ res.append(
+ {"prompt_name": prompt["prompt_name"], "prompts": prompt_data}
+ )
return app_schema.ProjectData(data=res, project_name=project_name)
@@ -342,7 +400,7 @@ async def add_project_data(
user_id: str = Depends(validate_api_key_public),
db: Session = Depends(get_db),
):
-
+
fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
DB = get_sqlite_utils_db(fpath)
@@ -352,7 +410,7 @@ async def add_project_data(
checks = eval_args.checks
project = eval_args.project
timestamp = get_current_datetime()
- try:
+ try:
DB["results"].insert_all(
[
{
@@ -361,16 +419,18 @@ async def add_project_data(
"metadata": metadata,
"schema": schema,
"project": project,
- "timestamp": timestamp
+ "timestamp": timestamp,
}
- for row_data, row_check in zip(results, checks)
+ for row_data, row_check in zip(results, checks)
]
)
except Exception as e:
logger.exception(f"Error running the eval: {e}")
- raise HTTPException(status_code=500, detail=f"Error saving the data for the project: {e}")
+ raise HTTPException(
+ status_code=500, detail=f"Error saving the data for the project: {e}"
+ )
+
-
@router_public.get("/get_projects_list", response_model=app_schema.ProjectsList)
def get_projects_list(
num_days: int = 200,
@@ -379,8 +439,7 @@ def get_projects_list(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the project names associated with the user.
- """
+ """Get all the project names associated with the user."""
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=403, detail="Invalid user name")
@@ -395,15 +454,15 @@ def get_projects_list(
ORDER BY latest_timestamp DESC
LIMIT {limit}
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
- experiment_runs = DB.query(query)
- except:
+ experiment_runs = DB.query(query)
+ except Exception:
experiment_runs = []
try:
@@ -415,20 +474,22 @@ def get_projects_list(
ORDER BY latest_timestamp DESC
LIMIT {limit}
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
- project_runs = DB.query(query)
- except:
+ project_runs = DB.query(query)
+ except Exception:
project_runs = []
try:
- prompts_runs = get_prompts_list(num_days=num_days, limit=limit, db=db, user_id=user_id)
- except:
+ prompts_runs = get_prompts_list(
+ num_days=num_days, limit=limit, db=db, user_id=user_id
+ )
+ except Exception:
prompts_runs = []
out = []
@@ -447,7 +508,7 @@ def get_projects_list(
{
"project": run["project"],
"latest_timestamp": run["latest_timestamp"],
- "run_via": "experiment"
+ "run_via": "experiment",
}
)
@@ -456,12 +517,12 @@ def get_projects_list(
{
"project": run["project"],
"latest_timestamp": run["latest_timestamp"],
- "run_via": "prompt"
+ "run_via": "prompt",
}
)
out.sort(reverse=True, key=lambda x: x["latest_timestamp"])
- return app_schema.ProjectsList(data = out, user_name = user_name)
+ return app_schema.ProjectsList(data=out, user_name=user_name)
@router_public.get("/get_evaluations_list", response_model=app_schema.ProjectsList)
@@ -472,8 +533,7 @@ def get_evaluations_list(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the project names associated with the user.
- """
+ """Get all the project names associated with the user."""
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=403, detail="Invalid user name")
@@ -490,19 +550,19 @@ def get_evaluations_list(
LIMIT {limit}
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
- project_runs = DB.query(query)
- except:
+ project_runs = DB.query(query)
+ except Exception:
project_runs = []
out = []
-
+
for run in project_runs:
out.append(
{
@@ -513,7 +573,7 @@ def get_evaluations_list(
)
out.sort(reverse=True, key=lambda x: x["latest_timestamp"])
- return app_schema.ProjectsList(data = out, user_name = user_name)
+ return app_schema.ProjectsList(data=out, user_name=user_name)
@router_public.get("/get_experiments_list", response_model=app_schema.ProjectsList)
@@ -523,8 +583,7 @@ def get_experiments_list(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the experiment names associated with the user.
- """
+ """Get all the experiment names associated with the user."""
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=403, detail="Invalid user name")
@@ -541,15 +600,15 @@ def get_experiments_list(
LIMIT {limit}
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
- project_runs = DB.query(query)
- except:
+ project_runs = DB.query(query)
+ except Exception:
project_runs = []
out = []
@@ -564,7 +623,7 @@ def get_experiments_list(
)
out.sort(reverse=True, key=lambda x: x["latest_timestamp"])
- return app_schema.ProjectsList(data = out, user_name = user_name)
+ return app_schema.ProjectsList(data=out, user_name=user_name)
@router_public.get("/get_prompts_list", response_model=app_schema.ProjectsList)
@@ -574,8 +633,7 @@ def get_prompts_list(
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
- """Get all the experiment names associated with the user.
- """
+    """Get all the prompt names associated with the user."""
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=403, detail="Invalid user name")
@@ -591,15 +649,15 @@ def get_prompts_list(
ORDER BY latest_timestamp DESC
LIMIT {limit}
"""
- fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results" , f"{user_id}.db")
+ fpath = os.path.join(DATABASE_PATH, "uptrain-eval-results", f"{user_id}.db")
if not os.path.exists(fpath):
raise HTTPException(
status_code=404, detail="No evaluations run yet for this user"
)
DB = get_sqlite_utils_db(fpath)
- prompts_runs = DB.query(query)
- except:
+ prompts_runs = DB.query(query)
+ except Exception:
prompts_runs = []
out = []
@@ -614,12 +672,12 @@ def get_prompts_list(
)
out.sort(reverse=True, key=lambda x: x["latest_timestamp"])
- return app_schema.ProjectsList(data = out, user_name = user_name)
+ return app_schema.ProjectsList(data=out, user_name=user_name)
@router_public.post("/find_common_topic")
async def find_common_topic(
- args : app_schema.TopicGenerate,
+ args: app_schema.TopicGenerate,
db: Session = Depends(get_db),
user_id: str = Depends(validate_api_key_public),
):
@@ -632,22 +690,32 @@ async def find_common_topic(
for elem in dataset:
if elem[1] is not None and elem[1] == 0.0:
refined_items.append(elem[0])
-
- refined_items = refined_items[:min(50, len(refined_items))]
- data = list(map(lambda x: {'question': x, 'cluster_index' : 0, 'cluster_index_distance' : 0}, refined_items))
+
+ refined_items = refined_items[: min(50, len(refined_items))]
+ data = list(
+ map(
+ lambda x: {"question": x, "cluster_index": 0, "cluster_index_distance": 0},
+ refined_items,
+ )
+ )
from uptrain.operators import TopicGenerator
+
user = db.query(ModelUser).filter_by(id=user_id).first()
if user is None:
raise HTTPException(status_code=403, detail="Invalid user name")
else:
user_name = user.name
-
- user_headers={"openai_api_key": user_name}
-
+
+ user_headers = {"openai_api_key": user_name}
+
try:
- result = TopicGenerator().setup(Settings(**user_headers)).run(pl.DataFrame(data))['output']
- return {'common_topic': result.to_dicts()[0]['topic']}
+ result = (
+ TopicGenerator()
+ .setup(Settings(**user_headers))
+ .run(pl.DataFrame(data))["output"]
+ )
+ return {"common_topic": result.to_dicts()[0]["topic"]}
except Exception as exc:
logger.exception("Error creating run")
db.rollback()
@@ -667,7 +735,7 @@ async def add_evaluation(
fsspec_fs: t.Any = Depends(get_fsspec_fs),
):
## project key would be present in the eval_args.metadata
-
+
existing_dataset = (
db.query(ModelDataset)
.filter_by(name=dataset_name, user_id=user_id)
@@ -718,18 +786,31 @@ async def add_evaluation(
checks_1.append(final_check)
settings_data = {}
- settings_data['model'] = model
+ settings_data["model"] = model
settings_data.update(metadata[model])
try:
from uptrain import EvalLLM
+
user_client = EvalLLM(Settings(**settings_data))
- data = JsonReader(fpath = os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)).setup(Settings()).run()['output'].to_dicts()
- results = user_client.evaluate(data=data, checks=checks_1, project_name=project_name)
+ data = (
+ JsonReader(
+ fpath=os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)
+ )
+ .setup(Settings())
+ .run()["output"]
+ .to_dicts()
+ )
+ results = user_client.evaluate(
+ data=data, checks=checks_1, project_name=project_name
+ )
return {"message": f"Evaluation has been queued up"}
except Exception as e:
logger.exception(f"Error running the eval: {e}")
- raise HTTPException(status_code=500, detail=f"Error running the evaluation: {e}")
+ raise HTTPException(
+ status_code=500, detail=f"Error running the evaluation: {e}"
+ )
+
@router_public.post("/add_prompts")
async def add_prompts(
@@ -752,7 +833,7 @@ async def add_prompts(
raise HTTPException(status_code=403, detail="Invalid user name")
else:
user_name = user.name
-
+
existing_dataset = (
db.query(ModelDataset)
.filter_by(name=dataset_name, user_id=user_id)
@@ -787,7 +868,7 @@ async def add_prompts(
raise HTTPException(
status_code=400, detail="Error adding/updating dataset to platform"
)
-
+
existing_prompt = (
db.query(ModelPrompt)
.filter_by(name=prompt_name, user_id=user_id)
@@ -800,10 +881,7 @@ async def add_prompts(
version = 1
try:
db_item = ModelPrompt(
- user_id=user_id,
- name=prompt_name,
- version=version,
- prompt=prompt
+ user_id=user_id, name=prompt_name, version=version, prompt=prompt
)
db.add(db_item)
db.commit()
@@ -813,7 +891,7 @@ async def add_prompts(
raise HTTPException(
status_code=400, detail="Error adding/updating prompts to platform"
)
-
+
checks = eval(checks[0])
checks_1 = []
metadata = eval(metadata)
@@ -828,31 +906,46 @@ async def add_prompts(
checks_1.append(final_check)
settings_data = {}
- settings_data['model'] = model
+ settings_data["model"] = model
settings_data.update(metadata[model])
from uptrain.operators import JsonReader
from uptrain import Settings as UserSettings
metadata = None
- metadata = {'project': project_name, 'prompt': prompt, 'prompt_name': prompt_name,'prompt_version': version, 'model': model}
+ metadata = {
+ "project": project_name,
+ "prompt": prompt,
+ "prompt_name": prompt_name,
+ "prompt_version": version,
+ "model": model,
+ }
try:
from uptrain import EvalLLM
+
user_client = EvalLLM(Settings(**settings_data))
- data = JsonReader(fpath = os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)).setup(UserSettings()).run()['output'].to_dicts()
+ data = (
+ JsonReader(
+ fpath=os.path.join(DATABASE_PATH, "uptrain-datasets", name_w_version)
+ )
+ .setup(UserSettings())
+ .run()["output"]
+ .to_dicts()
+ )
results = user_client.evaluate_prompts(
project_name=project_name,
- data=data,
+ data=data,
checks=checks_1,
- prompt=prompt,
- metadata=metadata
- )
+ prompt=prompt,
+ metadata=metadata,
+ )
return {"message": f"Evaluation has been queued up"}
except Exception as e:
logger.exception(f"Error running the eval: {e}")
- raise HTTPException(status_code=500, detail=f"Error running the evaluation: {e}")
-
+ raise HTTPException(
+ status_code=500, detail=f"Error running the evaluation: {e}"
+ )
# -----------------------------------------------------------
@@ -873,4 +966,4 @@ async def add_prompts(
app.include_router(router_internal, prefix="/api/internal", tags=["internal"])
if __name__ == "__main__":
- uvicorn.run("app:app", host="0.0.0.0", port=4300, workers=3)
\ No newline at end of file
+ uvicorn.run("app:app", host="0.0.0.0", port=4300, workers=3)
diff --git a/uptrain/dashboard/backend/nest_asyncio.py b/uptrain/dashboard/backend/nest_asyncio.py
index 718b89a23..4c96811c6 100644
--- a/uptrain/dashboard/backend/nest_asyncio.py
+++ b/uptrain/dashboard/backend/nest_asyncio.py
@@ -26,7 +26,7 @@ def run(main, *, debug=False):
try:
loop = asyncio.get_event_loop()
except RuntimeError as e:
- if str(e).startswith('There is no current event loop in thread'):
+ if str(e).startswith("There is no current event loop in thread"):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
else:
@@ -48,19 +48,20 @@ def _get_event_loop(stacklevel=3):
return loop
# Use module level _current_tasks, all_tasks and patch run method.
- if hasattr(asyncio, '_nest_patched'):
+ if hasattr(asyncio, "_nest_patched"):
return
if sys.version_info >= (3, 6, 0):
- asyncio.Task = asyncio.tasks._CTask = asyncio.tasks.Task = \
- asyncio.tasks._PyTask
- asyncio.Future = asyncio.futures._CFuture = asyncio.futures.Future = \
+ asyncio.Task = asyncio.tasks._CTask = asyncio.tasks.Task = asyncio.tasks._PyTask
+ asyncio.Future = asyncio.futures._CFuture = asyncio.futures.Future = (
asyncio.futures._PyFuture
+ )
if sys.version_info < (3, 7, 0):
asyncio.tasks._current_tasks = asyncio.tasks.Task._current_tasks
asyncio.all_tasks = asyncio.tasks.Task.all_tasks
if sys.version_info >= (3, 9, 0):
- events._get_event_loop = events.get_event_loop = \
- asyncio.get_event_loop = _get_event_loop
+ events._get_event_loop = events.get_event_loop = asyncio.get_event_loop = (
+ _get_event_loop
+ )
asyncio.run = run
asyncio._nest_patched = True
@@ -100,8 +101,7 @@ def run_until_complete(self, future):
if self._stopping:
break
if not f.done():
- raise RuntimeError(
- 'Event loop stopped before Future completed.')
+ raise RuntimeError("Event loop stopped before Future completed.")
return f.result()
def _run_once(self):
@@ -115,10 +115,14 @@ def _run_once(self):
heappop(scheduled)
timeout = (
- 0 if ready or self._stopping
- else min(max(
- scheduled[0]._when - self.time(), 0), 86400) if scheduled
- else None)
+ 0
+ if ready or self._stopping
+ else (
+ min(max(scheduled[0]._when - self.time(), 0), 86400)
+ if scheduled
+ else None
+ )
+ )
event_list = self._selector.select(timeout)
self._process_events(event_list)
@@ -164,8 +168,10 @@ def manage_run(self):
events._set_running_loop(old_running_loop)
self._num_runs_pending -= 1
if self._is_proactorloop:
- if (self._num_runs_pending == 0
- and self._self_reading_future is not None):
+ if (
+ self._num_runs_pending == 0
+ and self._self_reading_future is not None
+ ):
ov = self._self_reading_future._ov
self._self_reading_future.cancel()
if ov is not None:
@@ -174,7 +180,7 @@ def manage_run(self):
@contextmanager
def manage_asyncgens(self):
- if not hasattr(sys, 'get_asyncgen_hooks'):
+ if not hasattr(sys, "get_asyncgen_hooks"):
# Python version is too old.
return
old_agen_hooks = sys.get_asyncgen_hooks()
@@ -183,7 +189,8 @@ def manage_asyncgens(self):
if self._asyncgens is not None:
sys.set_asyncgen_hooks(
firstiter=self._asyncgen_firstiter_hook,
- finalizer=self._asyncgen_finalizer_hook)
+ finalizer=self._asyncgen_finalizer_hook,
+ )
yield
finally:
self._set_coroutine_origin_tracking(False)
@@ -194,10 +201,10 @@ def _check_running(self):
"""Do not throw exception if loop is already running."""
pass
- if hasattr(loop, '_nest_patched'):
+ if hasattr(loop, "_nest_patched"):
return
if not isinstance(loop, asyncio.BaseEventLoop):
- raise ValueError('Can\'t patch loop of type %s' % type(loop))
+ raise ValueError("Can't patch loop of type %s" % type(loop))
cls = loop.__class__
cls.run_forever = run_forever
cls.run_until_complete = run_until_complete
@@ -205,12 +212,16 @@ def _check_running(self):
cls._check_running = _check_running
cls._check_runnung = _check_running # typo in Python 3.7 source
cls._num_runs_pending = 1 if loop.is_running() else 0
- cls._is_proactorloop = (
- os.name == 'nt' and issubclass(cls, asyncio.ProactorEventLoop))
+ cls._is_proactorloop = os.name == "nt" and issubclass(
+ cls, asyncio.ProactorEventLoop
+ )
if sys.version_info < (3, 7, 0):
cls._set_coroutine_origin_tracking = cls._set_coroutine_wrapper
- curr_tasks = asyncio.tasks._current_tasks \
- if sys.version_info >= (3, 7, 0) else asyncio.Task._current_tasks
+ curr_tasks = (
+ asyncio.tasks._current_tasks
+ if sys.version_info >= (3, 7, 0)
+ else asyncio.Task._current_tasks
+ )
cls._nest_patched = True
@@ -219,8 +230,9 @@ def _patch_tornado():
If tornado is imported before nest_asyncio, make tornado aware of
the pure-Python asyncio Future.
"""
- if 'tornado' in sys.modules:
+ if "tornado" in sys.modules:
import tornado.concurrent as tc # type: ignore
+
tc.Future = asyncio.Future
if asyncio.Future not in tc.FUTURES:
- tc.FUTURES += (asyncio.Future,)
\ No newline at end of file
+ tc.FUTURES += (asyncio.Future,)
diff --git a/uptrain/framework/base.py b/uptrain/framework/base.py
index 773ce8120..44ce93891 100644
--- a/uptrain/framework/base.py
+++ b/uptrain/framework/base.py
@@ -8,14 +8,15 @@
from loguru import logger
import networkx as nx
import polars as pl
-from pydantic import BaseSettings, Field
+from pydantic import Field
from uptrain.operators.base import (
Operator,
- deserialize_operator,
TransformOp,
+ deserialize_operator,
)
from uptrain.utilities import to_py_types, jsondump, jsonload
+from pydantic_settings import BaseSettings, SettingsConfigDict
__all__ = [
"OperatorDAG",
@@ -27,32 +28,46 @@ class Settings(BaseSettings):
# uptrain stores logs in this folder
logs_folder: str = "/tmp/uptrain-logs"
# external api related
- openai_api_key: str = Field(None, env="OPENAI_API_KEY")
- cohere_api_key: str = Field(None, env="COHERE_API_KEY")
- huggingface_api_key: str = Field(None, env="HUGGINGFACE_API_KEY")
- anthropic_api_key: str = Field(None, env="ANTHROPIC_API_KEY")
- replicate_api_token: str = Field(None, env="REPLICATE_API_TOKEN")
- anyscale_api_key: str = Field(None, env="ANYSCALE_API_KEY")
- together_api_key: str = Field(None, env="TOGETHER_API_KEY")
- mistral_api_key: str = Field(None, env="MISTRAL_API_KEY")
-
- azure_api_key: str = Field(None, env="AZURE_API_KEY")
- azure_api_base: str = Field(None, env="AZURE_API_BASE")
- azure_api_version: str = Field(None, env="AZURE_API_VERSION")
+ openai_api_key: t.Optional[str] = Field(None, env="OPENAI_API_KEY")
+ cohere_api_key: t.Optional[str] = Field(None, env="COHERE_API_KEY")
+ huggingface_api_key: t.Optional[str] = Field(
+ None, env="HUGGINGFACE_API_KEY"
+ )
+ anthropic_api_key: t.Optional[str] = Field(
+ None, env="ANTHROPIC_API_KEY"
+ )
+ replicate_api_token: t.Optional[str] = Field(
+ None, env="REPLICATE_API_TOKEN"
+ )
+ anyscale_api_key: t.Optional[str] = Field(None, env="ANYSCALE_API_KEY")
+ together_api_key: t.Optional[str] = Field(None, env="TOGETHER_API_KEY")
+ mistral_api_key: t.Optional[str] = Field(None, env="MISTRAL_API_KEY")
+
+ azure_api_key: t.Optional[str] = Field(None, env="AZURE_API_KEY")
+ azure_api_base: t.Optional[str] = Field(None, env="AZURE_API_BASE")
+ azure_api_version: t.Optional[str] = Field(
+ None, env="AZURE_API_VERSION"
+ )
rpm_limit: int = 100
tpm_limit: int = 90_000
embedding_compute_method: t.Literal["local", "replicate", "api"] = "local"
# uptrain managed service related
- uptrain_access_token: str = Field(None, env="UPTRAIN_ACCESS_TOKEN")
+ uptrain_access_token: t.Optional[str] = Field(
+ None, env="UPTRAIN_ACCESS_TOKEN"
+ )
uptrain_server_url: str = Field(
"https://demo.uptrain.ai/", env="UPTRAIN_SERVER_URL"
)
# Embedding model related, applicable if embedding_compute_method is api.
- embedding_model_url: str = Field(None, env="EMBEDDING_MODEL_URL")
- embedding_model_api_token: str = Field(None, env="EMBEDDING_MODEL_API_TOKEN")
+ embedding_model_url: t.Optional[str] = Field(
+ None, env="EMBEDDING_MODEL_URL"
+ )
+ embedding_model_api_token: t.Optional[str] = Field(
+ None, env="EMBEDDING_MODEL_API_TOKEN"
+ )
# LLM model to run the evaluations
model: str = "gpt-3.5-turbo-1106"
@@ -64,10 +79,8 @@ class Settings(BaseSettings):
# Cot -> We will use chain of thought prompting to evaluate and get the grade
# basic -> We will simply prompt the LLM to return the grade without any reasoning
eval_type: t.Literal["basic", "cot"] = "cot"
-
- # allow additional fields as needed by different operators
- class Config:
- extra = "allow"
+    # allow additional fields as needed by different operators
+    model_config = SettingsConfigDict(extra="allow")
+    model_config["protected_namespaces"] = ()
def __init__(self, **data):
super().__init__(**data)
diff --git a/uptrain/framework/builtins.py b/uptrain/framework/builtins.py
index 50a386bf9..1bea8adaf 100644
--- a/uptrain/framework/builtins.py
+++ b/uptrain/framework/builtins.py
@@ -30,6 +30,7 @@
# Response Quality
# -----------------------------------------------------------
+
def CheckResponseCompleteness():
return Check(
name="response_completeness_score",
@@ -82,6 +83,7 @@ def CheckResponseMatching(method="llm"):
# Context Quality
# -----------------------------------------------------------
+
def CheckContextRelevance():
return Check(
name="score_context_relevance",
@@ -126,6 +128,7 @@ def CheckContextConciseness():
# Language Proficiency
# -----------------------------------------------------------
+
def CheckLanguageQuality():
return Check(
name="language_critique_score",
@@ -148,6 +151,7 @@ def CheckToneQuality(llm_persona):
# Code generation
# -----------------------------------------------------------
+
def CheckCodeHallucination():
return Check(
name="code_hallucination_score",
@@ -160,10 +164,13 @@ def CheckCodeHallucination():
# Conversation Quality
# -----------------------------------------------------------
+
def CheckConversationSatisfaction(user_role="user", llm_role="assistant"):
return Check(
name="conversation_satisfaction_score",
- operators=[ConversationSatisfactionScore(user_role=user_role, llm_role=llm_role)],
+ operators=[
+ ConversationSatisfactionScore(user_role=user_role, llm_role=llm_role)
+ ],
plots=[Histogram(x="score_conversation_satisfaction")],
)
@@ -172,6 +179,7 @@ def CheckConversationSatisfaction(user_role="user", llm_role="assistant"):
# Custom Evaluations
# -----------------------------------------------------------
+
def CheckGuidelineAdherence(
guideline, guideline_name="guideline", response_schema=None
):
@@ -192,6 +200,7 @@ def CheckGuidelineAdherence(
# Security
# -----------------------------------------------------------
+
def CheckPromptInjection():
return Check(
name="prompt_injection_score",
@@ -212,6 +221,7 @@ def CheckJailbreakDetection():
# Subquery
# -----------------------------------------------------------
+
def CheckSubQueryCompleteness():
return Check(
name="sub_query_completeness_score",
diff --git a/uptrain/framework/checks.py b/uptrain/framework/checks.py
index 8c23176dc..292d5dfd6 100644
--- a/uptrain/framework/checks.py
+++ b/uptrain/framework/checks.py
@@ -1,5 +1,6 @@
"""Implements `Check` objects used for LLM evaluation purposes.
"""
+
from __future__ import annotations
from dataclasses import dataclass
import os
@@ -9,7 +10,12 @@
import polars as pl
from pydantic import BaseModel
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ Operator,
+ TransformOp,
+ ColumnOp,
+ deserialize_operator,
+)
from uptrain.utilities import jsonload, jsondump, to_py_types, clear_directory
from uptrain.framework.base import OperatorDAG, Settings
diff --git a/uptrain/framework/evalllm.py b/uptrain/framework/evalllm.py
index d289dafd2..1565cd09a 100644
--- a/uptrain/framework/evalllm.py
+++ b/uptrain/framework/evalllm.py
@@ -46,6 +46,11 @@
SubQueryCompleteness,
)
+from uptrain.framework.rca_templates import RcaTemplate
+from uptrain.operators import RagWithCitation
+
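+# Maps each supported RCA template to the operator that runs it locally.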
+RCA_TEMPLATE_TO_OPERATOR_MAPPING = {RcaTemplate.RAG_WITH_CITATION: RagWithCitation()}
+
EVAL_TO_OPERATOR_MAPPING = {
Evals.FACTUAL_ACCURACY: ResponseFactualScore(),
Evals.CONTEXT_RELEVANCE: ContextRelevance(),
@@ -82,6 +87,78 @@ def __init__(self, settings: Settings = None, openai_api_key: str = None) -> Non
self.settings = settings
self.executor = APIClientWithoutAuth(self.settings)
+ ####
+ def perform_root_cause_analysis(
+ self,
+ data: t.Union[list[dict], pl.DataFrame, pd.DataFrame],
+ rca_template: RcaTemplate,
+ scenario_description: t.Union[str, list[str], None] = None,
+ schema: t.Union[DataSchema, dict[str, str], None] = None,
+ metadata: t.Optional[dict[str, t.Any]] = None,
+ ):
+ """Perform root cause analysis for the open source user.
+        NOTE: This API doesn't log any data.
+
+ Args:
+            data: Data to evaluate on. Either a list of dicts, a Polars DataFrame or a Pandas DataFrame.
+            rca_template: RCA template to run.
+            scenario_description: Optional description of the evaluation scenario, either a single string or a list of strings (one per row).
+ schema: Schema of the data. Only required if the data attributes aren't typical (question, response, context).
+ metadata: Attributes to attach to this dataset. Useful for filtering and grouping in the UI.
+
+ Returns:
+ results: List of dictionaries with each data point and corresponding evaluation results.
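+
+        Example (illustrative sketch; `data` and `OPENAI_API_KEY` are placeholders for
+        your dataset and OpenAI key, with each row carrying the required fields):
+            eval_llm = EvalLLM(openai_api_key=OPENAI_API_KEY)
+            res = eval_llm.perform_root_cause_analysis(
+                data=data,
+                rca_template=RcaTemplate.RAG_WITH_CITATION,
+            )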
+ """
+
+ if isinstance(data, pl.DataFrame):
+ data = data.to_dicts()
+ elif isinstance(data, pd.DataFrame):
+ data = data.to_dict(orient="records")
+
+ if schema is None:
+ schema = DataSchema()
+ elif isinstance(schema, dict):
+ schema = DataSchema(**schema)
+
+ if metadata is None:
+ metadata = {}
+
+ req_attrs, ser_template = set(), {}
+ if rca_template == RcaTemplate.RAG_WITH_CITATION:
+ req_attrs.update(
+ [schema.question, schema.response, schema.context, schema.cited_context]
+ )
+ else:
+ raise Exception("RCA Template not supported yet")
+
+ dictn = {"scenario_description": scenario_description}
+ ser_template.update({"rca_template_name": rca_template.value, **dictn})
+
+ for idx, row in enumerate(data):
+ if not req_attrs.issubset(row.keys()):
+ raise ValueError(
+                    f"Row {idx} is missing required attributes for evaluation: {req_attrs}"
+ )
+
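+        # Run the RCA template locally when local evaluation is enabled and an
+        # operator mapping exists; otherwise defer to the UpTrain server.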
+ if self.settings.evaluate_locally:
+ results = copy.deepcopy(data)
+ if rca_template in RCA_TEMPLATE_TO_OPERATOR_MAPPING:
+ op = RCA_TEMPLATE_TO_OPERATOR_MAPPING[rca_template]
+ op.scenario_description = (
+ scenario_description
+ if not isinstance(scenario_description, list)
+ else scenario_description[idx]
+ )
+ res = (
+ op.setup(self.settings).run(pl.DataFrame(data))["output"].to_dicts()
+ )
+ else:
+ res = self.evaluate_on_server(data, [ser_template], schema)
+ for idx, row in enumerate(res):
+ results[idx].update(row)
+ else:
+ results = self.evaluate_on_server(data, [ser_template], schema)
+ return results
+
def evaluate(
self,
data: t.Union[list[dict], pl.DataFrame, pd.DataFrame],
@@ -187,7 +264,11 @@ def evaluate(
if self.settings.evaluate_locally:
results = copy.deepcopy(data)
for idx, check in enumerate(checks):
- if isinstance(check, ParametricEval) and ser_checks[idx]["check_name"] in PARAMETRIC_EVAL_TO_OPERATOR_MAPPING:
+ if (
+ isinstance(check, ParametricEval)
+ and ser_checks[idx]["check_name"]
+ in PARAMETRIC_EVAL_TO_OPERATOR_MAPPING
+ ):
# Use the check_name field to get the operator and remove it from ser_checks
op = PARAMETRIC_EVAL_TO_OPERATOR_MAPPING[
ser_checks[idx].pop("check_name")
@@ -222,34 +303,31 @@ def evaluate(
headers={"uptrain-access-token": "default_key"},
timeout=httpx.Timeout(7200, connect=5),
)
- response = client.post(
- url,
- json={"name": "default_key"}
- )
+ response = client.post(url, json={"name": "default_key"})
- user_id = response.json()['id']
+ user_id = response.json()["id"]
checks = []
for res in results:
row_check = {}
for key in res:
- if key.startswith('score') or key.startswith('explanation'):
+ if key.startswith("score") or key.startswith("explanation"):
row_check.update({key: res[key]})
checks.append(row_check)
-
+
url = "http://localhost:4300/api/public/add_project_data"
response = client.post(
- url,
- json={
- "data": results,
- "checks": checks,
- "metadata": metadata,
- "schema_dict": schema.dict(),
- "project": project_name,
- },
- )
- except:
+ url,
+ json={
+ "data": results,
+ "checks": checks,
+ "metadata": metadata,
+ "schema_dict": schema.dict(),
+ "project": project_name,
+ },
+ )
+ except Exception:
user_id = "default_key"
- logger.info('Server is not running!')
+ logger.info("Server is not running!")
return results
def evaluate_on_server(self, data, ser_checks, schema):
@@ -331,13 +409,12 @@ def evaluate_experiments(
exp_results = exp_results.to_dicts()
return exp_results
-
def evaluate_prompts(
self,
project_name: str,
data: t.Union[list[dict], pl.DataFrame],
checks: list[t.Union[str, Evals, ParametricEval]],
- prompt: str,
+ prompt: str,
schema: t.Union[DataSchema, dict[str, str], None] = None,
metadata: t.Optional[dict[str, t.Any]] = None,
):
@@ -357,10 +434,10 @@ def evaluate_prompts(
"""
if metadata is None:
metadata = {}
-
+
base_prompt, prompt_vars = parse_prompt(prompt)
- prompts =[]
+ prompts = []
context_vars = {}
context_vars.update(zip(prompt_vars, prompt_vars))
for idx, item in enumerate(data):
@@ -370,18 +447,24 @@ def evaluate_prompts(
model = metadata["model"]
dataset = pl.DataFrame(data)
- dataset = dataset.with_columns(pl.Series(name="model", values=[model] * len(dataset)))
- dataset = dataset.with_columns(pl.Series(name="prompt", values= prompts))
-
+ dataset = dataset.with_columns(
+ pl.Series(name="model", values=[model] * len(dataset))
+ )
+ dataset = dataset.with_columns(pl.Series(name="prompt", values=prompts))
+
from uptrain.operators import TextCompletion
-
- dataset = TextCompletion(
- col_in_prompt = "prompt",
- col_in_model = "model",
- col_out_completion = "response",
- temperature = 0.0
- ).setup(self.settings).run(dataset)['output']
-
+
+ dataset = (
+ TextCompletion(
+ col_in_prompt="prompt",
+ col_in_model="model",
+ col_out_completion="response",
+ temperature=0.0,
+ )
+ .setup(self.settings)
+ .run(dataset)["output"]
+ )
+
dataset = dataset.to_dicts()
if schema is None:
diff --git a/uptrain/framework/evals.py b/uptrain/framework/evals.py
index f784a93a7..c5de0404e 100644
--- a/uptrain/framework/evals.py
+++ b/uptrain/framework/evals.py
@@ -1,7 +1,8 @@
import enum
-import pydantic
import typing as t
+from pydantic import BaseModel, ConfigDict
+
class Evals(enum.Enum):
CONTEXT_RELEVANCE = "context_relevance"
@@ -22,8 +23,8 @@ class Evals(enum.Enum):
CONTEXT_CONCISENESS = "context_conciseness"
-class ParametricEval(pydantic.BaseModel):
- ...
+class ParametricEval(BaseModel):
+ model_config = ConfigDict(protected_namespaces=())
class CritiqueTone(ParametricEval):
@@ -32,10 +33,12 @@ class CritiqueTone(ParametricEval):
class GuidelineAdherence(ParametricEval):
guideline: str
- guideline_name: str = "guideline" # User-assigned name of the guideline to distinguish between multiple checks
- response_schema: t.Union[
- str, None
- ] = None # Schema of the response in case it is of type JSON, XML, etc.
+ guideline_name: str = (
+ "guideline" # User-assigned name of the guideline to distinguish between multiple checks
+ )
+ response_schema: t.Union[str, None] = (
+ None # Schema of the response in case it is of type JSON, XML, etc.
+ )
class ConversationSatisfaction(ParametricEval):
@@ -52,9 +55,9 @@ class CustomPromptEval(ParametricEval):
list[float], list[int]
] # Scores associated for each choice. ex: [1.0, 0.0]
eval_type: t.Literal["classify", "cot_classify"] = "cot_classify"
- prompt_var_to_column_mapping: t.Union[
- dict[str, str], None
- ] = None # Specify matching between variables in the evaluation prompt and keys in your data
+ prompt_var_to_column_mapping: t.Union[dict[str, str], None] = (
+ None # Specify matching between variables in the evaluation prompt and keys in your data
+ )
class ResponseMatching(ParametricEval):
@@ -62,4 +65,6 @@ class ResponseMatching(ParametricEval):
class JailbreakDetection(ParametricEval):
- model_purpose: str = "To help the users with their queries without providing them with any illegal, immoral or abusive content."
+ model_purpose: str = (
+ "To help the users with their queries without providing them with any illegal, immoral or abusive content."
+ )
diff --git a/uptrain/framework/remote.py b/uptrain/framework/remote.py
index 8642cb92b..0effd90ab 100644
--- a/uptrain/framework/remote.py
+++ b/uptrain/framework/remote.py
@@ -6,10 +6,10 @@
import typing as t
from loguru import logger
+from pydantic import BaseModel
import httpx
import polars as pl
import pandas as pd
-import pydantic
from uptrain.framework.checks import CheckSet, ExperimentArgs
from uptrain.framework.base import Settings
@@ -26,7 +26,7 @@
from uptrain.utilities import polars_to_pandas
-class DataSchema(pydantic.BaseModel):
+class DataSchema(BaseModel):
id_: str = "id"
question: str = "question"
response: str = "response"
diff --git a/uptrain/integrations/llama_index.py b/uptrain/integrations/llama_index.py
index 4c6bd9b4d..4a826854e 100644
--- a/uptrain/integrations/llama_index.py
+++ b/uptrain/integrations/llama_index.py
@@ -11,7 +11,7 @@
from uptrain.framework.remote import DataSchema
-from llama_index.indices.query.base import BaseQueryEngine
+from llama_index.core.indices.query.base import BaseQueryEngine
__all__ = ["EvalLlamaIndex"]
@@ -34,6 +34,8 @@ def __init__(self, settings: Settings, query_engine: BaseQueryEngine) -> None:
self.client = APIClient(settings)
elif settings.check_and_get("openai_api_key"):
self.client = EvalLLM(settings)
+ elif settings.check_and_get("together_api_key"):
+ self.client = EvalLLM(settings)
def evaluate(
self,
@@ -44,7 +46,7 @@ def evaluate(
metadata: t.Optional[dict[str, str]] = None,
):
try:
- from llama_index.async_utils import run_async_tasks
+ from llama_index.core.async_utils import run_async_tasks
except ImportError:
raise ImportError(
"llama_index must be installed to use this function. "
diff --git a/uptrain/operators/__init__.pyi b/uptrain/operators/__init__.pyi
index df1d975cb..72a8a3133 100644
--- a/uptrain/operators/__init__.pyi
+++ b/uptrain/operators/__init__.pyi
@@ -59,7 +59,7 @@ __all__ = [
"ResponseCompletenessWrtContext",
"ResponseConsistency",
"ResponseConciseness",
- "ValidQuestion",
+ "ValidQuestionScore",
"LanguageCritique",
"ToneCritique",
"GuidelineAdherenceScore",
@@ -161,9 +161,7 @@ from .language.response_quality import (
ResponseRelevance,
ResponseMatchingScore,
)
-from .language.question_quality import (
- ValidQuestion
-)
+from .language.question_quality import ValidQuestionScore
from .language.language_quality import LanguageCritique, ResponseCoherence
from .language.tone import ToneCritique
from .language.guideline import GuidelineAdherenceScore
diff --git a/uptrain/operators/base.py b/uptrain/operators/base.py
index aaea2c51e..5d1b87732 100644
--- a/uptrain/operators/base.py
+++ b/uptrain/operators/base.py
@@ -9,7 +9,7 @@
import typing_extensions as te
from loguru import logger
-from pydantic import BaseModel
+from pydantic import ConfigDict, BaseModel
import polars as pl
if t.TYPE_CHECKING:
@@ -98,10 +98,9 @@ class OpBaseModel(BaseModel):
model, to get around some of the sharp edges.
"""
- class Config:
- extra = "allow"
- smart_union = True
- underscore_attrs_are_private = True
+ # TODO[pydantic]: The following keys were removed: `smart_union`, `underscore_attrs_are_private`.
+ # Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-config for more information.
+ model_config = ConfigDict(extra="allow", protected_namespaces=())
class ColumnOp(OpBaseModel):
diff --git a/uptrain/operators/chart.py b/uptrain/operators/chart.py
index a15047826..67dfd2194 100644
--- a/uptrain/operators/chart.py
+++ b/uptrain/operators/chart.py
@@ -10,13 +10,16 @@
from __future__ import annotations
import typing as t
-from loguru import logger
from pydantic import Field
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ OpBaseModel,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep, polars_to_pandas
px = lazy_load_dep("plotly.express", "plotly>=5.0.0")
@@ -175,7 +178,7 @@ class BarChart(Chart):
barmode: str = "group"
- kind = "bar"
+ kind: str = "bar"
@register_op
@@ -224,7 +227,7 @@ class LineChart(Chart):
description: str = ""
color: str = ""
- kind = "line"
+ kind: str = "line"
@register_op
@@ -274,7 +277,7 @@ class ScatterPlot(Chart):
color: str = ""
symbol: str = "circle"
- kind = "scatter"
+ kind: str = "scatter"
@register_op
@@ -327,7 +330,7 @@ class Scatter3DPlot(Chart):
color: str = ""
symbol: str = ""
- kind = "scatter_3d"
+ kind: str = "scatter_3d"
def setup(self, settings: Settings = None):
super(Scatter3DPlot, self).setup()
@@ -391,7 +394,7 @@ class Histogram(Chart):
color: str = ""
nbins: int = 20
- kind = "histogram"
+ kind: str = "histogram"
@register_op
@@ -461,10 +464,10 @@ class MultiPlot(Chart):
description: str = ""
charts: list
- kind = "multiplot"
+ kind: str = "multiplot"
def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
- if type(self.charts[0]) == dict:
+ if isinstance(self.charts[0], dict):
self.charts = [Chart(**chart).setup() for chart in self.charts]
fig = ps.make_subplots(
@@ -483,9 +486,6 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
annotation_multi_height = (
-0.3
) # Adjust this value for multiline annotation position
- annotation_line_height = (
- -0.05
- ) # Adjust this value for multiline annotation spacing
for idx, chart in enumerate(self.charts):
plot = getattr(px, chart.kind)(polars_to_pandas(data), **chart.props)
diff --git a/uptrain/operators/clustering.py b/uptrain/operators/clustering.py
index 7d4452740..0c40a01aa 100644
--- a/uptrain/operators/clustering.py
+++ b/uptrain/operators/clustering.py
@@ -14,11 +14,14 @@
from loguru import logger
import numpy as np
import polars as pl
-from pydantic import root_validator
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
nltk = lazy_load_dep("nltk", "nltk")
diff --git a/uptrain/operators/code/detection.py b/uptrain/operators/code/detection.py
index 0310b8bb2..8add2827c 100644
--- a/uptrain/operators/code/detection.py
+++ b/uptrain/operators/code/detection.py
@@ -9,19 +9,27 @@
from loguru import logger
import polars as pl
from uptrain.operators.language.llm import LLMMulticlient
-from uptrain.operators.language.prompts.classic import CODE_HALLUCINATION_PROMPT_TEMPLATE
-from uptrain.operators.language.prompts.few_shots import CODE_HALLUCINATION_FEW_SHOT__CLASSIFY, CODE_HALLUCINATION_FEW_SHOT__COT
+from uptrain.operators.language.prompts.classic import (
+ CODE_HALLUCINATION_PROMPT_TEMPLATE,
+)
+from uptrain.operators.language.prompts.few_shots import (
+ CODE_HALLUCINATION_FEW_SHOT__CLASSIFY,
+ CODE_HALLUCINATION_FEW_SHOT__COT,
+)
from uptrain.operators.language.prompts.instructions import CHAIN_OF_THOUGHT, CLASSIFY
-from uptrain.operators.language.prompts.output_format import CODE_HALLUCINATION_OUTPUT_FORMAT__CLASSIFY, CODE_HALLUCINATION_OUTPUT_FORMAT__COT
+from uptrain.operators.language.prompts.output_format import (
+ CODE_HALLUCINATION_OUTPUT_FORMAT__CLASSIFY,
+ CODE_HALLUCINATION_OUTPUT_FORMAT__COT,
+)
from uptrain.utilities.prompt_utils import parse_scenario_description
-
-from uptrain.framework.base import (
+from uptrain.operators.base import (
ColumnOp,
register_op,
TYPE_TABLE_OUTPUT,
- Settings,
)
+from uptrain.framework.base import Settings
+
from uptrain.utilities import polars_to_json_serializable_dict
@@ -56,7 +64,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -70,7 +81,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["context"] = row.pop(self.col_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate("code_hallucination", data_send)
diff --git a/uptrain/operators/code/sql.py b/uptrain/operators/code/sql.py
index dd47263be..57c596a9d 100644
--- a/uptrain/operators/code/sql.py
+++ b/uptrain/operators/code/sql.py
@@ -6,7 +6,6 @@
import itertools
import json
-import os
import typing as t
from pydantic import BaseModel
@@ -21,8 +20,12 @@
)
if t.TYPE_CHECKING:
- from uptrain.framework.base import *
-from uptrain.operators.base import *
+ from uptrain.framework.base import Settings
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
sqlglot = lazy_load_dep("sqlglot", "sqlglot")
diff --git a/uptrain/operators/drift.py b/uptrain/operators/drift.py
index 25e9dbac9..d79e9a73d 100644
--- a/uptrain/operators/drift.py
+++ b/uptrain/operators/drift.py
@@ -20,7 +20,12 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ OpBaseModel,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
drift = lazy_load_dep("river.drift", "river")
diff --git a/uptrain/operators/embedding/embedding.py b/uptrain/operators/embedding/embedding.py
index 3deb3326f..7aaa9d742 100644
--- a/uptrain/operators/embedding/embedding.py
+++ b/uptrain/operators/embedding/embedding.py
@@ -77,7 +77,9 @@ class Embedding(ColumnOp):
"""
- model: str = "" # t.Literal["MiniLM-L6-v2", "instructor-xl", "mpnet-base-v2", "bge-large-zh-v1.5", "instructor-large"]
+ model: str = (
+ "" # t.Literal["MiniLM-L6-v2", "instructor-xl", "mpnet-base-v2", "bge-large-zh-v1.5", "instructor-large"]
+ )
col_in_text: str = "text"
col_out: str = "embedding"
batch_size: int = 128
@@ -187,7 +189,7 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
).json()["data"]
]
emb_length = len(run_res[0])
- except:
+ except Exception:
run_res = []
for elem_idx in range(idx * BATCH_SIZE, (idx + 1) * BATCH_SIZE):
if elem_idx < len(inputs):
diff --git a/uptrain/operators/embs.py b/uptrain/operators/embs.py
index 52c9fc9e8..4d7f15406 100644
--- a/uptrain/operators/embs.py
+++ b/uptrain/operators/embs.py
@@ -15,11 +15,17 @@
from loguru import logger
import numpy as np
import polars as pl
-from pydantic import root_validator
+from pydantic import model_validator
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ TransformOp,
+ TYPE_TABLE_OUTPUT,
+ get_output_col_name_at,
+ register_op,
+)
from uptrain.utilities import lazy_load_dep
umap = lazy_load_dep("umap", "umap-learn")
@@ -92,7 +98,8 @@ class Distribution(TransformOp):
col_in_groupby: list[str]
col_out: list[str] | None = None
- @root_validator(pre=True)
+ @model_validator(mode="before")
+ @classmethod
def _check_cols(cls, values):
"""
Validator to check the validity of input and output column lists.
diff --git a/uptrain/operators/io/base.py b/uptrain/operators/io/base.py
index ce797496a..f7225c184 100644
--- a/uptrain/operators/io/base.py
+++ b/uptrain/operators/io/base.py
@@ -8,7 +8,12 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+ OpBaseModel,
+)
from uptrain.utilities import lazy_load_dep
# -----------------------------------------------------------
@@ -153,8 +158,6 @@ class DeltaWriter(OpBaseModel):
columns: t.Optional[list[str]] = None
def setup(self, settings: Settings):
- dl = lazy_load_dep("deltatable", "deltalake>=0.9")
-
return self
def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
diff --git a/uptrain/operators/io/bq.py b/uptrain/operators/io/bq.py
index a5b29bfea..dd42c7711 100644
--- a/uptrain/operators/io/bq.py
+++ b/uptrain/operators/io/bq.py
@@ -8,7 +8,11 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
diff --git a/uptrain/operators/io/duck.py b/uptrain/operators/io/duck.py
index f88c1e8e9..741c4052e 100644
--- a/uptrain/operators/io/duck.py
+++ b/uptrain/operators/io/duck.py
@@ -7,7 +7,11 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
diff --git a/uptrain/operators/io/excel.py b/uptrain/operators/io/excel.py
index 6473a70e0..19ec1caa0 100644
--- a/uptrain/operators/io/excel.py
+++ b/uptrain/operators/io/excel.py
@@ -4,11 +4,14 @@
import typing as t
import polars as pl
-import deltalake as dl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
xlsx2csv = lazy_load_dep("xlsx2csv", "xlsx2csv")
diff --git a/uptrain/operators/io/mongodb.py b/uptrain/operators/io/mongodb.py
index 77143b7e7..45c0cdef5 100644
--- a/uptrain/operators/io/mongodb.py
+++ b/uptrain/operators/io/mongodb.py
@@ -2,13 +2,16 @@
from __future__ import annotations
import typing as t
-import io
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
diff --git a/uptrain/operators/language/bleu.py b/uptrain/operators/language/bleu.py
index b5c6fcec1..7d0faaa73 100644
--- a/uptrain/operators/language/bleu.py
+++ b/uptrain/operators/language/bleu.py
@@ -8,13 +8,16 @@
from __future__ import annotations
import typing as t
-from loguru import logger
import polars as pl
from uptrain.framework import Settings
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
# blue_score = lazy_load_dep("nltk.translate.bleu_score", "nltk")
diff --git a/uptrain/operators/language/context_quality.py b/uptrain/operators/language/context_quality.py
index ae372ccc4..197955ec9 100644
--- a/uptrain/operators/language/context_quality.py
+++ b/uptrain/operators/language/context_quality.py
@@ -78,7 +78,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -91,7 +94,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["context"] = row.pop(self.col_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -224,7 +230,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -238,7 +247,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["context"] = row.pop(self.col_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -338,9 +350,9 @@ def evaluate_local(self, data):
json.loads(res.response.choices[0].message.content)["Choice"]
]
output["score_response_completeness_wrt_context"] = float(score)
- output[
- "explanation_response_completeness_wrt_context"
- ] = res.response.choices[0].message.content
+ output["explanation_response_completeness_wrt_context"] = (
+ res.response.choices[0].message.content
+ )
except Exception:
logger.error(
f"Error when processing payload at index {idx}: {res.error}"
@@ -382,7 +394,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -396,7 +411,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["reranked_context"] = row.pop(self.col_reranked_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -530,7 +548,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -544,7 +565,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["concise_context"] = row.pop(self.col_concise_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/conversation.py b/uptrain/operators/language/conversation.py
index eb1eb1037..ef0f29d89 100644
--- a/uptrain/operators/language/conversation.py
+++ b/uptrain/operators/language/conversation.py
@@ -67,7 +67,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -79,7 +82,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["conversation"] = row[self.col_conversation]
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/factual_accuracy.py b/uptrain/operators/language/factual_accuracy.py
index 17c917470..0412e4047 100644
--- a/uptrain/operators/language/factual_accuracy.py
+++ b/uptrain/operators/language/factual_accuracy.py
@@ -70,7 +70,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -84,7 +87,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["context"] = row.pop(self.col_context)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/generation.py b/uptrain/operators/language/generation.py
index 1bb17e40e..63dda82e9 100644
--- a/uptrain/operators/language/generation.py
+++ b/uptrain/operators/language/generation.py
@@ -15,7 +15,12 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+ ColumnOp,
+)
from uptrain.operators.language.llm import LLMMulticlient, Payload
@@ -82,7 +87,7 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
# TODO: Temp Fix for handling json in prompts. Permanent fix is to integrate langchain?
try:
prompt = row["template"].format(**fill)
- except:
+ except Exception:
prompt = row["template"]
for k, v in fill.items():
prompt = prompt.replace("{{" + k + "}}", v)
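
For context on the fallback above: `str.format` raises as soon as a prompt template contains literal braces (for example embedded JSON), so the operator falls back to hand-substituting mustache-style `{{key}}` placeholders. A minimal sketch of that behaviour, with a made-up template and fill dict:

```python
# Illustrative only: the template and fill values below are made up.
template = 'Return JSON like {"answer": ...}. Question: {{question}}'
fill = {"question": "What is the capital of France?"}

try:
    # str.format chokes on the literal JSON braces (KeyError on '"answer"').
    prompt = template.format(**fill)
except Exception:
    # Fallback mirrors the operator: substitute {{key}} placeholders by hand.
    prompt = template
    for k, v in fill.items():
        prompt = prompt.replace("{{" + k + "}}", v)

print(prompt)
# Return JSON like {"answer": ...}. Question: What is the capital of France?
```
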
diff --git a/uptrain/operators/language/grammar.py b/uptrain/operators/language/grammar.py
index e85305bed..48208ac73 100644
--- a/uptrain/operators/language/grammar.py
+++ b/uptrain/operators/language/grammar.py
@@ -15,7 +15,11 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.operators.language.llm import LLMMulticlient, Payload
__all__ = ["GrammarScore"]
diff --git a/uptrain/operators/language/guideline.py b/uptrain/operators/language/guideline.py
index ae171ef97..b89af7cea 100644
--- a/uptrain/operators/language/guideline.py
+++ b/uptrain/operators/language/guideline.py
@@ -68,7 +68,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -81,7 +84,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/jailbreak.py b/uptrain/operators/language/jailbreak.py
index f95ee1f74..3cd208176 100644
--- a/uptrain/operators/language/jailbreak.py
+++ b/uptrain/operators/language/jailbreak.py
@@ -57,7 +57,9 @@ class JailbreakDetectionScore(ColumnOp):
col_question: str = "question"
col_out: str = "score_jailbreak_attempted"
- model_purpose: str = "To help the users with their queries without providing them with any illegal, immoral or abusive content."
+ model_purpose: str = (
+ "To help the users with their queries without providing them with any illegal, immoral or abusive content."
+ )
scenario_description: t.Optional[str] = None
score_mapping: dict = {"A": 1.0, "B": 0.0}
@@ -66,7 +68,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -78,7 +83,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["question"] = row.pop(self.col_question)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -214,7 +222,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -226,7 +237,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["question"] = row.pop(self.col_question)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/language_quality.py b/uptrain/operators/language/language_quality.py
index 1fbba2358..25f307fa3 100644
--- a/uptrain/operators/language/language_quality.py
+++ b/uptrain/operators/language/language_quality.py
@@ -72,7 +72,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -84,7 +87,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate("critique_language", data_send)
@@ -212,7 +218,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -224,7 +233,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate("critique_language", data_send)
diff --git a/uptrain/operators/language/llm.py b/uptrain/operators/language/llm.py
index 9c5f9e48b..4223cbbd7 100644
--- a/uptrain/operators/language/llm.py
+++ b/uptrain/operators/language/llm.py
@@ -8,13 +8,11 @@
import random
import typing as t
-from contextlib import suppress
from loguru import logger
from pydantic import BaseModel, Field
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
from uptrain.utilities import lazy_load_dep
openai = lazy_load_dep("openai", "openai")
@@ -25,6 +23,7 @@
from openai import AsyncOpenAI
from openai import AsyncAzureOpenAI
import openai
+from aiolimiter import AsyncLimiter
# import openai.error
@@ -55,11 +54,11 @@ def run_validation(llm_output, validation_func):
async def async_process_payload(
payload: Payload,
- rpm_limiter: aiolimiter.AsyncLimiter,
- tpm_limiter: aiolimiter.AsyncLimiter,
- aclient: t.Union[AsyncOpenAI, AsyncAzureOpenAI, None],
+ rpm_limiter: AsyncLimiter,
+ tpm_limiter: AsyncLimiter,
+ aclient: t.Any,
max_retries: int,
- validate_func: function = None,
+ validate_func: t.Callable = None,
) -> Payload:
messages = payload.data["messages"]
total_chars = sum(len(msg["role"]) + len(msg["content"]) for msg in messages)
@@ -90,9 +89,10 @@ async def async_process_payload(
break
except Exception as exc:
logger.error(f"Error when sending request to LLM API: {exc}")
- sleep_and_retry = (count < max_retries - 1)
+ sleep_and_retry = count < max_retries - 1
if aclient is not None:
- if not ( isinstance(
+ if not (
+ isinstance(
exc,
(
openai.APIConnectionError,
@@ -106,11 +106,10 @@ async def async_process_payload(
sleep_and_retry = False
else:
litellm = lazy_load_dep("litellm", "litellm")
- if not ( isinstance(
+ if not (
+ isinstance(
exc,
- (
- litellm.RateLimitError,
- ),
+ (litellm.RateLimitError,),
)
):
sleep_and_retry = False
@@ -159,12 +158,12 @@ async def async_process_payload(
class LLMMulticlient:
"""Uses asyncio to send requests to LLM APIs concurrently."""
- def __init__(self, settings: t.Optional[Settings] = None):
+ def __init__(self, settings: t.Optional[Settings] = None, aclient: t.Any = None):
self._max_tries = 4
# TODO: consult for accurate limits - https://platform.openai.com/account/rate-limits
self._rpm_limit = 200
self._tpm_limit = 90_000
- self.aclient = None
+ self.aclient = aclient
self.settings = settings
if settings is not None:
if (
@@ -172,7 +171,8 @@ def __init__(self, settings: t.Optional[Settings] = None):
and settings.check_and_get("openai_api_key") is not None
):
openai.api_key = settings.check_and_get("openai_api_key") # type: ignore
- self.aclient = AsyncOpenAI()
+ if self.aclient is None:
+ self.aclient = AsyncOpenAI()
if (
settings.model.startswith("azure")
@@ -231,7 +231,7 @@ def make_payload(
)
def fetch_responses(
- self, input_payloads: list[Payload], validate_func: function = None
+ self, input_payloads: list[Payload], validate_func: t.Callable = None
) -> list[Payload]:
try:
return asyncio.run(
@@ -251,17 +251,16 @@ def fetch_responses(
input_payloads, validate_func=validate_func
),
).result()
- except:
+            except Exception as e:
logger.error(f"Caught an exception: {e}")
-
async def async_fetch_responses(
self,
input_payloads: list[Payload],
- validate_func: function = None,
+ validate_func: t.Callable = None,
) -> list[Payload]:
- rpm_limiter = aiolimiter.AsyncLimiter(self._rpm_limit, time_period=60)
- tpm_limiter = aiolimiter.AsyncLimiter(self._tpm_limit, time_period=60)
+ rpm_limiter = AsyncLimiter(self._rpm_limit, time_period=60)
+ tpm_limiter = AsyncLimiter(self._tpm_limit, time_period=60)
async_outputs = [
async_process_payload(
data,
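
For context on the `AsyncLimiter` usage above: `aiolimiter.AsyncLimiter(max_rate, time_period)` throttles concurrent entries, and `acquire(amount)` can reserve several units at once, which is how both a request budget and a token budget can be enforced. A minimal, self-contained sketch of that gating pattern; the limits, the chars-to-tokens divisor, and the stubbed request are illustrative, not taken from this patch:

```python
import asyncio
from aiolimiter import AsyncLimiter

# Illustrative limits; the real ones live on LLMMulticlient (_rpm_limit, _tpm_limit).
rpm_limiter = AsyncLimiter(200, time_period=60)      # requests per minute
tpm_limiter = AsyncLimiter(90_000, time_period=60)   # rough token budget per minute

async def send_one(messages: list[dict]) -> str:
    total_chars = sum(len(m["role"]) + len(m["content"]) for m in messages)
    await rpm_limiter.acquire()                  # one request slot
    await tpm_limiter.acquire(total_chars // 4)  # chars/4 as a crude token estimate
    # A real call would go through the injected async client here.
    return "stubbed response"

async def main():
    payloads = [[{"role": "user", "content": f"question {i}"}] for i in range(3)]
    return await asyncio.gather(*(send_one(p) for p in payloads))

print(asyncio.run(main()))
```
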
diff --git a/uptrain/operators/language/meteor.py b/uptrain/operators/language/meteor.py
index 3642b85e9..695c36a09 100644
--- a/uptrain/operators/language/meteor.py
+++ b/uptrain/operators/language/meteor.py
@@ -11,13 +11,16 @@
from __future__ import annotations
import typing as t
-from loguru import logger
import polars as pl
from uptrain.framework import Settings
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
nltk = lazy_load_dep("nltk", "nltk")
diff --git a/uptrain/operators/language/model_grade.py b/uptrain/operators/language/model_grade.py
index 020a44455..7de7d4ca5 100644
--- a/uptrain/operators/language/model_grade.py
+++ b/uptrain/operators/language/model_grade.py
@@ -4,10 +4,9 @@
from __future__ import annotations
import typing as t
-import os
import copy
import re
-
+from uuid import uuid4
from loguru import logger
import polars as pl
@@ -15,7 +14,11 @@
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.operators.language.llm import LLMMulticlient, Payload
# from evals.elsuite.modelgraded.classify_utils import (
@@ -26,7 +29,6 @@
import logging
import string
-from typing import Any, Callable, Iterable, Optional, Union
MATCH_FNS = {
"include": lambda x, y: float(x in y),
@@ -55,9 +57,9 @@
def get_choice_score(
choice: str,
- choice_strings: Iterable[str],
- choice_scores: Optional[Union[dict[str, float], str]] = None,
-) -> Optional[float]:
+ choice_strings: t.Iterable[str],
+ choice_scores: t.Optional[t.Union[dict[str, float], str]] = None,
+) -> t.Optional[float]:
if choice_scores is None:
return None
if choice_scores == "from_strings":
@@ -68,7 +70,7 @@ def get_choice_score(
return choice_scores[choice]
-def choice_to_str(choice_strings: Iterable[str]) -> str:
+def choice_to_str(choice_strings: t.Iterable[str]) -> str:
"""Return a string of choices, e.g. '"Yes" or "No" or "Maybe"'."""
return " or ".join(f'"{choice}"' for choice in choice_strings)
@@ -77,8 +79,8 @@ def append_answer_prompt(
prompt: list,
eval_type: str,
append_type: str = "as_content",
- answer_prompt: Optional[list] = None,
- choice_strings: Optional[Iterable[str]] = None,
+ answer_prompt: t.Optional[list] = None,
+ choice_strings: t.Optional[t.Iterable[str]] = None,
) -> list:
"""Append answer prompt to prompt."""
answer_prompt = (
@@ -160,9 +162,11 @@ def format_prompt(
"""Format a prompt with only necessary kwargs."""
# if any input kwargs is chat prompt, convert to text prompt
kwargs = {
- k: chat_prompt_to_text_prompt(v, for_completion=False)
- if is_chat_prompt(v)
- else v
+ k: (
+ chat_prompt_to_text_prompt(v, for_completion=False)
+ if is_chat_prompt(v)
+ else v
+ )
for k, v in kwargs.items()
}
if is_chat_prompt(prompt):
@@ -258,11 +262,12 @@ class ModelGradeScore(ColumnOp):
context_vars: dict[str, str]
col_out: t.Union[str, list[str]] = "model_grade_score"
- def setup(self, settings: Settings):
- self._api_client = LLMMulticlient(settings=settings)
+ def setup(self, settings: Settings, aclient: t.Any = None):
+ self._api_client = LLMMulticlient(settings=settings, aclient=aclient)
+ self._aclient = aclient
self._settings = settings
self.model = settings.model.replace("azure/", "")
- if not (self.eval_type in ["cot_classify", "tot_classify", "tot_score"]):
+ if self.eval_type not in ["cot_classify", "tot_classify", "tot_score"]:
raise Exception(
"Only eval_type: cot_classify and tot_classify is supported for model grading check"
)
@@ -289,21 +294,13 @@ def setup(self, settings: Settings):
return self
def _make_payload(self, id: t.Any, messages: list[dict]) -> Payload:
- if self._settings.seed is None:
- return Payload(
- data={"model": self.model, "messages": messages, "temperature": 0.2},
- metadata={"index": id},
- )
- else:
- return Payload(
- data={
- "model": self.model,
- "messages": messages,
- "temperature": 0.2,
- "seed": self._settings.seed,
- },
- metadata={"index": id},
- )
+        payload = Payload(
+            data={"model": self.model, "messages": messages, "temperature": 0.2},
+            metadata={"index": id},
+        )
+ if self._settings.seed is not None:
+ payload.data["seed"] = self._settings.seed
+ if self._aclient is not None:
+ trace_id = str(uuid4())
+ payload.data["trace_id"] = trace_id
+ return payload
def get_choice_via_llm(self, text: str, grading_prompt_template: str) -> str:
"""Queries LLM to get score from the text"""
@@ -324,15 +321,15 @@ def get_choice_via_llm(self, text: str, grading_prompt_template: str) -> str:
score = output_payload.response.choices[0].message.content
float(score)
return score
- except:
+ except Exception:
return str(0.0)
def get_choice(
self,
text: str,
eval_type: str,
- match_fn: Union[str, Callable],
- choice_strings: Iterable[str],
+ match_fn: t.Union[str, t.Callable],
+ choice_strings: t.Iterable[str],
choice_scores: dict = {},
) -> str:
"""Clean the answer string to a choice string to one of choice_strings. Return '__invalid__.' if no match."""
@@ -416,7 +413,7 @@ def get_choice(
new_char = char + prev_char
try:
float(new_char)
- except:
+ except Exception:
break
prev_char = new_char
part_before_decimal = prev_char
@@ -427,7 +424,7 @@ def get_choice(
new_char = prev_char + char
try:
float(new_char)
- except:
+ except Exception:
break
prev_char = new_char
part_after_decimal = prev_char
@@ -439,7 +436,7 @@ def get_choice(
text, self.grading_prompt_template
)
return str(choice)
- except:
+ except Exception:
return self.get_choice_via_llm(
text, self.grading_prompt_template
)
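
The consolidated `_make_payload` above builds one base payload and then conditionally attaches `seed` (reproducible sampling) and a `trace_id` (only when a custom async client was injected). A stripped-down sketch of that shape, with a stand-in `Payload` dataclass and illustrative model/message values so it runs on its own:

```python
from dataclasses import dataclass, field
from uuid import uuid4
import typing as t

@dataclass
class Payload:  # stand-in for uptrain.operators.language.llm.Payload
    data: dict
    metadata: dict = field(default_factory=dict)

def make_payload(model: str, messages: list[dict], *, seed: t.Optional[int] = None,
                 aclient: t.Any = None, index: int = 0) -> Payload:
    payload = Payload(
        data={"model": model, "messages": messages, "temperature": 0.2},
        metadata={"index": index},
    )
    if seed is not None:     # only forwarded when deterministic sampling is requested
        payload.data["seed"] = seed
    if aclient is not None:  # tag the call so an injected client can correlate traces
        payload.data["trace_id"] = str(uuid4())
    return payload

print(make_payload("some-model", [{"role": "user", "content": "hi"}], seed=42).data)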
diff --git a/uptrain/operators/language/openai_evals.py b/uptrain/operators/language/openai_evals.py
index c858537ec..bf6ff364c 100644
--- a/uptrain/operators/language/openai_evals.py
+++ b/uptrain/operators/language/openai_evals.py
@@ -4,18 +4,21 @@
import typing as t
import uuid
import itertools
-import numpy as np
import evals
import evals.base
import evals.record
import evals.registry
-from loguru import logger
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ TransformOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import to_py_types
UPTRAIN_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
diff --git a/uptrain/operators/language/prompts/classic.py b/uptrain/operators/language/prompts/classic.py
index 151ab4bc7..1a081d7df 100644
--- a/uptrain/operators/language/prompts/classic.py
+++ b/uptrain/operators/language/prompts/classic.py
@@ -448,8 +448,6 @@
"""
-
-
# Code Hallucination
CODE_HALLUCINATION_PROMPT_TEMPLATE = """
You are given a response generated from a chatbot. Please assess whether the given response includes any computer code (or CLI command) or not. If you do find a code/command, include the line number in which you found the code/command.
@@ -473,4 +471,3 @@
Task data:
[Response]: {response}
"""
-
diff --git a/uptrain/operators/language/prompts/few_shots.py b/uptrain/operators/language/prompts/few_shots.py
index 158c58b51..a56f34603 100644
--- a/uptrain/operators/language/prompts/few_shots.py
+++ b/uptrain/operators/language/prompts/few_shots.py
@@ -545,4 +545,4 @@
"Choice": "A",
"Snippet": "SELECT * FROM hospitals WHERE name = \"St. Mary's Hospital\";"
}
-"""
\ No newline at end of file
+"""
diff --git a/uptrain/operators/language/question_quality.py b/uptrain/operators/language/question_quality.py
index eb9efef4d..fde4c6704 100644
--- a/uptrain/operators/language/question_quality.py
+++ b/uptrain/operators/language/question_quality.py
@@ -15,7 +15,7 @@
@register_op
-class ValidQuestion(ColumnOp):
+class ValidQuestionScore(ColumnOp):
"""
Simply check the number of words in the question and grades as incomplete if below a threshold
Attributes:
@@ -41,16 +41,20 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
try:
for row in data_send:
question = row.pop(self.col_question)
- results.append({"score_valid_question": int(len(question.split(" ")) > self.words_threshold)})
+ results.append(
+ {
+ "score_valid_question": int(
+ len(question.split(" ")) > self.words_threshold
+ )
+ }
+ )
except Exception as e:
- logger.error(f"Failed to run evaluation for `ValidQuestion`: {e}")
+ logger.error(f"Failed to run evaluation for `ValidQuestionScore`: {e}")
raise e
-
+
assert results is not None
return {
"output": data.with_columns(
- pl.from_dicts(results).rename(
- {"score_valid_question": self.col_out}
- )
+ pl.from_dicts(results).rename({"score_valid_question": self.col_out})
)
- }
\ No newline at end of file
+ }
diff --git a/uptrain/operators/language/response_quality.py b/uptrain/operators/language/response_quality.py
index 872a16328..4fb3b3b14 100644
--- a/uptrain/operators/language/response_quality.py
+++ b/uptrain/operators/language/response_quality.py
@@ -77,7 +77,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -90,7 +93,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -223,7 +229,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -236,7 +245,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -366,7 +378,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -378,7 +393,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -507,7 +525,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -519,7 +540,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -646,7 +670,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -658,7 +685,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -775,7 +805,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
# TODO: Add support for local evaluation for all methods
if self.method != "llm":
raise Exception(
@@ -796,7 +829,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["ground_truth"] = row.pop(self.col_ground_truth)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
@@ -834,24 +870,29 @@ def evaluate_local(self, data):
Our methodology is based on the model grade evaluation introduced by openai evals.
"""
- data_precision = copy.deepcopy(pl.DataFrame(data)).rename({
- self.col_response: "response",
- self.col_ground_truth: "context"
- })
- data_recall = copy.deepcopy(pl.DataFrame(data)).rename({
- self.col_ground_truth: "response",
- self.col_response: "context"
- })
- eval_data = pl.concat([data_precision, data_recall.select(data_precision.columns)])
-
- output = ResponseFactualScore(
- col_question=self.col_question,
- col_response="response",
- col_context="context",
- scenario_description=self.scenario_description,
- ).setup(settings=self.settings).run(eval_data)["output"].to_dicts()
- output_precision = output[0:len(data)]
- output_recall = output[len(data):]
+ data_precision = copy.deepcopy(pl.DataFrame(data)).rename(
+ {self.col_response: "response", self.col_ground_truth: "context"}
+ )
+ data_recall = copy.deepcopy(pl.DataFrame(data)).rename(
+ {self.col_ground_truth: "response", self.col_response: "context"}
+ )
+ eval_data = pl.concat(
+ [data_precision, data_recall.select(data_precision.columns)]
+ )
+
+ output = (
+ ResponseFactualScore(
+ col_question=self.col_question,
+ col_response="response",
+ col_context="context",
+ scenario_description=self.scenario_description,
+ )
+ .setup(settings=self.settings)
+ .run(eval_data)["output"]
+ .to_dicts()
+ )
+ output_precision = output[0 : len(data)]
+ output_recall = output[len(data) :]
results = []
for combined_row in zip(output_precision, output_recall):
@@ -877,7 +918,7 @@ def evaluate_local(self, data):
if precision != 0 and recall != 0:
output["score_response_matching"] = 4 * (
- (precision * recall) / (precision*3 + recall)
+ (precision * recall) / (precision * 3 + recall)
)
else:
output["score_response_matching"] = 0
diff --git a/uptrain/operators/language/rouge.py b/uptrain/operators/language/rouge.py
index d39a70322..1913d7ccd 100644
--- a/uptrain/operators/language/rouge.py
+++ b/uptrain/operators/language/rouge.py
@@ -8,13 +8,16 @@
from __future__ import annotations
import typing as t
-from loguru import logger
import polars as pl
from uptrain.framework import Settings
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain.utilities import lazy_load_dep
rouge_scorer = lazy_load_dep("rouge_score.rouge_scorer", "rouge_score")
diff --git a/uptrain/operators/language/subquery.py b/uptrain/operators/language/subquery.py
index d593e6de6..6784439f5 100644
--- a/uptrain/operators/language/subquery.py
+++ b/uptrain/operators/language/subquery.py
@@ -59,7 +59,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -72,7 +75,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["sub_questions"] = row.pop(self.col_sub_questions)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/text.py b/uptrain/operators/language/text.py
index 6a7415837..10993ec2b 100644
--- a/uptrain/operators/language/text.py
+++ b/uptrain/operators/language/text.py
@@ -13,13 +13,16 @@
import typing as t
from urllib.parse import urlparse
-from loguru import logger
import polars as pl
from uptrain.framework import Settings
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
# TODO: Add support for versions without a minor version number (e.g., "v1") or without a patch version number (e.g., "v1.2")
diff --git a/uptrain/operators/language/tone.py b/uptrain/operators/language/tone.py
index b440ecfb0..160fed924 100644
--- a/uptrain/operators/language/tone.py
+++ b/uptrain/operators/language/tone.py
@@ -66,7 +66,10 @@ def setup(self, settings: t.Optional[Settings] = None):
assert settings is not None
self.settings = settings
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
self._api_client = LLMMulticlient(settings)
else:
self._api_client = APIClient(settings)
@@ -78,7 +81,10 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["response"] = row.pop(self.col_response)
try:
- if self.settings.evaluate_locally and (self.settings.uptrain_access_token is None or not len(self.settings.uptrain_access_token)):
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
results = self.evaluate_local(data_send)
else:
results = self._api_client.evaluate(
diff --git a/uptrain/operators/language/topic.py b/uptrain/operators/language/topic.py
index d59c43c67..c0d66d1e2 100644
--- a/uptrain/operators/language/topic.py
+++ b/uptrain/operators/language/topic.py
@@ -9,19 +9,20 @@
from __future__ import annotations
import typing as t
-from loguru import logger
import numpy as np
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
-from uptrain.utilities import lazy_load_dep
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
@register_op
class TopicAssignmentviaCluster(ColumnOp):
-
"""
Operator for assigning topics based on cluster assignments. Note, you should run Clustering operator before using this.
diff --git a/uptrain/operators/metrics.py b/uptrain/operators/metrics.py
index 129675966..27f7d3857 100644
--- a/uptrain/operators/metrics.py
+++ b/uptrain/operators/metrics.py
@@ -6,13 +6,16 @@
from __future__ import annotations
import typing as t
-from loguru import logger
import numpy as np
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
@register_op
diff --git a/uptrain/operators/rca/rag_with_citation.py b/uptrain/operators/rca/rag_with_citation.py
index a653998e3..14a8e01b5 100644
--- a/uptrain/operators/rca/rag_with_citation.py
+++ b/uptrain/operators/rca/rag_with_citation.py
@@ -8,13 +8,25 @@
from loguru import logger
import polars as pl
+import copy
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
from uptrain import RcaTemplate
from uptrain.utilities import polars_to_json_serializable_dict
+from uptrain.operators.language.llm import LLMMulticlient
+from uptrain.operators import (
+ ValidQuestionScore,
+ ResponseFactualScore,
+ ContextRelevance,
+ ValidResponseScore,
+)
@register_op
@@ -43,7 +55,14 @@ def setup(self, settings: t.Optional[Settings] = None):
from uptrain.framework.remote import APIClient
assert settings is not None
- self._api_client = APIClient(settings)
+ self.settings = settings
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
+ self._api_client = LLMMulticlient(settings)
+ else:
+ self._api_client = APIClient(settings)
return self
def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
@@ -55,13 +74,19 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
row["cited_context"] = row.pop(self.col_cited_context)
try:
- results = self._api_client.perform_root_cause_analysis(
- project_name="_internal",
- data=data_send,
- rca_template=RcaTemplate.RAG_WITH_CITATION,
- scenario_description=self.scenario_description,
- metadata={"internal_call": True},
- )
+ if self.settings.evaluate_locally and (
+ self.settings.uptrain_access_token is None
+ or not len(self.settings.uptrain_access_token)
+ ):
+ results = self.evaluate_local(data_send)
+ else:
+ results = self._api_client.perform_root_cause_analysis(
+ project_name="_internal",
+ data=data_send,
+ rca_template=RcaTemplate.RAG_WITH_CITATION,
+ scenario_description=self.scenario_description,
+ metadata={"internal_call": True},
+ )
except Exception as e:
logger.error(
f"Failed to run Root cause analysis for `RagWithCitation`: {e}"
@@ -70,3 +95,158 @@ def run(self, data: pl.DataFrame) -> TYPE_TABLE_OUTPUT:
assert results is not None
return {"output": data.with_columns(pl.from_dicts(results))}
+
+ def evaluate_local(self, data):
+ question_valid_scores = (
+ ValidQuestionScore(col_question="question")
+ .setup(settings=self.settings)
+ .run(pl.DataFrame(data))["output"]
+ .to_dicts()
+ )
+
+ response_valid_scores = (
+ ValidResponseScore(col_response="response")
+ .setup(settings=self.settings)
+ .run(pl.DataFrame(data))["output"]
+ .to_dicts()
+ )
+
+ context_relevance_scores = (
+ ContextRelevance(col_question="question", col_context="context")
+ .setup(settings=self.settings)
+ .run(pl.DataFrame(data))["output"]
+ .to_dicts()
+ )
+
+ factual_accuracy_scores = (
+ ResponseFactualScore(
+ col_question="question", col_context="context", col_response="response"
+ )
+ .setup(settings=self.settings)
+ .run(pl.DataFrame(data))["output"]
+ .to_dicts()
+ )
+
+ data_cited = (
+ copy.deepcopy(pl.DataFrame(data))
+ .drop("context")
+ .rename({"cited_context": "context"})
+ )
+
+ cited_context_relevance_scores = (
+ ContextRelevance(col_question="question", col_context="context")
+ .setup(settings=self.settings)
+ .run(data_cited)["output"]
+ .to_dicts()
+ )
+
+ cited_factual_accuracy_scores = (
+ ResponseFactualScore(
+ col_question="question", col_context="context", col_response="response"
+ )
+ .setup(settings=self.settings)
+ .run(data_cited)["output"]
+ .to_dicts()
+ )
+
+ results = []
+
+ for idx, row in enumerate(data):
+ this_row_scores = [None, None, None, None, None, None]
+ this_row_error = None
+ this_row_suggestion = None
+
+ question_completeness = question_valid_scores[idx]["score_valid_question"]
+ valid_response = response_valid_scores[idx]["score_valid_response"]
+ context_relevance = context_relevance_scores[idx]["score_context_relevance"]
+ factual_accuracy = factual_accuracy_scores[idx]["score_factual_accuracy"]
+ cited_relevance = cited_context_relevance_scores[idx][
+ "score_context_relevance"
+ ]
+ cited_factual = cited_factual_accuracy_scores[idx]["score_factual_accuracy"]
+
+ this_row_explanations = [
+ None,
+ response_valid_scores[idx]["explanation_valid_response"],
+ context_relevance_scores[idx]["explanation_context_relevance"],
+ factual_accuracy_scores[idx]["explanation_factual_accuracy"],
+ cited_context_relevance_scores[idx]["explanation_context_relevance"],
+ cited_factual_accuracy_scores[idx]["explanation_factual_accuracy"],
+ ]
+
+ this_row_scores = [
+ question_completeness,
+ valid_response,
+ context_relevance,
+ factual_accuracy,
+ cited_relevance,
+ cited_factual,
+ ]
+
+ if question_completeness == 0:
+ this_row_scores = [0, 0, 0, 0, 0, 0]
+ this_row_explanations = [
+ None,
+ "Default explanation as the question is incomplete",
+ "Default explanation as the question is incomplete",
+ "Default explanation as the question is incomplete",
+ "Default explanation as the question is incomplete",
+ "Default explanation as the question is incomplete",
+ ]
+ this_row_error = "Incomplete Question"
+ this_row_suggestion = "Ask the user to provide a valid question. In case of an ongoing conversation, rewrite the question by taking previous messages into account."
+ elif valid_response == 0:
+ this_row_scores = [1, 0, context_relevance, 0, 0, 0]
+ this_row_explanations = [
+ None,
+ response_valid_scores[idx]["explanation_valid_response"],
+ context_relevance_scores[idx]["explanation_context_relevance"],
+ "Default explanation as the response doesn't contain any relevant information",
+ "Default explanation as the response doesn't contain any relevant information",
+ "Default explanation as the response doesn't contain any relevant information",
+ ]
+ if context_relevance == 0:
+ this_row_error = "Response With No Information - Poor Retrieval"
+ this_row_suggestion = "Context Retrieval Pipeline needs improvement"
+ else:
+ this_row_error = (
+ "Response With No Information - Poor Context Utilization"
+ )
+ this_row_suggestion = "Add intermediary steps so as the LLM can better understand context and generate a valid response"
+ elif context_relevance == 0:
+ this_row_error = "Poor Retrieval"
+ this_row_suggestion = "Context Retrieval Pipeline needs improvement"
+ elif factual_accuracy == 0:
+ this_row_error = "Hallucinations"
+ this_row_suggestion = "Add instructions to your LLM to adher to the context provide - Try tipping"
+ elif cited_factual == 0:
+ this_row_error = "Poor citation"
+ this_row_suggestion = "LLM is extracting facts from the context which are not cited correctly. Improve the citation quality of LLM by adding more instructions"
+ elif cited_relevance == 0:
+ this_row_error = "Poor Context Utilization"
+ this_row_suggestion = "Add intermediary steps so as the LLM can better understand context and generate a complete response"
+ else:
+ this_row_error = "Others"
+ this_row_suggestion = (
+ "Please reach out to the UpTrain team for further brainstorming"
+ )
+
+ results.append(
+ {
+ "error_mode": this_row_error,
+ "error_resolution_suggestion": this_row_suggestion,
+ "score_question_completeness": this_row_scores[0],
+ "score_valid_response": this_row_scores[1],
+ "explanation_valid_response": this_row_explanations[1],
+ "score_context_relevance": this_row_scores[2],
+ "explanation_context_relevance": this_row_explanations[2],
+ "score_factual_accuracy": this_row_scores[3],
+ "explanation_factual_accuracy": this_row_explanations[3],
+ "score_cited_context_relevance": this_row_scores[4],
+ "explanation_cited_context_relevance": this_row_explanations[4],
+ "score_factual_accuracy_wrt_cited": this_row_scores[5],
+ "explanation_factual_accuracy_wrt_cited": this_row_explanations[5],
+ }
+ )
+
+ return results
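
The local RCA path added above runs four operator checks per batch (ValidQuestionScore, ValidResponseScore, ContextRelevance, ResponseFactualScore), the last two a second time against the cited context, and then labels each row by walking a fixed decision cascade over the resulting scores. A condensed, standalone sketch of just that cascade, assuming the six scores are already computed:

```python
def classify_failure(question_ok, response_ok, ctx_rel, factual, cited_factual, cited_rel):
    """Condensed view of the error-mode cascade in evaluate_local above."""
    if question_ok == 0:
        return "Incomplete Question"
    if response_ok == 0:
        return ("Response With No Information - Poor Retrieval" if ctx_rel == 0
                else "Response With No Information - Poor Context Utilization")
    if ctx_rel == 0:
        return "Poor Retrieval"
    if factual == 0:
        return "Hallucinations"
    if cited_factual == 0:
        return "Poor citation"
    if cited_rel == 0:
        return "Poor Context Utilization"
    return "Others"

print(classify_failure(1, 1, 1, 0, 1, 1))  # -> Hallucinations
```
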
diff --git a/uptrain/operators/similarity.py b/uptrain/operators/similarity.py
index 752909682..2f451010a 100644
--- a/uptrain/operators/similarity.py
+++ b/uptrain/operators/similarity.py
@@ -9,12 +9,15 @@
import typing as t
import numpy as np
-from loguru import logger
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
@register_op
diff --git a/uptrain/operators/table.py b/uptrain/operators/table.py
index 4be99adef..f7456286f 100644
--- a/uptrain/operators/table.py
+++ b/uptrain/operators/table.py
@@ -8,14 +8,17 @@
from __future__ import annotations
import typing as t
-import numpy as np
-from loguru import logger
from pydantic import Field
import polars as pl
if t.TYPE_CHECKING:
from uptrain.framework import Settings
-from uptrain.operators.base import *
+from uptrain.operators.base import (
+ ColumnOp,
+ OpBaseModel,
+ register_op,
+ TYPE_TABLE_OUTPUT,
+)
@register_op
@@ -32,7 +35,7 @@ class Table(OpBaseModel):
props: dict = Field(default_factory=dict)
title: str = ""
- kind = "table"
+ kind: str = "table"
def setup(self, settings: Settings):
return self
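
On the `kind = "table"` to `kind: str = "table"` change: pydantic v2, which the `ConfigDict` migration later in this patch targets, rejects non-annotated attributes on a model class, and the annotation is also what registers `kind` as a real field with a default. A minimal illustration, using a plain `BaseModel` as a stand-in for `OpBaseModel`:

```python
from pydantic import BaseModel

class Table(BaseModel):     # stand-in; the real class derives from OpBaseModel
    title: str = ""
    kind: str = "table"     # annotated -> an actual model field with a default

print(Table(title="scores").model_dump())  # {'title': 'scores', 'kind': 'table'}

# Without the annotation, pydantic v2 refuses the class definition:
# class Bad(BaseModel):
#     kind = "table"        # PydanticUserError: a non-annotated attribute was detected
```
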
diff --git a/uptrain/utilities/__init__.py b/uptrain/utilities/__init__.py
index 7616ac9bf..f5e5dbdcc 100644
--- a/uptrain/utilities/__init__.py
+++ b/uptrain/utilities/__init__.py
@@ -10,7 +10,7 @@
from lazy_loader import load as _lazy_load
from loguru import logger
-import pydantic
+from pydantic import BaseModel
import numpy as np
# import numpy.typing as npt
@@ -53,7 +53,7 @@ def to_py_types(obj: t.Any) -> t.Any:
"op_name": getattr(obj, "_uptrain_op_name"),
"params": obj.dict(include=set(obj.__fields__)),
}
- elif isinstance(obj, pydantic.BaseModel):
+ elif isinstance(obj, BaseModel):
return obj.dict()
# for numpy types
@@ -151,11 +151,11 @@ def polars_to_json_serializable_dict(data: pl.DataFrame):
try:
json.dumps(data_dictn)
- except:
+ except Exception:
for key in list(data_dictn[0].keys()):
try:
json.dumps([x[key] for x in data_dictn])
- except:
+ except Exception:
for row in data_dictn:
del row[key]
@@ -171,7 +171,7 @@ def polars_to_pandas(data: pl.DataFrame):
try:
pd_data = data.to_pandas()
- except:
+ except Exception:
# convert to python native types first and then to pandas
logger.warning(
"Error converting polars to pandas. Trying to convert to python native types first."
@@ -279,7 +279,7 @@ def lazy_load_dep(import_name: str, package_name: str):
"""
try:
spec = importlib.util.find_spec(import_name)
- except:
+ except Exception:
spec = None
if spec is None:
logger.warning(
diff --git a/uptrain/utilities/app_schema.py b/uptrain/utilities/app_schema.py
index c4bfbb260..bc9a2bda3 100644
--- a/uptrain/utilities/app_schema.py
+++ b/uptrain/utilities/app_schema.py
@@ -1,7 +1,5 @@
from __future__ import annotations
-import datetime as dt
-from enum import Enum
import typing as t
from pydantic import BaseModel
@@ -34,6 +32,7 @@ class EvaluateV2(BaseModel):
schema_dict: dict
project: str
+
class EvaluateV3(BaseModel):
model: str
project_name: str
@@ -50,4 +49,3 @@ class ProjectsList(BaseModel):
class ProjectData(BaseModel):
data: list[t.Any]
project_name: str
-
\ No newline at end of file
diff --git a/uptrain/utilities/db.py b/uptrain/utilities/db.py
index c07d57c4b..5aa8cb04b 100644
--- a/uptrain/utilities/db.py
+++ b/uptrain/utilities/db.py
@@ -38,6 +38,7 @@ class ModelDataset(SQLBase):
UniqueConstraint("user_id", "name", "version", name="uix_dataset"),
)
+
class ModelPrompt(SQLBase):
__tablename__ = "prompts"
@@ -52,6 +53,7 @@ class ModelPrompt(SQLBase):
UniqueConstraint("user_id", "name", "version", name="uix_prompt"),
)
+
class ModelUser(SQLBase):
__tablename__ = "users"
diff --git a/uptrain/utilities/utils.py b/uptrain/utilities/utils.py
index 15d4cc87b..fca95d4c2 100644
--- a/uptrain/utilities/utils.py
+++ b/uptrain/utilities/utils.py
@@ -1,57 +1,55 @@
-from datetime import datetime, timedelta
-import io
-import json, os
-import typing as t
+from datetime import datetime
+import os
import polars as pl
-import pandas as pd
-import dateutil.parser
import fsspec
from fsspec.implementations.dirfs import DirFileSystem
-from uptrain import Settings
from uptrain import (
- Evals,
- ResponseMatching,
- GuidelineAdherence,
- ConversationSatisfaction,
+ Evals,
+ ResponseMatching,
+ GuidelineAdherence,
+ ConversationSatisfaction,
JailbreakDetection,
- CritiqueTone
+ CritiqueTone,
)
+from uptrain.utilities import lazy_load_dep
def _get_fsspec_filesystem(database_path) -> fsspec.AbstractFileSystem:
return DirFileSystem(database_path, auto_mkdir=True)
-from uptrain.utilities import lazy_load_dep
+
fsspec.config.conf["file"] = {"auto_mkdir": True}
evals_mapping = {
- "context_relevance" : Evals.CONTEXT_RELEVANCE,
- "factual_accuracy" : Evals.FACTUAL_ACCURACY,
- "response_relevance" : Evals.RESPONSE_RELEVANCE,
- "critique_language" : Evals.CRITIQUE_LANGUAGE,
- "response_completeness" : Evals.RESPONSE_COMPLETENESS,
- "response_completeness_wrt_context" : Evals.RESPONSE_COMPLETENESS_WRT_CONTEXT,
- "response_consistency" : Evals.RESPONSE_CONSISTENCY,
- "response_conciseness" : Evals.RESPONSE_CONCISENESS,
- "valid_response" : Evals.VALID_RESPONSE,
+ "context_relevance": Evals.CONTEXT_RELEVANCE,
+ "factual_accuracy": Evals.FACTUAL_ACCURACY,
+ "response_relevance": Evals.RESPONSE_RELEVANCE,
+ "critique_language": Evals.CRITIQUE_LANGUAGE,
+ "response_completeness": Evals.RESPONSE_COMPLETENESS,
+ "response_completeness_wrt_context": Evals.RESPONSE_COMPLETENESS_WRT_CONTEXT,
+ "response_consistency": Evals.RESPONSE_CONSISTENCY,
+ "response_conciseness": Evals.RESPONSE_CONCISENESS,
+ "valid_response": Evals.VALID_RESPONSE,
"response_alignment_with_scenario": Evals.RESPONSE_ALIGNMENT_WITH_SCENARIO,
- "response_sincerity_with_scenario" : Evals.RESPONSE_SINCERITY_WITH_SCENARIO,
- "prompt_injection" : Evals.PROMPT_INJECTION,
- "code_hallucination" : Evals.CODE_HALLUCINATION,
- "sub_query_completeness" : Evals.SUB_QUERY_COMPLETENESS,
- "context_reranking" : Evals.CONTEXT_RERANKING,
- "context_conciseness ": Evals.CONTEXT_CONCISENESS
+ "response_sincerity_with_scenario": Evals.RESPONSE_SINCERITY_WITH_SCENARIO,
+ "prompt_injection": Evals.PROMPT_INJECTION,
+ "code_hallucination": Evals.CODE_HALLUCINATION,
+ "sub_query_completeness": Evals.SUB_QUERY_COMPLETENESS,
+ "context_reranking": Evals.CONTEXT_RERANKING,
+ "context_conciseness ": Evals.CONTEXT_CONCISENESS,
}
parametric_evals_mapping = {
- "CritiqueTone" : CritiqueTone,
- "GuidelineAdherence" : GuidelineAdherence,
- "ConversationSatisfaction" : ConversationSatisfaction,
- "ResponseMatching" : ResponseMatching,
- "JailbreakDetection" : JailbreakDetection
+ "CritiqueTone": CritiqueTone,
+ "GuidelineAdherence": GuidelineAdherence,
+ "ConversationSatisfaction": ConversationSatisfaction,
+ "ResponseMatching": ResponseMatching,
+ "JailbreakDetection": JailbreakDetection,
}
+
+
def checks_mapping(check_name: str, params: dict = dict()):
if check_name in evals_mapping:
return evals_mapping[check_name]
@@ -59,12 +57,14 @@ def checks_mapping(check_name: str, params: dict = dict()):
return parametric_evals_mapping[check_name](**params)
else:
return None
-
+
+
def get_uuid():
import uuid
return str(uuid.uuid4().hex)
+
def get_current_datetime():
return datetime.utcnow()
@@ -74,6 +74,7 @@ def hash_string(s: str):
return hashlib.sha256(s.encode()).hexdigest()
+
def create_dirs(path: str):
dirs_to_create = [
os.path.join(path),
@@ -84,6 +85,7 @@ def create_dirs(path: str):
os.makedirs(_dir, exist_ok=True)
return
+
def get_sqlite_utils_db(fpath: str):
sqlite = lazy_load_dep("sqlite_utils", "sqlite_utils")
import sqlite3
@@ -91,43 +93,46 @@ def get_sqlite_utils_db(fpath: str):
conn = sqlite3.connect(fpath, check_same_thread=False)
return sqlite.Database(conn)
+
def parse_prompt(prompt):
prompt_vars = []
if prompt is not None and len(prompt):
- if '{{' in prompt:
+ if "{{" in prompt:
prompt_vars = [x.split("}}")[0] for x in prompt.split("{{")[1:]]
for var in prompt_vars:
prompt = prompt.replace("{{" + var + "}}", "{" + var + "}")
- elif '{' in prompt:
+ elif "{" in prompt:
prompt_vars = [x.split("}")[0] for x in prompt.split("{")[1:]]
else:
- prompt = ''
+ prompt = ""
return prompt, prompt_vars
+
def convert_project_to_polars(project_data):
dictn = []
for row in project_data:
- data = row['data']
- data.update(row['checks'])
- if 'uptrain_settings' in row['metadata']:
- del row['metadata']['uptrain_settings']
- data.update(row['metadata'])
- data.update({'project_name': row['project'], 'timestamp': row['timestamp']})
+ data = row["data"]
+ data.update(row["checks"])
+ if "uptrain_settings" in row["metadata"]:
+ del row["metadata"]["uptrain_settings"]
+ data.update(row["metadata"])
+ data.update({"project_name": row["project"], "timestamp": row["timestamp"]})
dictn.append(data)
return pl.DataFrame(dictn)
+
def convert_project_to_dicts(project_data):
dictn = []
checks_mapping = {}
for row in project_data:
- data = row['data']
- #data.update(row['checks'])
+ data = row["data"]
+ # data.update(row['checks'])
uuid_tag = get_uuid()
- data.update({'uuid_tag': uuid_tag})
- checks_mapping[uuid_tag] = row['checks']
- if 'uptrain_settings' in row['metadata']:
- del row['metadata']['uptrain_settings']
- data.update(row['metadata'])
- data.update({'project_name': row['project'], 'timestamp': row['timestamp']})
+ data.update({"uuid_tag": uuid_tag})
+ checks_mapping[uuid_tag] = row["checks"]
+ if "uptrain_settings" in row["metadata"]:
+ del row["metadata"]["uptrain_settings"]
+ data.update(row["metadata"])
+ data.update({"project_name": row["project"], "timestamp": row["timestamp"]})
dictn.append(data)
return pl.DataFrame(dictn), checks_mapping
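
For reference, `checks_mapping` above resolves a lower-snake-case name to an `Evals` member, instantiates a parametric check class from its params dict, and returns `None` otherwise. A small usage sketch; the parametric keyword below is illustrative and depends on `GuidelineAdherence`'s actual signature:

```python
from uptrain import Evals
from uptrain.utilities.utils import checks_mapping  # module path as per this patch

assert checks_mapping("factual_accuracy") is Evals.FACTUAL_ACCURACY
assert checks_mapping("not_a_registered_check") is None

# Parametric checks are constructed with their params dict; the keyword is illustrative.
guideline_check = checks_mapping(
    "GuidelineAdherence", params={"guideline": "Always cite the retrieved context"}
)
```
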
diff --git a/uptrain/v0/core/classes/helpers/config_handler.py b/uptrain/v0/core/classes/helpers/config_handler.py
index 27733e6c0..f4e95dd31 100644
--- a/uptrain/v0/core/classes/helpers/config_handler.py
+++ b/uptrain/v0/core/classes/helpers/config_handler.py
@@ -2,7 +2,7 @@
import typing
from datetime import datetime
-from pydantic import BaseModel
+from pydantic import ConfigDict, BaseModel
import numpy as np
from uptrain.v0.constants import AnnotationMethod
@@ -82,6 +82,4 @@ class Config(BaseModel):
class GroundTruthArgs(BaseModel):
gt: typing.Union[np.ndarray, list]
id: typing.Union[np.ndarray, list]
-
- class Config:
- arbitrary_types_allowed = True
+ model_config = ConfigDict(arbitrary_types_allowed=True)
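
The `GroundTruthArgs` change above is the standard pydantic v1 to v2 migration for model configuration: the nested `class Config` is replaced by a `model_config = ConfigDict(...)` attribute. A minimal before/after sketch with an arbitrary (non-pydantic) field type, which is exactly what `arbitrary_types_allowed` exists for; field types are simplified here:

```python
import numpy as np
from pydantic import BaseModel, ConfigDict

# pydantic v2 style, as adopted above (field types simplified to np.ndarray).
class GroundTruthArgs(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    gt: np.ndarray
    id: np.ndarray

args = GroundTruthArgs(gt=np.array([1, 0, 1]), id=np.array([10, 11, 12]))
print(int(args.gt.sum()))  # 2

# pydantic v1 equivalent that the diff removes:
# class GroundTruthArgs(BaseModel):
#     gt: np.ndarray
#     id: np.ndarray
#     class Config:
#         arbitrary_types_allowed = True
```
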
diff --git a/uptrain/v0/core/classes/monitors/data_drift.py b/uptrain/v0/core/classes/monitors/data_drift.py
index ac55c1214..e713f0fef 100644
--- a/uptrain/v0/core/classes/monitors/data_drift.py
+++ b/uptrain/v0/core/classes/monitors/data_drift.py
@@ -118,9 +118,9 @@ def check(self, inputs, outputs, gts=None, extra_args={}):
] = uniq_indexs
query_indexs[uniq_clusters] = uniq_indexs
query_indexs = np.array(query_indexs, dtype=int)
- self.bucket_labelling_info[
- "hover_vals_for_production_clusters"
- ] = hover_measurable_vals[query_indexs]
+ self.bucket_labelling_info["hover_vals_for_production_clusters"] = (
+ hover_measurable_vals[query_indexs]
+ )
self.prod_dist_counts_arr.append(self.prod_dist_counts.copy())
@@ -283,9 +283,9 @@ def base_is_data_interesting(self, inputs, outputs, gts=None, extra_args={}):
is_interesting = np.logical_or(is_close, is_interesting)
for lkdx in range(len(is_close)):
if is_close[lkdx]:
- reasons[
- lkdx
- ] = "Lies_to_Low_Density_Regions_In_Training_Distribution"
+ reasons[lkdx] = (
+ "Lies_to_Low_Density_Regions_In_Training_Distribution"
+ )
if len(self.outliers):
dists_from_outliers = np.min(
@@ -403,9 +403,9 @@ def bucket_reference_dataset(self):
clustering_results["idxs_closest_to_cluster_centroids"].values()
)
]
- self.bucket_labelling_info[
- "hover_vals_for_reference_clusters"
- ] = all_hover_vals[hover_label_idxs]
+ self.bucket_labelling_info["hover_vals_for_reference_clusters"] = (
+ all_hover_vals[hover_label_idxs]
+ )
self.prod_dist = np.zeros(self.ref_dist.shape)
self.prod_dist_counts = np.zeros(self.ref_dist_counts.shape)
diff --git a/uptrain/v0/core/classes/monitors/feature_drift.py b/uptrain/v0/core/classes/monitors/feature_drift.py
index 509ed0b31..e9f2b80b0 100644
--- a/uptrain/v0/core/classes/monitors/feature_drift.py
+++ b/uptrain/v0/core/classes/monitors/feature_drift.py
@@ -77,7 +77,9 @@ def check(self, inputs, outputs, gts=None, extra_args={}):
if psi > self.psi_threshold:
alert = f"Feature Drift last detected at {self.all_count} for {feat_name} with PSI = {psi}"
self.log_handler.add_alert(
- f"Feature Drift Alert for {feat_name} 🚨", alert, self.dashboard_name
+ f"Feature Drift Alert for {feat_name} 🚨",
+ alert,
+ self.dashboard_name,
)
self.feats = np.array([])