diff --git a/.gitignore b/.gitignore index fa1c218fb..585a44c5f 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,5 @@ credentials.toml # Generated Files, not enabling this yet # trulens_eval/generated_files/*.{py,md,ipynb} + +**/*.sqlite \ No newline at end of file diff --git a/trulens_eval/.gitignore b/trulens_eval/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/trulens_eval/trulens_eval/tests/groundedness_smoke_tests.ipynb b/trulens_eval/trulens_eval/tests/groundedness_smoke_tests.ipynb index b61ffcbde..45ecd6878 100644 --- a/trulens_eval/trulens_eval/tests/groundedness_smoke_tests.ipynb +++ b/trulens_eval/trulens_eval/tests/groundedness_smoke_tests.ipynb @@ -13,24 +13,14 @@ "SummEval is one of the datasets dedicated to automated evaluations on summarization tasks, which are closely related to the groundedness evaluation in RAG with the retrieved context (i.e. the source) and response (i.e. the summary). It contains human annotation of numerical score (**1** to **5**) comprised of scoring from 3 human expert annotators and 5 croweded-sourced annotators. There are 16 models being used for generation in total for 100 paragraphs in the test set, so there are a total of 16,000 machine-generated summaries. Each paragraph also has several human-written summaries for comparative analysis. \n", "\n", "\n", - "For evaluating groundedness feedback functions, we calculate the annotated \"relevance\" and \"consistency (aka factuality)\" scores with equal weights and normalized to **0** to **1** score to match the output of feedback functions." + "For evaluating groundedness feedback functions, we compute the annotated \"consistency\" scores, a measure of whether the summarized response is factually consisntent with the source texts and hence can be used as a proxy to evaluate groundedness in our RAG triad, and normalized to **0** to **1** score as our **expected_score** and to match the output of feedback functions." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🦑 Tru initialized with db url sqlite:///default.sqlite .\n", - "🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.\n", - "Deleted 0 rows.\n" - ] - } - ], + "outputs": [], "source": [ "# Import groundedness feedback function\n", "from trulens_eval.feedback import GroundTruthAgreement, Groundedness\n", @@ -40,17 +30,24 @@ "Tru().reset_database()\n", "\n", "# generator for groundedness golden set\n", - "test_cases_gen = generate_summeval_groundedness_golden_set(\"./datasets/summeval_test_100.json\")\n", - "\n", - "# generate x number of test cases\n", + "test_cases_gen = generate_summeval_groundedness_golden_set(\"./datasets/summeval_test_100.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# specify the number of test cases we want to run the smoke test on\n", "groundedness_golden_set = []\n", - "for i in range(50):\n", + "for i in range(100):\n", " groundedness_golden_set.append(next(test_cases_gen))" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -58,27 +55,33 @@ "text/plain": [ "[{'query': '(CNN)Donald Sterling\\'s racist remarks cost him an NBA team last year. But now it\\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\\'s wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O\\'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano\\'s gifts from Donald Sterling didn\\'t just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling\\'s downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don\\'t have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling\\'s friends can see. \"Admire him, bring him here, feed him, f**k him, but don\\'t put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling\\'s claims vs. reality CNN\\'s Dottie Evans contributed to this report.',\n", " 'response': \"donald sterling , nba team last year . sterling 's wife sued for $ 2.6 million in gifts . sterling says he is the former female companion who has lost the . sterling has ordered v. stiviano to pay back $ 2.6 m in gifts after his wife sued . sterling also includes a $ 391 easter bunny costume , $ 299 and a $ 299 .\",\n", - " 'expected_score': 0.27},\n", + " 'expected_score': 0.2},\n", " {'query': '(CNN)Donald Sterling\\'s racist remarks cost him an NBA team last year. But now it\\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\\'s wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O\\'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano\\'s gifts from Donald Sterling didn\\'t just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling\\'s downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don\\'t have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling\\'s friends can see. \"Admire him, bring him here, feed him, f**k him, but don\\'t put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling\\'s claims vs. reality CNN\\'s Dottie Evans contributed to this report.',\n", " 'response': \"donald sterling accused stiviano of targeting extremely wealthy older men . she claimed donald sterling used the couple 's money to buy stiviano a ferrari , two bentleys and a range rover . stiviano countered that there was nothing wrong with donald sterling giving her gifts .\",\n", - " 'expected_score': 0.4},\n", + " 'expected_score': 0.47},\n", " {'query': '(CNN)Donald Sterling\\'s racist remarks cost him an NBA team last year. But now it\\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\\'s wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O\\'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano\\'s gifts from Donald Sterling didn\\'t just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling\\'s downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don\\'t have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling\\'s friends can see. \"Admire him, bring him here, feed him, f**k him, but don\\'t put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling\\'s claims vs. reality CNN\\'s Dottie Evans contributed to this report.',\n", " 'response': \"a los angeles judge has ordered v. stiviano to pay back more than $ 2.6 million in gifts after sterling 's wife sued her . -lrb- cnn -rrb- donald sterling 's racist remarks cost him an nba team last year . but now it 's his former female companion who has lost big . who is v. stiviano ? .\",\n", - " 'expected_score': 0.7}]" + " 'expected_score': 0.93},\n", + " {'query': '(CNN)Donald Sterling\\'s racist remarks cost him an NBA team last year. But now it\\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\\'s wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O\\'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano\\'s gifts from Donald Sterling didn\\'t just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling\\'s downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don\\'t have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling\\'s friends can see. \"Admire him, bring him here, feed him, f**k him, but don\\'t put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling\\'s claims vs. reality CNN\\'s Dottie Evans contributed to this report.',\n", + " 'response': \"donald sterling 's wife sued stiviano of targeting extremely wealthy older men . she claimed donald sterling used the couple 's money to buy stiviano a ferrari , bentleys and a range rover . stiviano 's gifts from donald sterling did n't just include uber-expensive items like luxury cars .\",\n", + " 'expected_score': 1.0},\n", + " {'query': '(CNN)Donald Sterling\\'s racist remarks cost him an NBA team last year. But now it\\'s his former female companion who has lost big. A Los Angeles judge has ordered V. Stiviano to pay back more than $2.6 million in gifts after Sterling\\'s wife sued her. In the lawsuit, Rochelle \"Shelly\" Sterling accused Stiviano of targeting extremely wealthy older men. She claimed Donald Sterling used the couple\\'s money to buy Stiviano a Ferrari, two Bentleys and a Range Rover, and that he helped her get a $1.8 million duplex. Who is V. Stiviano? Stiviano countered that there was nothing wrong with Donald Sterling giving her gifts and that she never took advantage of the former Los Angeles Clippers owner, who made much of his fortune in real estate. Shelly Sterling was thrilled with the court decision Tuesday, her lawyer told CNN affiliate KABC. \"This is a victory for the Sterling family in recovering the $2,630,000 that Donald lavished on a conniving mistress,\" attorney Pierce O\\'Donnell said in a statement. \"It also sets a precedent that the injured spouse can recover damages from the recipient of these ill-begotten gifts.\" Stiviano\\'s gifts from Donald Sterling didn\\'t just include uber-expensive items like luxury cars. According to the Los Angeles Times, the list also includes a $391 Easter bunny costume, a $299 two-speed blender and a $12 lace thong. Donald Sterling\\'s downfall came after an audio recording surfaced of the octogenarian arguing with Stiviano. In the tape, Sterling chastises Stiviano for posting pictures on social media of her posing with African-Americans, including basketball legend Magic Johnson. \"In your lousy f**ing Instagrams, you don\\'t have to have yourself with -- walking with black people,\" Sterling said in the audio first posted by TMZ. He also tells Stiviano not to bring Johnson to Clippers games and not to post photos with the Hall of Famer so Sterling\\'s friends can see. \"Admire him, bring him here, feed him, f**k him, but don\\'t put (Magic) on an Instagram for the world to have to see so they have to call me,\" Sterling said. NBA Commissioner Adam Silver banned Sterling from the league, fined him $2.5 million and pushed through a charge to terminate all of his ownership rights in the franchise. Fact check: Donald Sterling\\'s claims vs. reality CNN\\'s Dottie Evans contributed to this report.',\n", + " 'response': \"donald sterling 's racist remarks cost him an nba team last year . but now it 's his former female companion who has lost big . a judge has ordered v. stiviano to pay back more than $ 2.6 million in gifts .\",\n", + " 'expected_score': 1.0}]" ] }, - "execution_count": 2, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "groundedness_golden_set[:3]" + "groundedness_golden_set[:5]\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -96,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -138,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -160,22 +163,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ added app groundedness huggingface\n", - "✅ added feedback definition feedback_definition_hash_ca9bbb2338965a9d34e17ae53f641d9c\n", - "✅ added app groundedness openai\n", - "✅ added feedback definition feedback_definition_hash_ca9bbb2338965a9d34e17ae53f641d9c\n", - "✅ added app groundedness openai gpt4\n", - "✅ added feedback definition feedback_definition_hash_ca9bbb2338965a9d34e17ae53f641d9c\n" - ] - } - ], + "outputs": [], "source": [ "tru_wrapped_groundedness_hug = TruBasicApp(wrapped_groundedness_hug, app_id = \"groundedness huggingface\", feedbacks=[f_mae])\n", "tru_wrapped_groundedness_openai = TruBasicApp(wrapped_groundedness_openai, app_id = \"groundedness openai gpt-3.5\", feedbacks=[f_mae])\n", @@ -201,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -238,36 +228,36 @@ " \n", " \n", " \n", - " groundedness huggingface\n", - " 0.251471\n", - " 2.4\n", - " 0.000000\n", + " groundedness openai gpt-4\n", + " 0.088000\n", + " 3.59\n", + " 0.028865\n", " \n", " \n", - " groundedness openai\n", - " 2.371200\n", - " 2.4\n", - " 0.001344\n", + " groundedness openai gpt-3.5\n", + " 0.185600\n", + " 3.59\n", + " 0.001405\n", " \n", " \n", - " groundedness openai gpt4\n", - " 2.371200\n", - " 2.4\n", - " 0.001464\n", + " groundedness huggingface\n", + " 0.239318\n", + " 3.59\n", + " 0.000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Mean Absolute Error latency total_cost\n", - "app_id \n", - "groundedness huggingface 0.251471 2.4 0.000000\n", - "groundedness openai 2.371200 2.4 0.001344\n", - "groundedness openai gpt4 2.371200 2.4 0.001464" + " Mean Absolute Error latency total_cost\n", + "app_id \n", + "groundedness openai gpt-4 0.088000 3.59 0.028865\n", + "groundedness openai gpt-3.5 0.185600 3.59 0.001405\n", + "groundedness huggingface 0.239318 3.59 0.000000" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -293,7 +283,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/trulens_eval/trulens_eval/tests/test_cases.py b/trulens_eval/trulens_eval/tests/test_cases.py index 5a47cdb2b..6b9a89d52 100644 --- a/trulens_eval/trulens_eval/tests/test_cases.py +++ b/trulens_eval/trulens_eval/tests/test_cases.py @@ -215,8 +215,33 @@ def calculate_expected_score(normalized_metrics_lst, weights_lst): ) + def generate_summeval_groundedness_golden_set(file_path): + with open(file_path, 'r') as f: + data = json.load(f) + + for item in data["rows"]: + row = item["row"] + + assert ( + len(row["machine_summaries"]) == len(row["consistency"])) + + for i in range(len(row["machine_summaries"])): + yield { + "query": + row["text"], + "response": + row["machine_summaries"][i], + "expected_score": + calculate_expected_score( + [ + row["consistency"][i] / 5, # normalize to [0, 1] + ], + [1.0] + ) + } +def generate_summeval_answer_relevance_golden_set(file_path): with open(file_path, 'r') as f: data = json.load(f) @@ -224,8 +249,7 @@ def generate_summeval_groundedness_golden_set(file_path): row = item["row"] assert ( - len(row["machine_summaries"]) == len(row["relevance"]) == - len(row["consistency"]) + len(row["machine_summaries"]) == len(row["relevance"]) == len(row["human_summaries"]) ) for i in range(len(row["machine_summaries"])): @@ -238,8 +262,7 @@ def generate_summeval_groundedness_golden_set(file_path): calculate_expected_score( [ row["relevance"][i] / 5, # normalize to [0, 1] - row["consistency"][i] / 5 ], - [1.0, 1.0] # use uniform weights for now + [1.0] ) - } \ No newline at end of file + }