Ragas Evaluation For Multi Modes #806

Merged
merged 21 commits into DEV from ragas-evaluation-for-multi-modes on Oct 18, 2024

21 commits
b7dc1e9  Updated models for ragas eval (kaustubh-darekar, Oct 17, 2024)
f177c8e  context utilization metrics removed (kaustubh-darekar, Oct 17, 2024)
74f4afa  updated supported llms for ragas (kartikpersistent, Oct 17, 2024)
8147b29  Merge branch 'ragas_model_update' of https://github.com/neo4j-labs/ll… (kartikpersistent, Oct 17, 2024)
ae2f957  removed context utilization (kartikpersistent, Oct 17, 2024)
b511079  Implemented Parallel API (kartikpersistent, Oct 17, 2024)
5127910  multi api calls error resolved (kaustubh-darekar, Oct 17, 2024)
247ec49  Merge branch 'DEV' of https://github.com/neo4j-labs/llm-graph-builder… (kartikpersistent, Oct 17, 2024)
9a529b5  MultiMode Metrics (kartikpersistent, Oct 17, 2024)
678c08f  Fix: Metric Evaluation For Single Mode (kartikpersistent, Oct 18, 2024)
6a2d226  multi modes ragas evaluation (kaustubh-darekar, Oct 18, 2024)
93756d0  api payload changes (kartikpersistent, Oct 18, 2024)
77cb004  metric api output format changed (kaustubh-darekar, Oct 18, 2024)
6486420  multi mode ragas changes (kaustubh-darekar, Oct 18, 2024)
a2223cd  removed pre process dataset (kaustubh-darekar, Oct 18, 2024)
00af914  api response changes (kartikpersistent, Oct 18, 2024)
46d625e  Merge branch 'ragas-evaluation-for-multi-modes' of https://github.com… (kartikpersistent, Oct 18, 2024)
113c0a6  Multimode metrics api integration (kartikpersistent, Oct 18, 2024)
dbf1cd1  nan error for no answer resolved (kaustubh-darekar, Oct 18, 2024)
a6de26d  QA integration changes (kartikpersistent, Oct 18, 2024)
84c9d01  Merge branch 'DEV' into ragas-evaluation-for-multi-modes (kartikpersistent, Oct 18, 2024)

42 changes: 26 additions & 16 deletions backend/score.py
@@ -786,24 +786,34 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas…
         gc.collect()
 
 @app.post('/metric')
-async def calculate_metric(question=Form(), context=Form(), answer=Form(), model=Form()):
+async def calculate_metric(question: str = Form(),
+                           context: str = Form(),
+                           answer: str = Form(),
+                           model: str = Form(),
+                           mode: str = Form()):
     try:
-        payload_json_obj = {'api_name':'metric', 'context':context, 'answer':answer, 'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc))}
-        logger.log_struct(payload_json_obj, "INFO")
-        result = await asyncio.to_thread(get_ragas_metrics, question, context, answer, model)
-        if result is None or "error" in result:
-            return create_api_response(
-                'Failed',
-                message='Failed to calculate evaluation metrics.',
-                error=result.get("error", "Ragas evaluation returned null")
-            )
-        return create_api_response('Success', data=result)
+        context_list = [str(item).strip() for item in json.loads(context)] if context else []
+        answer_list = [str(item).strip() for item in json.loads(answer)] if answer else []
+        mode_list = [str(item).strip() for item in json.loads(mode)] if mode else []
+
+        result = await asyncio.to_thread(
+            get_ragas_metrics, question, context_list, answer_list, model
+        )
+        if result is None or "error" in result:
+            return create_api_response(
+                'Failed',
+                message='Failed to calculate evaluation metrics.',
+                error=result.get("error", "Ragas evaluation returned null")
+            )
+        data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
+        return create_api_response('Success', data=data)
     except Exception as e:
-        job_status = "Failed"
-        message = "Error while calculating evaluation metrics"
-        error_message = str(e)
-        logging.exception(f'{error_message}')
-        return create_api_response(job_status, message=message, error=error_message)
+        logging.exception(f"Error while calculating evaluation metrics: {e}")
+        return create_api_response(
+            'Failed',
+            message="Error while calculating evaluation metrics",
+            error=str(e)
+        )
     finally:
         gc.collect()
 
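For reference, a minimal client-side sketch of how the updated /metric endpoint can be called. The base URL, question, mode names, and scores below are illustrative assumptions rather than part of this PR; the point is that context, answer, and mode are sent as JSON-encoded parallel form fields, and the response groups the metrics per mode.

# Hypothetical call to the updated /metric endpoint (URL, mode names, and values
# are assumptions for illustration only).
import json
import requests

payload = {
    "question": "What is Neo4j?",
    # context, answer, and mode are parallel lists, JSON-encoded as form fields
    "context": json.dumps(["Neo4j is a graph database ...", "Neo4j stores nodes and relationships ..."]),
    "answer": json.dumps(["Neo4j is a graph database.", "Neo4j is a native graph database."]),
    "model": "openai_gpt_4o",
    "mode": json.dumps(["vector", "graph_vector"]),
}

response = requests.post("http://localhost:8000/metric", data=payload)
print(response.json())
# Expected response shape (illustrative):
# {"status": "Success",
#  "data": {"vector":       {"faithfulness": 0.95, "answer_relevancy": 0.91},
#           "graph_vector": {"faithfulness": 0.97, "answer_relevancy": 0.93}}}
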
2 changes: 1 addition & 1 deletion backend/src/QA_integration.py
@@ -435,7 +435,7 @@ def process_chat_response(messages, history, question, model, graph, document_na…
     total_tokens = 0
     formatted_docs = ""
 
-    question = transformed_question if transformed_question else question
+    # question = transformed_question if transformed_question else question
     # metrics = get_ragas_metrics(question,formatted_docs,content)
     # print(metrics)
 
78 changes: 17 additions & 61 deletions backend/src/ragas_eval.py
@@ -1,96 +1,52 @@
 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
 
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
 
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
 
         logging.info(f"Evaluating with model: {model_name}")
 
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
 
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
             .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:
(rest of the backend/src/ragas_eval.py diff not shown)
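
To make the new output contract concrete, here is a small standalone sketch of how the list-oriented dictionary produced by get_ragas_metrics (one score per answer/context pair, via to_dict(orient="list")) is regrouped per chat mode by the /metric endpoint. The mode names and scores are made up for illustration.

# Standalone sketch of the per-mode regrouping done in backend/score.py.
# Mode names and scores are illustrative.
mode_list = ["vector", "graph_vector", "fulltext"]

# Shape returned by get_ragas_metrics after to_dict(orient="list"):
# one value per evaluated answer, in input order.
result = {
    "faithfulness":     [0.95, 0.97, 0.88],
    "answer_relevancy": [0.91, 0.93, 0.86],
}

# Same dict comprehension as in the /metric endpoint above.
data = {
    mode: {metric: result[metric][i] for metric in result}
    for i, mode in enumerate(mode_list)
}
print(data["graph_vector"])  # {'faithfulness': 0.97, 'answer_relevancy': 0.93}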