Open
Description
According to the documentation:
https://docs.ragas.io/en/stable/howtos/customizations/testgenerator/_language_adaptation/
import asyncio
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.openai_like import OpenAILike
from ragas.embeddings import LlamaIndexEmbeddingsWrapper
from ragas.llms import LlamaIndexLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.single_hop.specific import (
SingleHopSpecificQuerySynthesizer,
)
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter
from settings import settings
# set_global_handler("simple")  (left disabled)

# Load the sample corpus directory through LlamaIndex's SimpleDirectoryReader.
corpus_reader = SimpleDirectoryReader("./Sample_non_english_corpus")
documents = corpus_reader.load_data()
# LLM used for generation: an OpenAILike client configured from `settings`
# and flagged as a chat model, wrapped for use by ragas.
openai_like_llm = OpenAILike(
    model=settings.OPENAI_MODEL,
    api_key=settings.OPENAI_API_KEY,
    api_base=settings.OPENAI_BASE_URL,
    is_chat_model=True,
)
generator_llm = LlamaIndexLLMWrapper(openai_like_llm)

# Embedding model: an Ollama embedding named by `settings.EMBED_MODEL`,
# wrapped for use by ragas.
ollama_embedding = OllamaEmbedding(model_name=settings.EMBED_MODEL)
generator_embeddings = LlamaIndexEmbeddingsWrapper(ollama_embedding)
# The single persona handed to the test set generator.
curious_student = Persona(
    name="curious_student",
    role_description="A student who is curious about the world and wants to learn more about different cultures and languages",
)
personas = [curious_student]

# Transforms passed to the generation call, in this order.
# NOTE(review): HeadlineSplitter presumably relies on a "headlines" property
# produced by an earlier extractor step - confirm against the ragas docs.
headline_splitter = HeadlineSplitter()
ner_extractor = NERExtractor(llm=generator_llm)
transforms = [headline_splitter, ner_extractor]

# Generator wired to the wrapped LLM, the wrapped embeddings and the personas.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    persona_list=personas,
)

# Query distribution: one single-hop specific synthesizer with weight 1.0.
single_hop_synthesizer = SingleHopSpecificQuerySynthesizer(llm=generator_llm)
distribution = [(single_hop_synthesizer, 1.0)]
# Language every synthesizer's prompts are adapted to before generation.
TARGET_LANGUAGE = "chinese"

for query, _ in distribution:
    # adapt_prompts is awaited via asyncio.run; its result is then installed
    # on the same synthesizer as keyword arguments to set_prompts.
    adaptation = query.adapt_prompts(TARGET_LANGUAGE, llm=generator_llm)
    prompts = asyncio.run(adaptation)
    query.set_prompts(**prompts)
# Generate a five-sample test set from the loaded documents, using the
# transforms and query distribution configured earlier in this script.
testset = generator.generate_with_llamaindex_docs(
    documents,
    testset_size=5,
    transforms=transforms,
    query_distribution=distribution,
)
# df = testset.to_pandas()
# print(df.head())
dataset = testset.to_evaluation_dataset()

# Fetch the first sample once. If generation produced no samples, stop with a
# readable message instead of an unexplained IndexError traceback.
try:
    first_sample = dataset[0]
except IndexError:
    raise SystemExit(
        "No samples were generated; there is no query/reference to display."
    ) from None

print("Query:", first_sample.user_input)
print("Reference:", first_sample.reference)

# Optional Excel export, left disabled. Re-enabling it also needs `df` from the
# commented-out lines after the generation call, plus `os` and `datetime`,
# which the visible imports do not include.
# os.makedirs("outputs", exist_ok=True)
# df.to_excel(
# f"outputs/medical_insurance_testset-{datetime.now().strftime('%Y%m%d%H%M%S')}.xlsx"
# )