Skip to content

Non-English QA generation is not working #1970

Open
@whisper-bye

Description

@whisper-bye

According to the documentation:
https://docs.ragas.io/en/stable/howtos/customizations/testgenerator/_language_adaptation/

import asyncio

from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.openai_like import OpenAILike
from ragas.embeddings import LlamaIndexEmbeddingsWrapper
from ragas.llms import LlamaIndexLLMWrapper
from ragas.testset import TestsetGenerator
from ragas.testset.persona import Persona
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter

from settings import settings

# set_global_handler("simple")

# Reproduction script: build a Ragas testset from a local corpus after
# adapting the query synthesizer's prompts to Chinese, then print the first
# generated sample.

# Load every document found under ./Sample_non_english_corpus.
documents = SimpleDirectoryReader(
    "./Sample_non_english_corpus"
).load_data()

# LLM used for generation: an OpenAI-compatible chat endpoint configured from
# the local `settings` module and wrapped for use by Ragas.
generator_llm = LlamaIndexLLMWrapper(
    OpenAILike(
        model=settings.OPENAI_MODEL,
        api_key=settings.OPENAI_API_KEY,
        api_base=settings.OPENAI_BASE_URL,
        is_chat_model=True,
    )
)

# Embedding model served through Ollama, wrapped for use by Ragas.
generator_embeddings = LlamaIndexEmbeddingsWrapper(
    OllamaEmbedding(
        model_name=settings.EMBED_MODEL,
    )
)

# A single persona is supplied explicitly. Its name and role description are
# written in English and are not passed through any language adaptation here.
personas = [
    Persona(
        name="curious_student",
        role_description="A student who is curious about the world and wants to learn more about different cultures and languages",
    ),
]

# Transforms handed to the generator below.
# NOTE(review): only the synthesizer prompts are adapted to Chinese further
# down; nothing in this script adapts the prompts of these transforms --
# confirm whether that is intended.
transforms = [HeadlineSplitter(), NERExtractor(llm=generator_llm)]

generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    persona_list=personas,
)

# One synthesizer paired with the value 1.0 (presumably its sampling weight --
# TODO confirm against the Ragas query_distribution documentation).
distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]
# Adapt each synthesizer's prompts to Chinese and install the adapted prompts
# on that synthesizer. asyncio.run() is invoked once per synthesizer, so every
# iteration starts and closes its own event loop.
for query, _ in distribution:
    prompts = asyncio.run(query.adapt_prompts("chinese", llm=generator_llm))
    query.set_prompts(**prompts)


# Generate a 5-sample testset from the loaded documents using the transforms
# and the (prompt-adapted) query distribution defined above.
testset = generator.generate_with_llamaindex_docs(
    documents,
    testset_size=5,
    transforms=transforms,
    query_distribution=distribution,
)

# df = testset.to_pandas()
# print(df.head())

# Convert to an evaluation dataset and print the first sample's query and
# reference so the output language can be inspected.
dataset = testset.to_evaluation_dataset()
print("Query:", dataset[0].user_input)
print("Reference:", dataset[0].reference)

# Optional export of the testset to a timestamped Excel file (disabled).
# os.makedirs("outputs", exist_ok=True)
# df.to_excel(
#     f"outputs/medical_insurance_testset-{datetime.now().strftime('%Y%m%d%H%M%S')}.xlsx"
# )

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working) · module-testsetgen (Module testset generation) · question (Further information is requested)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions