fix

the-seeds · Dec 7, 2024 · f252a47 · f252a47
1 parent aa3e078
commit f252a47
Show file tree

Hide file tree

Showing 10 changed files with 19,217 additions and 36 deletions.
diff --git a/src/example/dataset/dateset.json → src/example/dataset/dataset.json b/src/example/dataset/dateset.json → src/example/dataset/dataset.json
diff --git a/src/example/result/json_file_demo_QA.json b/src/example/result/json_file_demo_QA.json
diff --git a/src/example/result/multi_file_demo_QA.json b/src/example/result/multi_file_demo_QA.json
diff --git a/src/example/result/single_file_demo_QA.json b/src/example/result/single_file_demo_QA.json
diff --git a/src/postprocess/verify/__init__.py b/src/postprocess/verify/__init__.py
diff --git a/src/postprocess/verify/verify.py b/src/postprocess/verify/verify.py
diff --git a/src/strategy/genQA.py b/src/strategy/genQA.py
@@ -71,23 +71,25 @@ async def verifyQA(self,text,main_theme,questions,answers,concurrent_api_request
             for q,a in zip(batch_questions,batch_answers):
                 prompt = buildMessages(
                         SystemMessage(
-                            "你的任务是根据提供的文本，判断给定的问答是否“有效”。\n"
-                            "有效的条件包括：\n"
-                            f"1. 问题合理：问题在逻辑上清晰，不混乱，符合人类习惯，并且与文本主题 {main_theme} 相关。\n" 
-                            "2. 答案正确：答案可由文本信息支持，与问题所问内容相符，不包含'根据文本内容'等字眼。\n"
-                            "若符合上述两个条件，则回答“有效”；如果任一条件不满足，则回答“无效”。\n"
-                            "只输出'有效'或'无效'，不得输出其他信息。"
+                            "你需要根据提供的文本，判断给定的问答是否“有效”。\n"
+                            "有效的问答的标准是：\n"
+                            f"问题合理。问题与文本主题 {main_theme} 相关，逻辑清晰，不混乱，符合人类习惯。问题中不包含回答等无关信息。\n" 
+                            "问题完整，不含有省略、不完整、或出现意外截断情况的问题。\n"
+                            "回答正确，完整。答案可由文本信息支持，与问题所问内容相符，不包含'根据文本内容'等字眼。回答逻辑清晰，不包含无关信息。\n"
+                            "若符合以上所有标准，则回答“有效”；如果任一条件不满足，则回答“无效”。\n"
+                            "请先简述你判断的原因，然后在最后一行输出'有效'或'无效'，不得输出其他信息。"
                         ),
                         UserMessage(
                             f"{text}\n\n请根据上面的文本判断以下问答是否有效：\n"
                             f"问题: {q}\n"
                             f"答案: {a}\n"
-                            "只输出'有效'或'无效'。"
+                            "请先简述你判断的原因，然后在最后一行输出'有效'或'无效'，不得输出其他信息。"
                         )
                     )
                 prompts.append(prompt)
             replies = await self.api.async_chat(prompts)
-            bin = [0 if "无效" in r else 1 for r in replies]
+            bin = [0 if "无效" in r.split()[-1] + r.split()[0] else 1 for r in replies]
+            delete_num = len([idx for idx in bin if idx == 0])
             verified_Q = [q for idx,q in enumerate(batch_questions) if bin[idx] == 1]
             verified_A = [a for idx,a in enumerate(batch_answers) if bin[idx] == 1]
             new_questions.extend(verified_Q)
@@ -183,18 +185,43 @@ async def generateQuestions(self, text, main_theme, num_question_per_title,title
                             f"你的问题需要满足以下要求：\n"
                             f"1. 问题必须包括完整的信息以避免模糊，例如：具体的人物、名称、事件、时间等。\n"
                             f"2. 问题应当客观、具体，避免模糊不清。"
-                            f"3. 问题需基于客观事实，不得包含主观感受、预测或想象。\n"
-                            f"4. 每个问题必须指向客观事实而非主观感受或预测，想象。问题应当能在文本中找到答案。\n"
+                            f"3. 问题需基于客观事实，不得包含主观感受、预测或想象。问题应当能在文本中找到答案\n"
+                            f"4. 确保问题内容不重复，包含不同类型的问题并且覆盖文本的不同部分或不同维度。\n"
+                            f"每个问题以'问题'加数字加'::'开头，且问题内容不能重复。"
+
                         ),
                         UserMessage(
-                            f"文本:{text}\n每个问题一行，以数字加'.'开始，不能重复。"
+                            f"文本:{text}\n每个问题以'问题'加数字加'::'开头，且问题内容不能重复。请提问。"
                         ),                        
                 )
                 prompts.append(prompt)
             genQuestions = await self.api.async_chat(prompts)
-            logger.debug(f"{'-' * 20}Questions of {batch_titles[i]}{'-'*20}")
+            logger.debug(f"{'-' * 20}Questions of {batch_titles}{'-'*20}")
             logger.debug(genQuestions)
-            genQuestions = clean_and_split_reply_list(genQuestions)
+            genQuestions = clean_and_split_question_list(genQuestions)
+            questions.extend(genQuestions)
+            prompts = []
+            for i in range(len(batch_titles)):
+                prompt = buildMessages(
+                        SystemMessage(
+                            f"你对{main_theme}相关内容十分感兴趣。请您根据以下文本内容，围绕'{main_theme}'提出{num_question_per_title}个清晰、客观的问题，"
+                            f"这些问题中不能含'{batch_titles[i]}'，但能够从文本中与'{batch_titles[i]}'相关的信息回答\n"
+                            f"你的问题需要满足以下要求：\n"
+                            f"1. 问题必须包括完整的信息以避免模糊，例如：具体的人物、名称、事件、时间等。\n"
+                            f"2. 问题应当客观、具体，避免模糊不清。"
+                            f"3. 问题需基于客观事实，不得包含主观感受、预测或想象。问题应当能在文本中找到答案\n"
+                            f"4. 确保问题内容不重复，包含不同类型的问题并且覆盖文本的不同部分或不同维度。\n"
+                            f"每个问题以'问题'加数字加'::'开头，且问题内容不能重复。"
+                        ),
+                        UserMessage(
+                            f"文本:{text}\n每个问题以'问题'加数字加'::'开头，且问题内容不能重复。请提问。"
+                        ),                        
+                )
+                prompts.append(prompt)
+            genQuestions = await self.api.async_chat(prompts)
+            logger.debug(f"{'-' * 20}Questions of 2 {batch_titles}{'-'*20}")
+            logger.debug(genQuestions)
+            genQuestions = clean_and_split_question_list(genQuestions)
             questions.extend(genQuestions)
         return questions
 
@@ -206,7 +233,7 @@ async def getAnswers(self, text,questions,concurrent_api_requests_num=1,main_the
             prompts=  []
             for i in range(len(batch_questions)):
                 prompt = buildMessages(
-                        SystemMessage(f"{text}\n你是一位AI助手，请礼貌地回复以下问题。对于无法回答的问题，请回复'无法回答'"),
+                        SystemMessage(f"{text}\n你是一位AI助手。请从热情、耐心、专业、简洁中选择一个回答风格并礼貌地回答以下问题。对于无法回答的问题，请回复'无法回答'"),
                         UserMessage(
                             f"{batch_questions[i]}"
                         )

diff --git a/src/strategy/genQA_persona.py b/src/strategy/genQA_persona.py
@@ -73,29 +73,32 @@ async def verifyQA(self,text,main_theme,questions,answers,concurrent_api_request
             for q,a in zip(batch_questions,batch_answers):
                 prompt = buildMessages(
                         SystemMessage(
-                            "你的任务是根据提供的文本，判断给定的问答是否“有效”。\n"
-                            "有效的条件包括：\n"
-                            f"1. 问题合理：问题在逻辑上清晰，不混乱，符合人类习惯，与文本主题 {main_theme} 相关。\n" 
-                            "2. 答案正确：答案可由文本信息支持，与问题所问内容相符，不包含'根据文本内容'等字眼。\n"
-                            "若符合上述两个条件，则回答“有效”；如果任一条件不满足，则回答“无效”。\n"
-                            "只输出'有效'或'无效'，不得输出其他信息。"
+                            "你需要根据提供的文本，判断给定的问答是否“有效”。\n"
+                            "有效的问答的标准是：\n"
+                            f"问题合理。问题与文本主题 {main_theme} 相关，逻辑清晰，不混乱，符合人类习惯。问题中不包含回答等无关信息。\n" 
+                            "问题完整，不含有省略、不完整、或出现意外截断情况的问题。\n"
+                            "回答正确，完整。答案可由文本信息支持，与问题所问内容相符，不包含'根据文本内容'等字眼。回答逻辑清晰，不包含无关信息。\n"
+                            "若符合以上所有标准，则回答“有效”；如果任一条件不满足，则回答“无效”。\n"
+                            "请先简述你判断的原因，然后在最后一行输出'有效'或'无效'，不得输出其他信息。"
                         ),
                         UserMessage(
                             f"{text}\n\n请根据上面的文本判断以下问答是否有效：\n"
                             f"问题: {q}\n"
                             f"答案: {a}\n"
-                            "只输出'有效'或'无效'。"
+                            "请先简述你判断的原因，然后在最后一行输出'有效'或'无效'，不得输出其他信息。"
                         )
                     )
                 prompts.append(prompt)
             replies = await self.api.async_chat(prompts)
-            bin = [0 if "无效" in r else 1 for r in replies]
+            bin = [0 if "无效" in r.split()[-1] + r.split()[0] else 1 for r in replies]
+            delete_num = len([idx for idx in bin if idx == 0])
             verified_Q = [q for idx,q in enumerate(batch_questions) if bin[idx] == 1]
             verified_A = [a for idx,a in enumerate(batch_answers) if bin[idx] == 1]
             new_questions.extend(verified_Q)
             new_answers.extend(verified_A)
         return new_questions,new_answers
 
+
     async def run(self, config, num_question_per_title=5, concurrent_api_requests_num=1):
         init_QA_dataset(config.save_dir,config.save_file_name)
         all_questions = []
@@ -160,34 +163,71 @@ async def splitTitles(self,titles,concurrent_api_requests_num=1):
 
     async def getPersona(self, text):
         prompt = buildMessages(
+            SystemMessage(
+                f"你是一个擅长生成人物描述的助手。"
+                "以下是一个文本，你需要根据文本内容生成10个可能对该文本相关内容感兴趣的大致人物描述。\n"
+                "生成的人物应具有普遍性、对文本大部分内容感兴趣。每个人物应有独特的兴趣或特点，不能包含人名，每个人物一行。\n"
+            ),
             UserMessage(
-                f"根据以下文本：\n{text}生成10个可能对该文本相关内容感兴趣的大致人物描述，不能包含人名，每个人物一行"
+                f"{text}"
             )
         )
         personas = await self.api.async_chat([prompt])
         personas = clean_and_split_reply_list(personas)
         return personas
 
-    async def generateQuestions(self, text, main_theme, num_question_per_title,titles,personas,concurrent_api_requests_num=1,additional_info=""):
+    async def generateQuestions(self, text, main_theme, num_question_per_title,titles,concurrent_api_requests_num=1,personas = "",additional_info=""):
         questions = []
         for idx in tqdm(range(0,len(titles),concurrent_api_requests_num),desc='Generating questions'):
             batch_titles = titles[idx:idx+concurrent_api_requests_num]
             prompts = []
             for i in range(len(batch_titles)):
-                persona = {random.choice(personas).strip()}
+                persona = random.choice(personas)
+                prompt = buildMessages(
+                        SystemMessage(
+                            f"你是'{persona}。'你对{main_theme}相关内容十分感兴趣。请您根据以下文本内容，围绕'{main_theme}'提出{num_question_per_title}个清晰、客观的问题，"
+                            f"这些问题必须紧密围绕'{batch_titles[i]}'，且可以在文本中找到明确的答案。\n"
+                            f"你的问题需要满足以下要求：\n"
+                            f"1. 问题必须包括完整的信息以避免模糊，例如：具体的人物、名称、事件、时间等。\n"
+                            f"2. 问题应当客观、具体，避免模糊不清。"
+                            f"3. 问题需基于客观事实，不得包含主观感受、预测或想象。问题应当能在文本中找到答案\n"
+                            f"4. 确保问题内容不重复，包含不同类型的问题并且覆盖文本的不同部分或不同维度。\n"
+                            f"每个问题以'问题'加数字加'::'开头，且问题内容不能重复。"
+
+                        ),
+                        UserMessage(
+                            f"文本:{text}\n每个问题以'问题'加数字加'::'开头，且问题内容不能重复。请提问。"
+                        ),                        
+                )
+                prompts.append(prompt)
+            genQuestions = await self.api.async_chat(prompts)
+            logger.debug(f"{'-' * 20}Questions of {batch_titles}{'-'*20}")
+            logger.debug(genQuestions)
+            genQuestions = clean_and_split_question_list(genQuestions)
+            questions.extend(genQuestions)
+            prompts = []
+            for i in range(len(batch_titles)):
+                persona = random.choice(personas)
                 prompt = buildMessages(
-                        SystemMessage(f"你是'{persona}'。请根据以下文本内容指向'{main_theme}'提出{num_question_per_title}个您感兴趣的，在不同场景下与“{batch_titles[i]}”有关的问题。问题必须指向'{batch_titles[i]}'且可以在文本中找到答案。您的问题需包含完整名称，事件等完整信息以避免模糊，严禁使用简称。"),
+                        SystemMessage(
+                            f"你是'{persona}'。你对{main_theme}相关内容十分感兴趣。请您根据以下文本内容，围绕'{main_theme}'提出{num_question_per_title}个清晰、客观的问题，"
+                            f"这些问题中不能含'{batch_titles[i]}'，但能够从文本中与'{batch_titles[i]}'相关的信息回答\n"
+                            f"你的问题需要满足以下要求：\n"
+                            f"1. 问题必须包括完整的信息以避免模糊，例如：具体的人物、名称、事件、时间等。\n"
+                            f"2. 问题应当客观、具体，避免模糊不清。"
+                            f"3. 问题需基于客观事实，不得包含主观感受、预测或想象。问题应当能在文本中找到答案\n"
+                            f"4. 确保问题内容不重复，包含不同类型的问题并且覆盖文本的不同部分或不同维度。\n"
+                            f"每个问题以'问题'加数字加'::'开头，且问题内容不能重复。"
+                        ),
                         UserMessage(
-                            f"文本:{text}\n每个问题一行，以数字加'. '开始，不能重复。"
-                        ),                 
+                            f"文本:{text}\n每个问题以'问题'加数字加'::'开头，且问题内容不能重复。请提问。"
+                        ),                        
                 )
-                logger.debug(f"{'-'*20}Prompt of {batch_titles[i]}{'-'*20}")
-                logger.debug(prompt)
                 prompts.append(prompt)
             genQuestions = await self.api.async_chat(prompts)
-            logger.debug(f"{'-' * 20}Questions of {batch_titles[i]}{'-'*20}")
+            logger.debug(f"{'-' * 20}Questions of 2 {batch_titles}{'-'*20}")
             logger.debug(genQuestions)
-            genQuestions = clean_and_split_reply_list(genQuestions)
+            genQuestions = clean_and_split_question_list(genQuestions)
             questions.extend(genQuestions)
         return questions
 

diff --git a/src/tools/filter/pattern.py b/src/tools/filter/pattern.py
@@ -1,7 +1,7 @@
 import re
 ABANDONED_PATTERN_IN_QUESTIONS = [
-    r"^[\*\^\.\/\\\" ]*(回答|答案|解答|结果|答)",
-    r"^[\*\^\.\/\\\" ]*([Aa]nswer|[Rr]eply|[Rr]esponse)",
+    r"^\W*(回答|答案|解答|结果|答)",
+    r"^\W*([Aa]nswer|[Rr]eply|[Rr]esponse)",
 ]
 
 ABANDONED_PATTERN_IN_ANSWERS = [

diff --git a/src/tools/tool.py b/src/tools/tool.py
@@ -78,13 +78,31 @@ def clean_and_split_reply(lines:str)-> List[str]:
     lines = [l for l in lines if len(l) > 10]
     return lines
 
-def clean_and_split_reply_list(replys:List[str])-> List[str]:
+def clean_and_split_reply_list(replies:List[str])-> List[str]:
     cleaned =[]
-    for reply in replys: 
+    for reply in replies: 
         reply = clean_and_split_reply(reply)
         cleaned.extend(reply)
     return cleaned
 
+def clean_and_split_question_list(replies:List[str])-> List[str]:
+    cleaned = []
+    pattern = r'问题[\d]+::(.*?)(?=问题[\d]+::|$)'
+    for reply in replies:
+        matches = re.finditer(pattern, reply, re.DOTALL)
+        matches_list = list(matches)
+        questions = []
+        for match in matches_list:
+            logger.error(f"match: {match}") 
+            print("group1:",match.group(1))
+            question = match.group(1).strip()
+            if question:
+                questions.append(question)
+        cleaned.extend(questions)
+
+    return cleaned
+
+
 def clean_and_split_titles(titles:str)-> List[str]:
     titles = titles.split('\n')
     titles = [re.sub(r'^[ :：\.、]?(\d+)[ :：\.、]', '', l) for l in titles]