update datascience assistant logic to achieve better results

modelscope · zzhangpurdue · Aug 6, 2024 · Jul 4, 2024 · Jul 5, 2024 · Jul 12, 2024
commit 6c418a323c7247c7bd79125166c80f21005992db
diff --git a/modelscope_agent/agents/data_science_assistant.py b/modelscope_agent/agents/data_science_assistant.py
@@ -44,62 +44,74 @@
 ```
 """
 CODE_TEMPLATE = """
-you are a code fixer, you need to fix python a code block in jupyter notebook to achieve the\
+you are a code generator, you need to generate a python code block in jupyter notebook to achieve the \
 current task:
 {instruction}
 
 current task is part of the whole plan to achieve the user request:
 {user_request}
 
-the code format is as follows:
-```python
-# the code you need to write
-```
-previous code are as follows, you need to generate python code that follows previous code, no need to repeat previous \
+previous code blocks are as follows, you need to generate python code \
+that follows previous code, no need to repeat previous
 code:
-{previous_code}
-Attention: the code format MUST be followed, otherwise the code interpreter will not be able to parse\
+{previous_code_blocks}
+
+Attention: the code format MUST be followed, otherwise the code interpreter will not be able to parse \
 the code correctly,the code format is as follows:
 ```python
 # the code you need to write
 ```
 """
 CODE_REFLECT_TEMPLATE = """
+you are a code generator, you need to generate a new code block in a new jupyter notebook to achieve the \
+current task. The code should be generated based on the previous code block and the current task instruction.
+we have generated some code for current task but didn't execute successfully, you can use these code \
+as a reference to generate the correct code.
 
-you are a code fixer, you need to fix python a code block in jupyter notebook to achieve the\
-current task:
+[current task]
 {instruction}
 
 current task is part of the whole plan to achieve the user request:
+[User Request]
 {user_request}
 
-the code format is as follows:
+previous code blocks are as follows and have been executed successfully in the previous jupyter notebook code blocks, \
+which means you can use the variables defined in the previous code blocks. \
+the code you need to generate should follow previous code, no need to repeat.
+[previous code blocks]
+{previous_code_blocks}
+
+the code we have generated for current task is as follows, you can use it as a reference to generate the correct code:
+{code_and_error}
+
+Attention: the code format MUST be followed, otherwise the code interpreter will not be able to parse the code \
+correctly, the code format is as follows:
 ```python
 # the code you need to write
 ```
+"""
+JUDGE_TEMPLATE = """
+take a deep breath and think step by step.
+you are a code judge, you need to judge the code block in jupyter notebook to achieve the \
+current task.
+[current task]
+{instruction}
 
-the code you need to fix is as follows:
-```python
+this is the code block you need to judge, it contains code and execution result:
 {code}
-```
 
-but the code is not correct, and caused the following error:
-{error}
-please correct the code and try again
+Even if the code has been executed successfully, doesn't mean it's totally correct. You need to carefully \
+check the code logic to ensure the code can accomplish the task correctly. Ignore the warning messages.
 
-previous code are as follows and have been executed successfully in the previous jupyter notebook code blocks, \
-which means you can use the variables defined in the previous code blocks.
-the code you need to fix should follow previous code, no need to repeat
-previous code:
-{previous_code}
+these are the previous code blocks, which have been executed successfully in the previous jupyter notebook code blocks \
+{previous_code_blocks}
 
-Attention: the code format MUST be followed, otherwise the code interpreter will not be able to parse the code \
-correctly,the code format is as follows:
-```python
-# the code you need to write
-```
+Attention: your response should be one of the following:
+- correct, [reason]
+- incorrect, [reason and advice]
 """
-ERROR_KEYWORDS = ['Error', 'error', 'ERROR', 'Traceback', 'exception']
+# substrings whose presence in the interpreter response marks a failed run
+ERROR_KEYWORDS = ['Error']
 
 
 class DataScienceAssistant(RolePlay):
@@ -122,6 +134,7 @@ def __init__(self,
             instruction=instruction,
             **kwargs)
         self.code_interpreter = CodeInterpreter()
+        self.plan = None
 
     def _update_plan(self, user_request: str, curr_plan: Plan = None) -> Plan:
         resp = self._call_llm(
@@ -159,80 +172,214 @@ def _update_plan(self, user_request: str, curr_plan: Plan = None) -> Plan:
             return curr_plan
 
     @staticmethod
-    def _save(nb: nbformat.NotebookNode):
+    def _save(nb: nbformat.NotebookNode, plan: Plan, **kwargs):
+        """Write the notebook and the plan's tasks into a folder under data/.
+
+        Optional kwargs: ``dir_name`` names the folder (a timestamp is used
+        when it is missing or empty) and ``name`` is the notebook file stem
+        (default 'result'). The tasks are written next to it as plan.json.
+        """
         if not os.path.exists('data'):
             os.makedirs('data')
-        file_name = 'data/' + str(
-            datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + '.ipynb'
+        # honour a caller-supplied dir_name; fall back to a timestamp only
+        # when none was given
+        dir_name = 'data/' + (
+            kwargs.get('dir_name')
+            or datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + '/'
+        if not os.path.exists(dir_name):
+            os.makedirs(dir_name)
+        file_name = dir_name + (kwargs.get('name') or 'result') + '.ipynb'
+        # save ipynb
         with open(file_name, 'w', encoding='utf-8') as file:
             nbformat.write(nb, file)
+        # save plan.tasks
+        # convert the Task list into a list of dicts so it can be serialized
+        tasks_dict_list = [task.model_dump() for task in plan.tasks]
+
+        # convert the list of dicts into a JSON string and write it to a file
+        tasks_json = json.dumps(tasks_dict_list, indent=4)
+        with open(dir_name + 'plan.json', 'w', encoding='utf-8') as file:
+            file.write(tasks_json)
+
+    def _generate_code(self, code_counter: int, task: Task,
+                       previous_code_blocks: str, user_request: str,
+                       code_and_error: str):
+        """Ask the LLM for the code of ``task`` and return the parsed block.
+
+        Args:
+            code_counter: number of attempts already made for this task; 0
+                selects CODE_TEMPLATE, anything else CODE_REFLECT_TEMPLATE.
+            task: task whose ``instruction`` is put into the prompt.
+            previous_code_blocks: text describing earlier code blocks.
+            user_request: the overall request the plan serves.
+            code_and_error: earlier failed attempts and their errors; only
+                used by the reflect prompt.
+
+        Returns:
+            Whatever ``parse_code`` extracts from the LLM answer for
+            ``lang='python'``.
+        """
+        prompt_args = dict(
+            instruction=task.instruction,
+            previous_code_blocks=previous_code_blocks,
+            user_request=user_request)
+        if code_counter == 0:
+            # first attempt: plain generation prompt
+            prompt = CODE_TEMPLATE.format(**prompt_args)
+        else:
+            # retry: show the LLM the failed attempts and their errors
+            prompt = CODE_REFLECT_TEMPLATE.format(
+                code_and_error=code_and_error, **prompt_args)
+        logger.info('\n---------\ngenerate code prompt: \n' + prompt
+                    + '\n--------\n')
+        llm_chunks = self._call_llm(
+            prompt=None,
+            messages=[{'role': 'user', 'content': prompt}],
+            stop=None)
+        return parse_code(text=''.join(llm_chunks), lang='python')
+
+    def _get_previous_code_blocks(self):
+        """Render the code and output of every finished task as prompt text.
+
+        Walks ``self.plan.tasks`` in order and, for each task whose
+        ``is_finished`` flag is set, appends a numbered fenced ``python``
+        block holding ``task.code`` followed by ``task.result``.
+
+        Returns:
+            str: the concatenated text, or '' when no task is finished.
+        """
+        finished_tasks = [
+            task for task in self.plan.tasks if task.is_finished
+        ]
+        rendered = []
+        for index, task in enumerate(finished_tasks, start=1):
+            # the newline after the fence keeps the first code line from
+            # being glued onto the ```python marker
+            rendered.append(
+                f'\nCodeblock_{index}:\n```python\n{task.code}\n```\n'
+                f'Codeblock_{index} Output:\n{task.result}\n')
+        return ''.join(rendered)
+
+    def _judge_code(self, task, cell, previous_code_blocks):
+        """Ask the LLM to review an executed cell; return (success, reason)."""
+        judge_prompt = JUDGE_TEMPLATE.format(
+            instruction=task.instruction,
+            previous_code_blocks=previous_code_blocks,
+            code=str(cell))
+        logger.info(f'\n---------\njudge_prompt: \n{judge_prompt}\n--------\n')
+        judge_resp = self._call_llm(
+            prompt=None,
+            messages=[{'role': 'user', 'content': judge_prompt}],
+            stop=None)
+        judge_result = ''.join(judge_resp)
+        logger.info(
+            f'\n---------\n judge_result: \n{judge_result}\n--------\n')
+        # LLMs often capitalise the verdict ("Incorrect, ..."), so compare
+        # case-insensitively instead of looking for the lowercase word only.
+        if 'incorrect' in judge_result.lower():
+            return False, ('The code logic is incorrect, here is the reason: '
+                           + judge_result)
+        return True, ''
 
     def _run(self, user_request, save: bool = True, **kwargs):
+        """Plan the request, then generate, run and judge code per task.
+
+        With ``save`` set, the notebook is rewritten to
+        ``data/<timestamp>/result.ipynb`` after every finished task;
+        ``kwargs['max_try']`` caps the attempts per task (default 10).
+        """
         try:
-            plan = self._update_plan(user_request=user_request)
-            logger.info(f'plan: {plan}')
-
-            while plan.current_task_id:
-                task = plan.task_map.get(plan.current_task_id)
-                logger.info(f'task: {task}')
-                previous_code = ''
-                for cell in self.code_interpreter.nb.cells:
-                    error = False
-                    if cell.cell_type == 'code':
-                        for out_put in cell.outputs:
-                            if out_put.output_type == 'error':
-                                error = True
-                        if not error:
-                            previous_code += cell.source
-                success = False
-                counter = 0
-                resp = ''
-                code = ''
-
-                while not success and counter < 5:
-
-                    if counter == 0:
-                        # first time to generate code
-                        prompt = CODE_TEMPLATE.format(
-                            instruction=task.instruction,
-                            previous_code=previous_code,
-                            user_request=user_request)
-                    else:
-                        # reflect the error and ask user to fix the code
-                        prompt = CODE_REFLECT_TEMPLATE.format(
-                            instruction=task.instruction,
-                            previous_code=previous_code,
-                            code=code,
-                            user_request=user_request,
-                            error=resp[:10000])
-                    resp = self._call_llm(
-                        prompt=prompt,
-                        messages=None,
-                        stop=None,
-                    )
-
-                    code = ''
-                    for chunk in resp:
-                        code += chunk
-                    code = parse_code(text=code, lang='python')
+            self.plan = self._update_plan(user_request=user_request)
+            max_try = kwargs.get('max_try', 10)
+            jupyter_file_path = ''
+            if save:
+                dir_name = 'data/' + str(
+                    datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + '/'
+                if not os.path.exists(dir_name):
+                    os.makedirs(dir_name)
+                jupyter_file_path = dir_name + 'result' + '.ipynb'
+
+            while self.plan.current_task_id:
+                task = self.plan.task_map.get(self.plan.current_task_id)
+
+                logger.info(
+                    f'new task starts: task_{task.task_id} , instruction: {task.instruction}'
+                )
+                previous_code_blocks = self._get_previous_code_blocks()
+                code_execute_success = False
+                code_counter = 0
+                code_and_error = ''
+                while not code_execute_success and code_counter < max_try:
+                    # generate code
+                    code = self._generate_code(code_counter, task,
+                                               previous_code_blocks,
+                                               user_request, code_and_error)
+
+                    # execute code
+                    failed_reason = ''
+                    code_interpreter_resp = ''
+                    # snapshot the cell count so a failed attempt can be rolled back
+                    cells_before = len(self.code_interpreter.nb.cells)
                     try:
                         # call code interpreter to execute the code
-                        resp = self.code_interpreter.call(
+                        code_interpreter_resp = self.code_interpreter.call(
                             params=json.dumps({'code': code}), nb_mode=True)
-                        success = not any(keyword in resp
-                                          for keyword in ERROR_KEYWORDS)
+                        logger.info(
+                            f'code_interpreter_resp task_{task.task_id} '
+                            f'counter {code_counter}: \n{code_interpreter_resp}'
+                        )
+                        code_execute_success = not any(
+                            keyword in code_interpreter_resp
+                            for keyword in ERROR_KEYWORDS)
+                        if not code_execute_success:
+                            failed_reason = (
+                                'The code execution caused error: \n'
+                                + code_interpreter_resp[:5000])
+
                     except Exception as e:
-                        success = False
-                        resp = 'Error: ' + str(e)
+                        code_execute_success = False
+                        logger.info(
+                            f'task_{task.task_id} code execution failed, counter: {code_counter}'
+                        )
+                        failed_reason = 'The code execution caused error: \n' + str(
+                            e)[:5000]
+
+                    if code_execute_success:
+                        logger.info(
+                            f'task_{task.task_id} code successfully executed , counter: {code_counter}'
+                        )
+                        code_execute_success, failed_reason = self._judge_code(
+                            task=task,
+                            previous_code_blocks=previous_code_blocks,
+                            cell=self.code_interpreter.nb.cells[-1])
+                    if code_execute_success:
+                        task.code = code
+                        task.result = code_interpreter_resp
+                    else:
+                        # keep the failed attempt as reference for the retry
+                        attempt = 'code_' + str(code_counter + 1)
+                        code_and_error += (
+                            attempt + ':\n```python\n' + code + '\n```\n'
+                            + attempt + ' error message: \n' + failed_reason
+                            + '\n')
+                        # drop only the cells this attempt added; a call that
+                        # raised before adding one must not cost an earlier cell
+                        del self.code_interpreter.nb.cells[cells_before:]
+                    code_counter += 1
 
-                    counter += 1
-                if success:
-                    plan.finish_current_task()
+                if code_execute_success:
+                    self.plan.finish_current_task()
+                    if save:
+                        # save the successful code in jupyter notebook
+                        with open(
+                                jupyter_file_path, 'w',
+                                encoding='utf-8') as file:
+                            nbformat.write(self.code_interpreter.nb, file)
                 else:
-                    plan = self._update_plan(
-                        user_request=user_request, curr_plan=plan)
-            if save:
-                self._save(self.code_interpreter.nb)
+                    self.plan = self._update_plan(
+                        user_request=user_request, curr_plan=self.plan)
+                    self.code_interpreter.nb.cells.clear()
+
         except Exception as e:
             logger.error(f'error: {e}')
             raise e
diff --git a/modelscope_agent/tools/code_interpreter/code_interpreter.py b/modelscope_agent/tools/code_interpreter/code_interpreter.py
@@ -86,7 +86,7 @@ def __init__(self, cfg={}):
         self.nb = nbformat.v4.new_notebook()  # noqa E501
         self.nb_client = NotebookClient(self.nb, timeout=600)
         self.console = Console()
-        self.interaction: str
+        self.interaction = 'ipython'
         # timeout: int = 600
         self.kc = kc