Split the `$` / `$$` placement validation phase into one LLM pass per question to increase accuracy

HarrySu123 · HarrySu123 · commit e355cf179b3b · 2025-07-11T10:00:05.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb
@@ -318,7 +318,7 @@
     "        4. Look through the entire markdown:\n",
     "            - Do not neglect any images, figures, or other media mentioned in the question, do not alter or neglect the alt text and the image URL.\n",
     "            - Leave the Image links and alt text within the question/solution, but also make a copy and place it into the `images` field.\n",
-    "            - Identify full Questions, place it into question_content\n",
+    "            - Identify full Questions, place it into question_content, be careful not to include the solution in the question.\n",
     "            - Identify the full Worked Solution for each full Question.\n",
     "            - If the Worked Solution is not found, try to find the Answers associated with it instead.\n",
     "            - If Worked Solution or Answers are found, place it into the solution_content. Otherwise leave as empty string, \"\".\n",
@@ -404,7 +404,7 @@
     "            - Identify the stem and parts of the question, the parts may be obvious to find, like \"a)...\", \"b)...\", etc., or they could be implied by the question itself. All question must have at least one part, if there is only one part. :\n",
     "                1. The stem should be placed into the \"content\" field. Text in this field should be valid in the Milkdown editor. \n",
     "                2. the parts of the question (subquestions) should be placed into the \"parts\" field. Text in this field should be valid under Lexdown.\n",
-    "                3. for each part, identify the worked solution/answer and place it into the \"parts_solutions\" field, if not found, leave as empty string, \"\". Text in this field should be valid under Lexdown.\n",
+    "                3. for each part, carefully identify the worked solution/answer associated with it and place it into the \"parts_solutions\" field, if not found, leave as empty string, \"\". Text in this field should be valid under Lexdown.\n",
     "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
     "        6. The Text inside the JSON should be in Lexdown:\n",
     "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
@@ -442,14 +442,12 @@
     "    \"\"\"\n",
     "    # Initialize the output parser with the Tutorial schema.\n",
     "    parser = PydanticOutputParser(pydantic_object=Set_Question)\n",
-    "\n",
-    "\n",
     "    \n",
     "    questions_in_parts = []\n",
-    "    for question in questions_dict[\"questions\"]:\n",
+    "    for question_idx, question in enumerate(questions_dict[\"questions\"]):\n",
     "        passed = False\n",
     "\n",
-    "        for idx in range(3):\n",
+    "        for attempt_idx in range(3):\n",
     "\n",
     "            # Construct the prompt, appending the parser's format instructions.\n",
     "            prompt = f\"\"\"\n",
@@ -458,7 +456,7 @@
     "\n",
     "                {llm_task_seperate_parts}\n",
     "\n",
-    "                Input JSON:\n",
+    "                Input Dictionary:\n",
     "                ```JSON\n",
     "                {question}\n",
     "                ```\n",
@@ -476,17 +474,17 @@
     "            try:\n",
     "                # Parse the response using the output parser.\n",
     "                parsed_output = parser.parse(response.content)\n",
-    "                print(f\"LLM response successfully parsed question {idx}.\")\n",
+    "                print(f\"LLM response successfully parsed question {question_idx}.\")\n",
     "                # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "                passed = True\n",
     "                break\n",
     "            except Exception as e:\n",
-    "                print(\"Error parsing LLM response as JSON:\")\n",
-    "                print(\"Retrying...\")\n",
+    "                print(f\"Error parsing LLM response as JSON for question {question_idx}:\")\n",
+    "                print(f\"Retrying... Attempt No.{attempt_idx + 1}\")\n",
     "                time.sleep(2)\n",
     "\n",
     "        if not passed:\n",
-    "            raise Exception(\"Failed to parse LLM response as JSON after multiple attempts.\")\n",
+    "            raise Exception(f\"Failed to parse LLM response as JSON after multiple attempts for question {question_idx}.\")\n",
     "        \n",
     "        questions_in_parts.append(parsed_output)\n",
     "    \n",
@@ -515,7 +513,7 @@
    "outputs": [],
    "source": [
     "llm_task_expression_check = r\"\"\"\n",
-    "    Look inside the JSON object's `questions`, specifically the `content`, `parts`, and `parts_solutions` fields. Ensure that the JSON content follows these rules:\n",
+    "    Look inside the structure, specifically the `content`, `parts`, and `parts_solutions` fields. Ensure that the JSON content follows these rules:\n",
     "        1. No extra escaping: The JSON string must contain no literal `\\\\n`, `\\\\\\\\`, or unnecessary escape sequences unless they are explicitly present in the original input text.\n",
     "        2. Careful to make the distinction between inline and display math, i.e. do not mess up the use of `$` and `$$`.\n",
     "        3. Math delimiters: All mathematical expressions must be fully enclosed within math delimiters — use `$...$` for inline math, and `$$...$$` for display math.\n",
@@ -555,41 +553,57 @@
     "        dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
     "              If parsing fails, returns None.\n",
     "    \"\"\"\n",
-    "    json_string = json.dumps(validated_dict, indent=2)\n",
-    "    \n",
-    "    # prompt to let llm validate the JSON.\n",
-    "    validation_prompt = f\"\"\"\n",
-    "    {llm_task_expression_check}\n",
-    "\n",
-    "    Input JSON:\n",
-    "    ```json\n",
-    "    {json_string}\n",
-    "    ```\n",
-    "    return the JSON with the content fixed if needed.\n",
-    "    \"\"\"\n",
+    "    parser = PydanticOutputParser(pydantic_object=Set_Question)\n",
     "\n",
-    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
+    "    questions_in_parts = []\n",
+    "    for question_idx, question in enumerate(validated_dict[\"questions\"]):\n",
+    "        passed = False\n",
     "    \n",
-    "    # loop 3 times to ensure robustness.\n",
-    "    for i in range(3):\n",
-    "        \n",
-    "        # Call the LLM\n",
-    "        response = llm.invoke(validation_prompt)\n",
+    "        # loop 3 times to ensure robustness.\n",
+    "        for attempt_idx in range(3):\n",
+    "            # prompt to let llm validate the JSON.\n",
+    "            validation_prompt = f\"\"\"\n",
+    "                Your task is to extract a JSON with the following structure exactly:\n",
+    "                {parser.get_format_instructions()}\n",
     "\n",
-    "        # Debug: print the raw LLM response\n",
-    "        # print(\"Raw LLM Response:\")\n",
-    "        # print(response)\n",
+    "                {llm_task_expression_check}\n",
     "\n",
-    "        try:\n",
-    "            # Parse the response using the output parser.\n",
-    "            parsed_output = parser.parse(response.content)\n",
-    "            print(\"LLM response successfully parsed as JSON with valid $$.\")\n",
-    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
-    "            return parsed_output.model_dump()\n",
-    "        except Exception as e:\n",
-    "            print(\"Error parsing textdown LLM response as JSON:\")\n",
-    "            print(\"Retrying...\")\n",
-    "            time.sleep(2)"
+    "                Input Dictionary:\n",
+    "                ```json\n",
+    "                {question}\n",
+    "                ```\n",
+    "                return the JSON with the content fixed if needed.\n",
+    "                \"\"\"\n",
+    "\n",
+    "            # Call the LLM\n",
+    "            response = llm.invoke(validation_prompt)\n",
+    "\n",
+    "            # Debug: print the raw LLM response\n",
+    "            # print(\"Raw LLM Response:\")\n",
+    "            # print(response)\n",
+    "\n",
+    "            try:\n",
+    "                # Parse the response using the output parser.\n",
+    "                parsed_output = parser.parse(response.content)\n",
+    "                print(f\"LLM response successfully parsed as JSON with valid $$ for question {question_idx}.\")\n",
+    "                # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
+    "                passed = True\n",
+    "                break\n",
+    "                return parsed_output.model_dump()\n",
+    "            except Exception as e:\n",
+    "                print(f\"Error parsing textdown LLM response as JSON for question {question_idx}:\")\n",
+    "                print(\"Retrying... Attempt No.\", attempt_idx + 1)\n",
+    "                time.sleep(2)\n",
+    "        \n",
+    "        if not passed:\n",
+    "            raise Exception(f\"Failed to parse LLM response as JSON after multiple attempts for question {question_idx}.\")\n",
+    "        \n",
+    "        questions_in_parts.append(parsed_output)\n",
+    "    return  Set(\n",
+    "        name=validated_dict[\"name\"],\n",
+    "        year=validated_dict[\"year\"],\n",
+    "        questions=questions_in_parts\n",
+    "    ).model_dump()"
    ]
   },
   {