improved JSON content by letting llm go through it again to correct any syntax mistakes

HarrySu123 · HarrySu123 · commit 55af70e6686e · 2025-07-09T16:06:19.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb
@@ -307,7 +307,26 @@
     "    year: str = Field(..., description=\"Year of the set\")\n",
     "    questions: list[Set_Question] = Field(..., description=\"List of questions in the set\")\n",
     "\n",
-    "def extract_tutorial_questions(doc_page_content: str) -> dict:\n",
+    "llm_task = \"\"\"\n",
+    "Please follow these steps carefully:\n",
+    "    1. You can decide what to call the Set.\n",
+    "    2. Identify the year of the tutorial, if mentioned. Otherwise, use \"0\".\n",
+    "    3. Every character should match the original source exactly unless you're instructed to split content into fields.\n",
+    "    4. Identify the questions in the Input markdown and add them to the \"questions\" list.\n",
+    "    5. for each question:\n",
+    "        - Title is the only field where you are allowed to name it whatever you seem fit for the question.\n",
+    "        - Identify the content of the question, which will be always visible above the individual parts. This field uses the Milkdown editor.\n",
+    "        - Identify the parts of the question (subquestions) and their worked solutions. The parts could be obvious to find, like \"a)...\", \"b)...\", etc., or they could be implied by the question itself.\n",
+    "        - If the worked solution is not given, leave the worked solution empty.\n",
+    "        - Add the parts of the question (subquestions) and their worked solutions to the \"parts\" and \"parts_solutions\" lists, respectively.\n",
+    "    6. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
+    "    7. The Text inside the JSON should be in Lexdown:\n",
+    "        1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
+    "        2. do not remove or collapse blank lines.\n",
+    "        3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "\"\"\"\n",
+    "\n",
+    "def extract_questions(doc_page_content: str, extra_instruction: str = \"\") -> dict:\n",
     "    \"\"\"\n",
     "    Extracts the title and individual questions from a tutorial sheet.\n",
     "\n",
@@ -348,18 +367,9 @@
     "        Your task is to extract a JSON with the following structure exactly:\n",
     "        {parser.get_format_instructions()}\n",
     "\n",
-    "        Please follow these steps carefully:\n",
-    "            1. Infer a very short and concise title describing the entire Input.\n",
-    "            2. Identify the year of the tutorial, if mentioned. Otherwise, use \"0\".\n",
-    "            3. Use the original markdown text exactly as it appears for content, question, parts, and parts_solutions, **preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input**, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
-    "            4. Identify the questions in the Input markdown and add them to the \"questions\" list.\n",
-    "            5. for each question:\n",
-    "                - Infer the title of the question (only the text, no numbering).\n",
-    "                - Identify the content of the question (no exercise title, no subquestions).\n",
-    "                - Identify the parts of the question (subquestions) and their worked solutions. If the worked solution is not given, leave the worked solution empty.\n",
-    "                - Add the parts of the question (subquestions) and their worked solutions to the \"parts\" and \"parts_solutions\" lists, respectively.\n",
-    "            6. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no extra text, comments, or explanations. Use plain newlines (not escaped as `\\n`).\n",
-    "            7. The Text inside the JSON should be in Lexdown, preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas. As it will be parsed by KaTex, it should be valid LaTeX.\n",
+    "        {llm_task}\n",
+    "\n",
+    "        {extra_instruction}\n",
     "\n",
     "        Return the JSON now.\n",
     "        \"\"\"\n",
@@ -371,36 +381,201 @@
     "        response = llm.invoke(prompt)\n",
     "\n",
     "        # Debug: print the raw LLM response\n",
-    "        print(\"Raw LLM Response:\")\n",
-    "        print(response)\n",
+    "        # print(\"Raw LLM Response:\")\n",
+    "        # print(response)\n",
     "\n",
     "        try:\n",
     "            # Parse the response using the output parser.\n",
     "            parsed_output = parser.parse(response.content)\n",
+    "            print(\"LLM response successfully parsed as JSON.\")\n",
     "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "            return parsed_output.model_dump()\n",
     "        except ValidationError as ve:\n",
     "            print(\"❌ Pydantic Validation Error:\")\n",
     "            for error in ve.errors():\n",
     "                print(f\" - {error['loc']}: {error['msg']}\")\n",
     "            print(\"Raw LLM output:\")\n",
-    "            print(response.content)"
+    "            print(response.content)\n",
+    "        except Exception as e:\n",
+    "            print(\"Error parsing LLM response as JSON:\")\n",
+    "            print(\"Retrying...\")\n",
+    "            time.sleep(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16",
+   "metadata": {},
+   "source": [
+    "# LLM evaluation of the content of JSON"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "16",
+   "id": "17",
    "metadata": {},
    "outputs": [],
    "source": [
-    "imported_tutorial = extract_tutorial_questions(md_content)"
+    "def task_rules_obeyed_check(extracted_dict: dict) -> dict:\n",
+    "    \"\"\"\n",
+    "    Asks the LLM to re-check an extracted set against the llm_task rules and return the corrected JSON.\n",
+    "    \n",
+    "    Args:\n",
+    "        extracted_dict (dict): The extracted set as a JSON-serializable dictionary.\n",
+    "        \n",
+    "    Returns:\n",
+    "        dict: The LLM's corrected output parsed into the Set model and dumped to a dictionary.\n",
+    "              Returns None if no response parses within 3 attempts.\n",
+    "    \"\"\"\n",
+    "    json_string = json.dumps(extracted_dict, indent=2)\n",
+    "    \n",
+    "    # prompt to let llm validate the JSON.\n",
+    "    validation_prompt = f\"\"\"\n",
+    "    You were given the following rules to follow:\n",
+    "    {llm_task}\n",
+    "\n",
+    "    please make sure that the content of the JSON followed the rules above and return the new JSON.\n",
+    "    {json_string}\n",
+    "    \"\"\"\n",
+    "\n",
+    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
+    "    # loop 3 times to ensure robustness.\n",
+    "    for i in range(3):\n",
+    "        \n",
+    "        # Call the LLM\n",
+    "        response = llm.invoke(validation_prompt)\n",
+    "\n",
+    "        # Debug: print the raw LLM response\n",
+    "        # print(\"Raw LLM Response:\")\n",
+    "        # print(response)\n",
+    "\n",
+    "        try:\n",
+    "            # Parse the response using the output parser.\n",
+    "            parsed_output = parser.parse(response.content)\n",
+    "            print(\"LLM response successfully parsed as validatedJSON.\")\n",
+    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
+    "            return parsed_output.model_dump()\n",
+    "        except ValidationError as ve:\n",
+    "            print(\"❌ Pydantic Validation Error:\")\n",
+    "            for error in ve.errors():\n",
+    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
+    "            print(\"Raw LLM output:\")\n",
+    "            print(response.content)\n",
+    "        except Exception as e:\n",
+    "            print(\"Error parsing validation LLM response as JSON:\")\n",
+    "            print(\"Retrying...\")\n",
+    "            time.sleep(2)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17",
+   "id": "18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def content_texdown_check(validated_dict: dict) -> dict:\n",
+    "    \"\"\"\n",
+    "    Asks the LLM to check the JSON content against the Texdown formatting rules and fix it if needed.\n",
+    "    \n",
+    "    Args:\n",
+    "        validated_dict (dict): The set dictionary to check (must be JSON-serializable).\n",
+    "        \n",
+    "    Returns:\n",
+    "        dict: The LLM's (possibly fixed) output parsed into the Set model and dumped to a dictionary.\n",
+    "              Returns None if no response parses within 3 attempts.\n",
+    "    \"\"\"\n",
+    "    json_string = json.dumps(validated_dict, indent=2)\n",
+    "    \n",
+    "    # prompt to let llm validate the JSON.\n",
+    "    validation_prompt = f\"\"\"\n",
+    "    Here is a JSON:\n",
+    "    ```json\n",
+    "    {json_string}\n",
+    "    ```\n",
+    "\n",
+    "    look inside questions, content, parts and parts_solutions, ensure that the content of the JSON follows these rules:\n",
+    "        1. Ensure the JSON string contains no literal \"\\n\" or \"\\\\\" characters unless explicitly part of the input text.\n",
+    "        2. All mathematical expressions and formulas must be fully enclosed within matching math delimiters: either inline math `$...$` or display math `$$...$$`.\n",
+    "        3. Verify all `$$` delimiters are properly opened and closed; no unbalanced or partial math blocks allowed.\n",
+    "        4. Verify all `$` delimiters are properly opened and closed; inline math should not span multiple lines.\n",
+    "        5. Preserve all LaTeX formatting, including backslashes and braces, exactly as in the input without adding extra escaping or modifying math commands.\n",
+    "        6. Blank lines inside math blocks must be preserved as-is.\n",
+    "        7. Output only a valid JSON string without any additional escaping or characters.\n",
+    "\n",
+    "\n",
+    "    return the JSON with the content fixed if needed.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
+    "    \n",
+    "    # loop 3 times to ensure robustness.\n",
+    "    for i in range(3):\n",
+    "        \n",
+    "        # Call the LLM\n",
+    "        response = llm.invoke(validation_prompt)\n",
+    "\n",
+    "        # Debug: print the raw LLM response\n",
+    "        # print(\"Raw LLM Response:\")\n",
+    "        # print(response)\n",
+    "\n",
+    "        try:\n",
+    "            # Parse the response using the output parser.\n",
+    "            parsed_output = parser.parse(response.content)\n",
+    "            print(\"LLM response successfully parsed as Texdown JSON.\")\n",
+    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
+    "            return parsed_output.model_dump()\n",
+    "        except ValidationError as ve:\n",
+    "            print(\"❌ Pydantic Validation Error:\")\n",
+    "            for error in ve.errors():\n",
+    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
+    "            print(\"Raw LLM output:\")\n",
+    "            print(response.content)\n",
+    "        except Exception as e:\n",
+    "            print(\"Error parsing textdown LLM response as JSON:\")\n",
+    "            print(\"Retrying...\")\n",
+    "            time.sleep(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def md_to_json(md_content: str) -> dict:\n",
+    "    \"\"\"\n",
+    "    Converts the markdown of a set to a dictionary: extracts the questions, then runs the Texdown content check.\n",
+    "    \n",
+    "    Args:\n",
+    "        md_content (str): The content of a set.\n",
+    "        \n",
+    "    Returns:\n",
+    "        dict: The result of content_texdown_check applied to the extracted questions.\n",
+    "              May be None if the content check fails to parse a response.\n",
+    "    \"\"\"\n",
+    "    extracted_dict = extract_questions(md_content)\n",
+    "    # validated_dict = task_rules_obeyed_check(extracted_dict)\n",
+    "    content_validated_dict = content_texdown_check(extracted_dict)\n",
+    "    return content_validated_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imported_tutorial = md_to_json(md_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -429,7 +604,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "18",
+   "id": "22",
    "metadata": {},
    "source": [
     "# Form JSON Schemas"
@@ -438,7 +613,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "19",
+   "id": "23",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -469,7 +644,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "20",
+   "id": "24",
    "metadata": {},
    "outputs": [],
    "source": []