Added an extra step to separate the extraction of questions from the extraction of their parts

HarrySu123 · HarrySu123 · commit b820b4c9799d · 2025-07-10T13:44:34.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb
@@ -294,6 +294,89 @@
    "id": "15",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# Define the initial question model.\n",
+    "class QuestionModel(BaseModel):\n",
+    "    # full question and full solution\n",
+    "    question_content: str = Field(..., description=\"The content of the question.\")\n",
+    "    solution_content: str = Field(..., description=\"The content of the solution.\")\n",
+    "\n",
+    "class AllQuestionsModel(BaseModel):\n",
+    "    name: str = Field(..., description=\"Title of the set\")\n",
+    "    year: str = Field(..., description=\"Year of the set\")\n",
+    "    questions: list[QuestionModel] = Field(..., description=\"A list of questions.\")\n",
+    "\n",
+    "llm_task_seperate_questions = \"\"\"\n",
+    "    Your task is to extract all the individual questions and their worked solutions from the markdown content.\n",
+    "    please follow these steps carefully:\n",
+    "        1. you can choose the name of \"AllQuestionModel\".\n",
+    "        2. Identify the year of the tutorial, if mentioned. Otherwise, use \"0\".\n",
+    "        3. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.\n",
+    "        4. Look through the entire markdown:\n",
+    "            - Without ignoring any mentions of images, figures, or other media.\n",
+    "            - Identify full Questions, place it into question_content\n",
+    "            - Identify the full Worked Solution for each full Question.\n",
+    "            - If the Worked Solution is not found, try to find the Answers associated with it instead.\n",
+    "            - If Worked Solution or Answers are found, place it into the solution_content. Otherwise leave as empty string, \"\".\n",
+    "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
+    "        6. The Text inside the JSON should be in Lexdown:\n",
+    "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
+    "            2. do not remove or collapse blank lines.\n",
+    "            3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "def extract_questions(doc_page_content: str) -> dict:\n",
+    "    # Initialise the parser for the output.\n",
+    "    parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)\n",
+    "\n",
+    "    prompt = f\"\"\"\n",
+    "        Your task is to extract a JSON with the following structure exactly:\n",
+    "        {parser.get_format_instructions()}\n",
+    "\n",
+    "        {llm_task_seperate_questions}\n",
+    "\n",
+    "        Input markdown:\n",
+    "        ```\n",
+    "        {doc_page_content}\n",
+    "        ```\n",
+    "        Return the JSON now.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # tries to call the LLM multiple times to ensure robustness.\n",
+    "    for i in range(3):\n",
+    "        \n",
+    "        # Call the LLM\n",
+    "        response = llm.invoke(prompt)\n",
+    "\n",
+    "        # Debug: print the raw LLM response\n",
+    "        # print(\"Raw LLM Response:\")\n",
+    "        # print(response)\n",
+    "\n",
+    "        try:\n",
+    "            # Parse the response using the output parser.\n",
+    "            parsed_output = parser.parse(response.content)\n",
+    "            print(\"LLM response successfully parsed as JSON with questions.\")\n",
+    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
+    "            return parsed_output.model_dump()\n",
+    "        except ValidationError as ve:\n",
+    "            print(\"❌ Pydantic Validation Error:\")\n",
+    "            for error in ve.errors():\n",
+    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
+    "            print(\"Raw LLM output:\")\n",
+    "            print(response.content)\n",
+    "        except Exception as e:\n",
+    "            print(\"Error parsing LLM response as JSON:\")\n",
+    "            print(\"Retrying...\")\n",
+    "            time.sleep(2)\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# Define the schema for the tutorial output.\n",
     "class Set_Question(BaseModel):\n",
@@ -307,26 +390,29 @@
     "    year: str = Field(..., description=\"Year of the set\")\n",
     "    questions: list[Set_Question] = Field(..., description=\"List of questions in the set\")\n",
     "\n",
-    "llm_task = \"\"\"\n",
-    "Please follow these steps carefully:\n",
-    "    1. You can decide what to call the Set.\n",
-    "    2. Identify the year of the tutorial, if mentioned. Otherwise, use \"0\".\n",
-    "    3. Every character should match the original source exactly unless you're instructed to split content into fields.\n",
-    "    4. Identify the questions in the Input markdown and add them to the \"questions\" list.\n",
-    "    5. for each question:\n",
-    "        - Title is the only field where you are allowed to name it whatever you seem fit for the question.\n",
-    "        - Identify the content of the question, which will be always visible above the individual parts. This field uses the Milkdown editor.\n",
-    "        - Identify the parts of the question (subquestions) and their worked solutions. The parts could be obvious to find, like \"a)...\", \"b)...\", etc., or they could be implied by the question itself.\n",
-    "        - If the worked solution is not given, leave the worked solution empty.\n",
-    "        - Add the parts of the question (subquestions) and their worked solutions to the \"parts\" and \"parts_solutions\" lists, respectively.\n",
-    "    6. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
-    "    7. The Text inside the JSON should be in Lexdown:\n",
-    "        1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
-    "        2. do not remove or collapse blank lines.\n",
-    "        3. Do not escape characters like `\\n` or `\\\\`.\n",
-    "\"\"\"\n",
-    "\n",
-    "def extract_questions(doc_page_content: str, extra_instruction: str = \"\") -> dict:\n",
+    "\n",
+    "# TODO: make the parts completely separate from the stem of the question, and\n",
+    "# ensure no answer appears in the question itself, only in parts_solutions.\n",
+    "llm_task_seperate_parts = \"\"\"\n",
+    "    Your task is to seperate the questions into indicidual parts and their worked solutions.\n",
+    "    Please follow these steps carefully:\n",
+    "        1. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.\n",
+    "        2. Use the same name and year.\n",
+    "        3. For each question in questions:\n",
+    "            - Title is the only field where you are allowed to name it whatever you seem fit for the question.\n",
+    "            - Do not neglect any images, figures, or other media mentioned in the question.\n",
+    "            - Identify the stem and parts of the question, the parts may be obvious to find, like \"a)...\", \"b)...\", etc., or they could be implied by the question itself. All question must have at least one part, if there is only one part. :\n",
+    "                1. The stem should be placed into the \"content\" field. Text in this field should be valid in the Milkdown editor. \n",
+    "                2. the parts of the question (subquestions) should be placed into the \"parts\" field. Text in this field should be valid under Lexdown.\n",
+    "                3. for each part, identify the worked solution/answer and place it into the \"parts_solutions\" field, if not found, leave as empty string, \"\". Text in this field should be valid under Lexdown.\n",
+    "        4. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no code fence or extra text. Use plain newlines (not escaped as `\\n`).\n",
+    "        5. The Text inside the JSON should be in Lexdown:\n",
+    "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
+    "            2. Do not remove or collapse blank lines.\n",
+    "            3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "    \"\"\"\n",
+    "\n",
+    "def extract_parts(questions_dict: dict) -> dict:\n",
     "    \"\"\"\n",
     "    Extracts the title and individual questions from a tutorial sheet.\n",
     "\n",
@@ -361,15 +447,13 @@
     "    prompt = f\"\"\"\n",
     "        Input markdown:\n",
     "        ```markdown\n",
-    "        {doc_page_content}\n",
+    "        {questions_dict}\n",
     "        ```\n",
     "\n",
     "        Your task is to extract a JSON with the following structure exactly:\n",
     "        {parser.get_format_instructions()}\n",
     "\n",
-    "        {llm_task}\n",
-    "\n",
-    "        {extra_instruction}\n",
+    "        {llm_task_seperate_parts}\n",
     "\n",
     "        Return the JSON now.\n",
     "        \"\"\"\n",
@@ -387,7 +471,7 @@
     "        try:\n",
     "            # Parse the response using the output parser.\n",
     "            parsed_output = parser.parse(response.content)\n",
-    "            print(\"LLM response successfully parsed as JSON.\")\n",
+    "            print(\"LLM response successfully parsed as JSON with parts.\")\n",
     "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "            return parsed_output.model_dump()\n",
     "        except ValidationError as ve:\n",
@@ -404,7 +488,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "16",
+   "id": "17",
    "metadata": {},
    "source": [
     "# LLM evaluation of the content of JSON"
@@ -413,68 +497,37 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17",
+   "id": "18",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def task_rules_obeyed_check(extracted_dict: dict) -> dict:\n",
-    "    \"\"\"\n",
-    "    Extracts the title and individual questions from a tutorial sheet.\n",
-    "    \n",
-    "    Args:\n",
-    "        md_content (str): The content of a set.\n",
-    "        \n",
-    "    Returns:\n",
-    "        dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
-    "              If parsing fails, returns None.\n",
-    "    \"\"\"\n",
-    "    json_string = json.dumps(extracted_dict, indent=2)\n",
-    "    \n",
-    "    # prompt to let llm validate the JSON.\n",
-    "    validation_prompt = f\"\"\"\n",
-    "    You were given the following rules to follow:\n",
-    "    {llm_task}\n",
-    "\n",
-    "    please make sure that the content of the JSON followed the rules above and return the new JSON.\n",
-    "    {json_string}\n",
+    "llm_task_expression_check = r\"\"\"\n",
+    "    Look inside the JSON object's `questions`, specifically the `content`, `parts`, and `parts_solutions` fields. Ensure that the JSON content follows these rules:\n",
+    "        1. No extra escaping: The JSON string must contain no literal `\\\\n`, `\\\\\\\\`, or unnecessary escape sequences unless they are explicitly present in the original input text.\n",
+    "        2. Careful to make the distinction between inline and display math, i.e. do not mess up the use of `$` and `$$`.\n",
+    "        3. Math delimiters: All mathematical expressions must be fully enclosed within math delimiters — use `$...$` for inline math, and `$$...$$` for display math.\n",
+    "        4. Balanced delimiters:\n",
+    "            - All `$$` and `$` must be properly opened and closed.\n",
+    "            - No unbalanced or partial math blocks.\n",
+    "        4. Display math formatting:\n",
+    "            - The opening `$$` must appear on a new line.\n",
+    "            - The closing `$$` must also be on its own new line.\n",
+    "            - The math content must appear immediately between them, with no extra blank lines unless they are part of the input.\n",
+    "        5. Inline math rules:\n",
+    "            - `$...$` should not span multiple lines.\n",
+    "            - Avoid using `$$` for short inline expressions.\n",
+    "        6. Preserve LaTeX syntax:\n",
+    "            - All LaTeX commands, backslashes (`\\`), braces (`{}`, `[]`), and special characters must be preserved exactly as in the original input.\n",
+    "            - Do not add or remove escaping.\n",
+    "        7. Blank lines:\n",
+    "            - Preserve all blank lines inside math blocks.\n",
+    "            - Outside math, follow the structure of the original input.\n",
+    "        8. Output format:\n",
+    "            - Output a single valid JSON string.\n",
+    "            - Do not include any extra characters, explanations, or escaped formatting outside the JSON structure.\n",
     "    \"\"\"\n",
     "\n",
-    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
-    "    # loop 3 times to ensure robustness.\n",
-    "    for i in range(3):\n",
-    "        \n",
-    "        # Call the LLM\n",
-    "        response = llm.invoke(validation_prompt)\n",
-    "\n",
-    "        # Debug: print the raw LLM response\n",
-    "        # print(\"Raw LLM Response:\")\n",
-    "        # print(response)\n",
     "\n",
-    "        try:\n",
-    "            # Parse the response using the output parser.\n",
-    "            parsed_output = parser.parse(response.content)\n",
-    "            print(\"LLM response successfully parsed as validatedJSON.\")\n",
-    "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
-    "            return parsed_output.model_dump()\n",
-    "        except ValidationError as ve:\n",
-    "            print(\"❌ Pydantic Validation Error:\")\n",
-    "            for error in ve.errors():\n",
-    "                print(f\" - {error['loc']}: {error['msg']}\")\n",
-    "            print(\"Raw LLM output:\")\n",
-    "            print(response.content)\n",
-    "        except Exception as e:\n",
-    "            print(\"Error parsing validation LLM response as JSON:\")\n",
-    "            print(\"Retrying...\")\n",
-    "            time.sleep(2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "18",
-   "metadata": {},
-   "outputs": [],
-   "source": [
     "def content_texdown_check(validated_dict: dict) -> dict:\n",
     "    \"\"\"\n",
     "    Checks if the content of the JSON is in Texdown format.\n",
@@ -490,21 +543,12 @@
     "    \n",
     "    # prompt to let llm validate the JSON.\n",
     "    validation_prompt = f\"\"\"\n",
-    "    Here is a JSON:\n",
+    "    {llm_task_expression_check}\n",
+    "\n",
+    "    Input JSON:\n",
     "    ```json\n",
     "    {json_string}\n",
     "    ```\n",
-    "\n",
-    "    look inside questions, content, parts and parts_solutions, ensure that the content of the JSON follows these rules:\n",
-    "        1. Ensure the JSON string contains no literal \"\\n\" or \"\\\\\" characters unless explicitly part of the input text.\n",
-    "        2. All mathematical expressions and formulas must be fully enclosed within matching math delimiters: either inline math `$...$` or display math `$$...$$`.\n",
-    "        3. Verify all `$$` delimiters are properly opened and closed; no unbalanced or partial math blocks allowed.\n",
-    "        4. Verify all `$` delimiters are properly opened and closed; inline math should not span multiple lines.\n",
-    "        5. Preserve all LaTeX formatting, including backslashes and braces, exactly as in the input without adding extra escaping or modifying math commands.\n",
-    "        6. Blank lines inside math blocks must be preserved as-is.\n",
-    "        7. Output only a valid JSON string without any additional escaping or characters.\n",
-    "\n",
-    "\n",
     "    return the JSON with the content fixed if needed.\n",
     "    \"\"\"\n",
     "\n",
@@ -523,7 +567,7 @@
     "        try:\n",
     "            # Parse the response using the output parser.\n",
     "            parsed_output = parser.parse(response.content)\n",
-    "            print(\"LLM response successfully parsed as Texdown JSON.\")\n",
+    "            print(\"LLM response successfully parsed as JSON with valid $$.\")\n",
     "            # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "            return parsed_output.model_dump()\n",
     "        except ValidationError as ve:\n",
@@ -556,8 +600,8 @@
     "        dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
     "              If parsing fails, returns None.\n",
     "    \"\"\"\n",
-    "    extracted_dict = extract_questions(md_content)\n",
-    "    # validated_dict = task_rules_obeyed_check(extracted_dict)\n",
+    "    questions_dict = extract_questions(md_content)\n",
+    "    extracted_dict = extract_parts(questions_dict)\n",
     "    content_validated_dict = content_texdown_check(extracted_dict)\n",
     "    return content_validated_dict"
    ]