Implemented a strategy that procedurally converts a list of questions and solutions to JSON in mathpix_to_llm_to_procedural_to_JSON.ipynb

HarrySu123 · HarrySu123 · commit 9ec40f039262 · 2025-07-08T11:23:57.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_llm_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_llm_to_JSON.ipynb
@@ -117,7 +117,7 @@
    "outputs": [],
    "source": [
     "folder_path = \"conversion_content\"\n",
-    "output_path = f\"{folder_path}/mathpix_to_llm_to_JSON_out\"\n",
+    "output_path = f\"{folder_path}/mathpix_to_llm_to_llm_to_JSON_out\"\n",
     "media_path = f\"{output_path}/media\"\n",
     "\n",
     "Path(media_path).mkdir(parents=True, exist_ok=True)\n",
diff --git a/conversion2025/mathpix_to_llm_to_procedural_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_procedural_to_JSON.ipynb
@@ -117,7 +117,7 @@
    "outputs": [],
    "source": [
     "folder_path = \"conversion_content\"\n",
-    "output_path = f\"{folder_path}/mathpix_to_llm_to_JSON_out\"\n",
+    "output_path = f\"{folder_path}/mathpix_to_llm_to_procedural_to_JSON_out\"\n",
     "media_path = f\"{output_path}/media\"\n",
     "\n",
     "Path(media_path).mkdir(parents=True, exist_ok=True)\n",
@@ -457,74 +457,40 @@
     "    publish: bool = Field(..., description=\"Publish flag\")\n",
     "    title: str = Field(..., description=\"Question title\")\n",
     "\n",
-    "def create_question_json(question: str, solution: str) -> dict:\n",
-    "    # Initialize the output parser using the defined Pydantic model.\n",
-    "    parser = PydanticOutputParser(pydantic_object=QuestionJson)\n",
-    "\n",
-    "    # Minimum JSON template to guide the model. (Used as context.)\n",
-    "    minimum_json_template = r'''{\n",
-    "      \"orderNumber\": 0,\n",
-    "      \"displayFinalAnswer\": true,\n",
-    "      \"displayStructuredTutorial\": true,\n",
-    "      \"displayWorkedSolution\": true,\n",
-    "      \"displayChatbot\": false,\n",
-    "      \"masterContent\": \"Top level question here\",\n",
-    "      \"parts\": [\n",
-    "        {\n",
-    "          \"answerContent\": \"final answer here corresponding the part, is no answer found, leave empty\",\n",
-    "          \"content\": \"part question text here, if only one part, then leave empty\",\n",
-    "          \"orderNumber\": 0,\n",
-    "          \"responseAreas\": [],\n",
-    "          \"tutorial\": [],\n",
-    "          \"workedSolution\": {\n",
-    "            \"content\": \"Part worked solution here\",\n",
-    "            \"title\": \"\",\n",
-    "            \"children\": []\n",
-    "          }\n",
-    "        }\n",
-    "      ],\n",
-    "      \"publish\": false,\n",
-    "      \"title\": \"Question title here\"\n",
-    "    }'''\n",
-    "\n",
-    "    # Construct the prompt, appending the parser's format instructions.\n",
-    "    question_prompt = f'''\n",
-    "      JSON_TEMPLATE\n",
-    "      ```json\n",
-    "      {minimum_json_template}\n",
-    "      ```\n",
-    "\n",
-    "      IMPORTED_QUESTION\n",
-    "      ```markdown\n",
-    "      {question}\n",
-    "      ```\n",
-    "\n",
-    "      IMPORTED_SOLUTION\n",
-    "      ```markdown\n",
-    "      {solution}\n",
-    "      ```\n",
-    "\n",
-    "      Preserve the markdown math formatting to use $...$ for math expressions. Do not modify the original text of the question.\n",
-    "\n",
-    "      Infer the final answer and put it in the answerContent field of the part. \n",
-    "      The worked solution should be in the workedSolution.content field.\n",
-    "\n",
-    "      If you cannot find a suitable text for any of the sections, leave it empty.\n",
-    "\n",
-    "      {parser.get_format_instructions()}\n",
-    "      '''\n",
+    "def create_question_json(question: dict, solution: dict) -> dict:\n",
+    "    \n",
+    "    # create the list of parts from the question and solution.\n",
+    "    # Each part corresponds to a subquestion and its worked solution.\n",
+    "    # The orderNumber is the index of the subquestion in the list.\n",
+    "    parts = []\n",
+    "    for idx, (subquestion, workedSolution) in enumerate(zip(question.get(\"subquestions\", []), solution.get(\"workedSolutions\", []))):\n",
+    "        part = Part(\n",
+    "            answerContent=\"\",\n",
+    "            content=subquestion,\n",
+    "            orderNumber=idx,\n",
+    "            responseAreas=[],\n",
+    "            tutorial=[],\n",
+    "            workedSolution=WorkedSolution(\n",
+    "                content=workedSolution,\n",
+    "                title=subquestion,\n",
+    "                children=[]\n",
+    "            )\n",
+    "        )\n",
+    "        parts.append(part)\n",
+    "    \n",
+    "    # Create the QuestionJson object with the provided question and solution.\n",
+    "    return QuestionJson(\n",
+    "            orderNumber=0,\n",
+    "            displayFinalAnswer=True,\n",
+    "            displayStructuredTutorial=True,\n",
+    "            displayWorkedSolution=True,\n",
+    "            masterContent=question.get(\"content\", \"\"),\n",
+    "            parts=parts,\n",
+    "            publish=False,\n",
+    "            title=question.get(\"title\", \"\")\n",
+    "        ).model_dump()\n",
     "\n",
-    "    # Invoke the language model.\n",
-    "    response = llm.invoke(question_prompt)\n",
     "\n",
-    "    try:\n",
-    "        # Parse the response using the output parser.\n",
-    "        parsed_output = parser.parse(response.content)\n",
-    "        return parsed_output.model_dump()  # Return as a dictionary.\n",
-    "    except Exception as e:\n",
-    "        print(\"Error parsing JSON from LLM response:\", e)\n",
-    "        print(\"LLM response:\", response.content)\n",
-    "        return None\n",
     "\n"
    ]
   },