Support for extracting images as separate elements from both questions and answers.
rolfis · Jan 15, 2021 · 9a54f29 · 9a54f29
1 parent ae555cb
commit 9a54f29
Show file tree

Hide file tree

Showing 7 changed files with 111 additions and 17 deletions.
diff --git a/Pipfile b/Pipfile
@@ -6,6 +6,8 @@ name = "pypi"
 [packages]
 logzero = "*"
 lxml = "*"
+python-docx = "*"
+htmldocx = "*"
 
 [dev-packages]
 

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/src/main.py b/src/main.py
@@ -4,12 +4,15 @@
 """
 
 __author__ = "Rolf Johansson"
+__license__ = "Apache License 2.0"
 __version__ = "0.1.0"
-__license__ = "Apache"
 
 from qti_parser import question_type
+import formats
 import argparse
 import json
+import re
+import hashlib
 from logzero import logger
 from lxml import etree
 
@@ -18,7 +21,6 @@
 } 
 
 def main(args):
-    """ Main entry point of the app """
     logger.info("QTI converter utility.")
     logger.info(args)
 
@@ -31,6 +33,8 @@ def main(args):
                 'title': '', 
                 'question': []
             }
+
+            # TODO: Should be prefixed with PATH part of input filename since paths in XML are relative
             this_assessment_xml = this_assessment['id'] + "/" + this_assessment['id'] + ".xml"
 
             for xml_item in etree.parse(this_assessment_xml).getroot().findall(".//{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}item"):
@@ -42,6 +46,35 @@ def main(args):
                     'text': xml_item.find("{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}presentation/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}material/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}mattext").text
                 }
 
+                # TODO: Fix images in a better way
+                image = []
+                if this_question['text'].lower().find("<p>.*<img"):
+                    for match in re.finditer('<p>.*<img src=\"([^\"]+)\".*>.*</p>', this_question['text'], re.DOTALL):
+                        image.append({
+                            'id': str(hashlib.md5(match.group(1).replace("%24IMS-CC-FILEBASE%24/", "").encode()).hexdigest()),
+                            'href': match.group(1).replace("%24IMS-CC-FILEBASE%24/", "")
+                        })
+                    p = re.compile('<p>.*<img src=\"([^\"]+)\".*>.*</p>')
+                    subn_tuple = p.subn('', this_question['text'])
+                    if subn_tuple[1] > 0:
+                        this_question['text'] = subn_tuple[0]
+
+                elif this_question['text'].lower().find("<img"):
+                    for match in re.finditer('<img src=\"([^\"]+)\".*>', this_question['text'], re.DOTALL):
+                        image.append({
+                            'id': str(hashlib.md5(match.group(1).replace("%24IMS-CC-FILEBASE%24/", "").encode()).hexdigest()),
+                            'href': match.group(1).replace("%24IMS-CC-FILEBASE%24/", "")
+                        })
+                    p = re.compile('<img src=\"([^\"]+)\".*>')
+                    subn_tuple = p.subn('', this_question['text'])
+                    if subn_tuple[1] > 0:
+                        this_question['text'] = subn_tuple[0]
+
+                if image:
+                    this_question['image'] = image
+
+                # <p><img src="Exercise_09_05-06_03a.png" alt="Exercise_09_05-06_03a.png" width="393" height="126"></p>
+
                 if this_question['question_type'] == "multiple_choice_question":
                     this_question['answer'] = question_type.multiple_choice.get_answers(xml_item)
                 elif this_question['question_type'] == "true_false_question":
@@ -56,8 +89,16 @@ def main(args):
 
             qti_resource['assessment'].append(this_assessment)
 
-        qti_resource_json = json.dumps(qti_resource, indent = 2)
-        print(qti_resource_json)
+        if (args.format.lower() == "json"):
+            logger.info("Output to STDOUT as JSON.")
+            qti_resource_json = json.dumps(qti_resource, indent = 2)
+            print(qti_resource_json)
+        elif (args.format.lower() == "pdf"):
+            logger.error("Format not supported yet: " + args.format)
+        elif (args.format.lower() == "docx"):
+            formats.docx.write_file(qti_resource)
+        else:
+            logger.error("Unknown format: " + args.format)
 
     except OSError as e:
         logger.error("%s", e)
@@ -68,11 +109,10 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Convert QTI files into other formats.", add_help=False)
-    parser.add_argument("input", help="QTI input file.")
+    parser.add_argument("input", help="QTI input file (imsmanifest.xml).")
     parser.add_argument("-v", action="count", default=0, help="Verbosity (-v, -vv, etc).")
     parser.add_argument("-f", action="store", dest="format", default="json", help="Output format, defaults to JSON.")
     parser.add_argument( "--version", action="version", help="Display version and exit.", version="%(prog)s (version {version})".format(version=__version__))
     parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help='Show this help message and exit.')
     args = parser.parse_args()
-    print(args)
     main(args)
diff --git a/src/qti_parser/question_type/multiple_answers.py b/src/qti_parser/question_type/multiple_answers.py
@@ -19,7 +19,8 @@ def get_answers(xml):
                 {
                     'id': xml_answer_item.get("ident"),
                     'text': xml_answer_item.find("{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}material/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}mattext").text,
-                    'correct': True if xml_answer_item.get("ident") in correct_answers else False
+                    'correct': True if xml_answer_item.get("ident") in correct_answers else False,
+                    'display': True
                 }
             )
     except OSError as e:

diff --git a/src/qti_parser/question_type/multiple_choice.py b/src/qti_parser/question_type/multiple_choice.py
@@ -4,6 +4,8 @@
 
 from lxml import etree
 from logzero import logger
+import re
+import hashlib
 
 def get_answers(xml):
     """ Return an array of possible answers """
@@ -15,13 +17,29 @@ def get_answers(xml):
 
     try:
         for xml_answer_item in xml.findall(".//{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}response_label"):
-            answers.append(
-                {
-                    'id': xml_answer_item.get("ident"),
-                    'text': xml_answer_item.find("{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}material/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}mattext").text,
-                    'correct': True if xml_answer_item.get("ident") in correct_answers else False
-                }
-            )
+            image = []
+            this_answer = {}
+            this_answer['id'] = xml_answer_item.get("ident")
+            this_answer['text'] = xml_answer_item.find("{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}material/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}mattext").text
+            this_answer['correct'] = True if xml_answer_item.get("ident") in correct_answers else False
+            this_answer['display'] = True
+
+            if this_answer['text'].lower().find("<img.*"):
+                for match in re.finditer('^<img src="([^"]+)".*>', this_answer['text'], re.DOTALL):
+                    image.append({
+                        'id': str(hashlib.md5(match.group(1).replace("%24IMS-CC-FILEBASE%24/", "").encode()).hexdigest()),
+                        'href': match.group(1).replace("%24IMS-CC-FILEBASE%24/", "")
+                    })
+                p = re.compile('<img src="([^"]+)".*>')
+                subn_tuple = p.subn('', this_answer['text'])
+                if subn_tuple[1] > 0:
+                    this_answer['text'] = subn_tuple[0]
+
+            if image:
+                this_answer['image'] = image
+
+            answers.append(this_answer)
+
     except OSError as e:
         logger.error("%s", e)
     except etree.ParseError as e:

diff --git a/src/qti_parser/question_type/short_answer.py b/src/qti_parser/question_type/short_answer.py
@@ -17,7 +17,8 @@ def get_answers(xml):
                 {
                     'id': str(i),
                     'text': xml_answer_item.text,
-                    'correct': True
+                    'correct': True,
+                    'display': False
                 }
             )
     except OSError as e:

diff --git a/src/qti_parser/question_type/true_false.py b/src/qti_parser/question_type/true_false.py
@@ -19,7 +19,8 @@ def get_answers(xml):
                 {
                     'id': xml_answer_item.get("ident"),
                     'text': xml_answer_item.find("{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}material/{http://www.imsglobal.org/xsd/ims_qtiasiv1p2}mattext").text,
-                    'correct': True if xml_answer_item.get("ident") in correct_answers else False
+                    'correct': True if xml_answer_item.get("ident") in correct_answers else False,
+                    'display': True
                 }
             )
     except OSError as e: