initial working hierarchy

CrucibleSDS · Sep 29, 2022 · 7dab07f · 7dab07f
1 parent 415e3be
commit 7dab07f
Show file tree

Hide file tree

Showing 8 changed files with 254 additions and 19 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,88 @@
+# Tungsten output
+output.json
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# idea folder, uncomment if you don't need it
+# .idea
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/tungsten.iml b/.idea/tungsten.iml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/tungsten/parsers/sigma_aldrich.py b/tungsten/parsers/sigma_aldrich.py
@@ -1,28 +1,125 @@
+from collections import deque
 from pprint import pprint
+import json
 
+import pdfminer.layout
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
+from pdfminer.layout import LTComponent
+from pdfminer.layout import LTText
+from pdfminer.layout import LTItem
+
+from dataclasses import dataclass
+from enum import Enum
+
+
+class ParsingElementType(Enum):
+    """Kind of content a ParsingElement carries."""
+    # NOTE(review): presumably TEXT = textual layout items, VECTOR = drawn
+    # paths, RASTER = bitmap images; nothing visible here assigns these yet.
+    TEXT = 1
+    VECTOR = 2
+    RASTER = 3
+
+
+@dataclass
+class ParsingElement:
+    """Class used to abstract PDF objects into parsing objects
+
+    Instances are built by convert_to_parsing_element from a pdfminer layout
+    item. Ordering with ``<`` considers only ``y0``.
+    """
+    # x0, y0, x1, y1 are copied from the same-named attributes of the wrapped
+    # layout item (presumably its bounding box in PDF coordinates; confirm
+    # against the pdfminer documentation).
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+    # Kind of content; convert_to_parsing_element currently passes None here.
+    type: ParsingElementType
+    # The wrapped pdfminer layout item.
+    element: LTItem
+    # The item's text when it is an LTText, otherwise its class name.
+    name: str
+
+    def __lt__(self, other):
+        # Only the y0 coordinate takes part in the comparison.
+        return self.y0 < other.y0
+
+
+def convert_to_parsing_element(lt_item: LTItem):
+    """Wrap a pdfminer layout item in a ParsingElement.
+
+    The coordinates are copied from ``lt_item``. The name is the item's text
+    when it is an ``LTText`` and the name of its class otherwise.
+
+    :param lt_item: layout item exposing ``x0``, ``y0``, ``x1`` and ``y1``.
+    :return: a new ParsingElement that references ``lt_item``.
+    """
+    if isinstance(lt_item, LTText):
+        label = lt_item.get_text()
+    else:
+        label = type(lt_item).__name__
+
+    return ParsingElement(
+        x0=lt_item.x0,
+        y0=lt_item.y0,
+        x1=lt_item.x1,
+        y1=lt_item.y1,
+        # NOTE(review): no ParsingElementType is chosen yet, the field is
+        # left as None for every item.
+        type=None,
+        element=lt_item,
+        name=label,
+    )
+
+
+def generateUniqueName(proposedName: str, nameSet: set):
+    """Return a name that is unused in ``nameSet`` and record it there.
+
+    If ``proposedName`` is already taken, apostrophes are appended until the
+    result no longer collides with any recorded name. The returned name is
+    added to ``nameSet`` so that later calls for the same scope see it;
+    without that, repeated names were never detected and silently replaced
+    each other as dictionary keys.
+
+    :param proposedName: the preferred name.
+    :param nameSet: names already used in the current scope; updated in place.
+    :return: ``proposedName`` itself, or ``proposedName`` followed by one or
+        more ``'`` characters.
+    """
+    uniqueName = proposedName
+    # A single apostrophe is not enough when "name'" has been handed out too.
+    while uniqueName in nameSet:
+        uniqueName += "'"
+    nameSet.add(uniqueName)
+    return uniqueName
 
 
 def parse_sigma_aldrich(filename) -> None:
+    """Parse a PDF into a hierarchy of nested dictionaries and dump it as JSON.
+
+    Every layout element yielded by ``extract_pages`` is converted to a
+    ``ParsingElement`` and nested by its left edge (``x0``), compared with the
+    ``x0`` of the most recently opened level:
+
+    - further right: the element becomes a child of that level;
+    - equal: the element becomes a sibling of that level;
+    - further left: that level is closed and the element is tested again
+      against the level above it.
+
+    An element further left than every open level starts a new top-level
+    entry, and a document without any elements produces an empty JSON object.
+
+    Side effects: prints a decision trace to stdout and writes the result to
+    ``output.json`` in the current working directory.
+
+    :param filename: PDF to read; passed unchanged to ``extract_pages``.
+    :raises ValueError: if an element's ``x0`` cannot be ordered against the
+        current level (for example a NaN coordinate).
+    """
     elements = [element for elements in extract_pages(filename) for element in elements]

-    headers = [
-        (index, element)
-        for index, element in enumerate(elements)
-        if element.x0 == 53.88 and not element.is_empty()
-    ]
-
-    raw_sections = [
-        elements[header[0]:nheader[0]]
-        for (header, nheader) in zip(headers, headers[1:])
-    ][1:]
-
-    sections = [
-        [element for element in section if not element.is_empty()]
-        for section in raw_sections
-        if isinstance(section[0], LTTextContainer)
-        and not "SECTION" in section[0].get_text()
-    ]
-
-    pprint(sections)
+    parsingelements = [convert_to_parsing_element(element) for element in elements]
+    # Reversed so that list.pop() hands the elements out in document order
+    parsingelements.reverse()
+    print(parsingelements)
+
+    # Data Structures
+    docudata = {}  # nested dictionaries, represents the parsing structure
+    levelstack = [docudata]  # stack of open dictionaries, the root comes first
+    existingnames = [set()]  # stack of sets, names already used in each open dictionary
+    xstack = []  # stack of x coordinates, one per open level below the root
+
+    def open_level(element):
+        # Attach a new dictionary for `element` to the innermost open
+        # dictionary and make it the new innermost level.
+        newDictionary = {}
+        levelstack[-1][generateUniqueName(element.name, existingnames[-1])] = newDictionary
+        levelstack.append(newDictionary)
+        existingnames.append(set())
+        xstack.append(element.x0)
+
+    def close_level():
+        # Drop the innermost open level from all three stacks.
+        levelstack.pop()
+        existingnames.pop()
+        xstack.pop()
+
+    # The first element opens the first level below the root without a trace
+    if parsingelements:
+        open_level(parsingelements.pop())
+
+    while parsingelements:
+        heldElement = parsingelements.pop()
+        print("======================================\nTesting Element:", heldElement.name.strip())
+        # If only the root is left open, the element is further to the left
+        # than everything seen so far: start a new top-level dictionary
+        # instead of popping from empty stacks.
+        if not xstack:
+            print("Decision: push top-level dict")
+            open_level(heldElement)
+        # If the element is further to the right,
+        # create a new dictionary underneath the current one
+        elif heldElement.x0 > xstack[-1]:
+            print("Decision: push dict")
+            open_level(heldElement)
+        # If the element is at the same level,
+        # replace the current dictionary with a new sibling under the same parent
+        elif heldElement.x0 == xstack[-1]:
+            print("Decision: push element")
+            close_level()
+            open_level(heldElement)
+        # If the element is further to the left,
+        # close the current level and test the element again on the next pass
+        elif heldElement.x0 < xstack[-1]:
+            print("Decision: pop and wait")
+            close_level()
+            parsingelements.append(heldElement)
+        # Only reachable when the coordinates do not compare at all
+        else:
+            raise ValueError(f"cannot order x coordinate {heldElement.x0!r} against {xstack[-1]!r}")
+        print("X coordinate stack:", xstack)
+
+    # The context manager closes the file even if serialisation fails
+    with open("output.json", "w") as outputFile:
+        outputFile.write(json.dumps(docudata, sort_keys=False, indent=2))