diff --git a/.gitignore b/.gitignore
index b6e4761..8ee2ea0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,88 @@
+# Tungsten output
+output.json
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# idea folder, uncomment if you don't need it
+# .idea
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..ff07f5d
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..0d7d3b4
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/tungsten.iml b/.idea/tungsten.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/tungsten.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tungsten/parsers/sigma_aldrich.py b/tungsten/parsers/sigma_aldrich.py
index 5bc4f28..e6ec7ed 100644
--- a/tungsten/parsers/sigma_aldrich.py
+++ b/tungsten/parsers/sigma_aldrich.py
@@ -1,28 +1,125 @@
+from collections import deque
from pprint import pprint
+import json
+import pdfminer.layout
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
+from pdfminer.layout import LTComponent
+from pdfminer.layout import LTText
+from pdfminer.layout import LTItem
+
+from dataclasses import dataclass
+from enum import Enum
+
+
+class ParsingElementType(Enum):
+ TEXT = 1
+ VECTOR = 2
+ RASTER = 3
+
+
+@dataclass
+class ParsingElement:
+ """Class used to abstract PDF objects into parsing objects"""
+ x0: float
+ y0: float
+ x1: float
+ y1: float
+ type: ParsingElementType
+ element: LTItem
+ name: str
+
+ def __lt__(self, other):
+ return self.y0 < other.y0
+
+
+def convert_to_parsing_element(lt_item: LTItem):
+ return ParsingElement(lt_item.x0, lt_item.y0, lt_item.x1, lt_item.y1,
+ None,
+ lt_item,
+ lt_item.get_text() if isinstance(lt_item, LTText) else type(lt_item).__name__)
+
+
+def generateUniqueName(proposedName: str, nameSet: set):
+ if proposedName in nameSet:
+ return proposedName + "'"
+ else:
+ return proposedName
def parse_sigma_aldrich(filename) -> None:
+    # Known edge case not yet handled:
+    # - An element positioned further left than the very first element causes an x-coordinate stack underflow
+
elements = [element for elements in extract_pages(filename) for element in elements]
- headers = [
- (index, element)
- for index, element in enumerate(elements)
- if element.x0 == 53.88 and not element.is_empty()
- ]
-
- raw_sections = [
- elements[header[0]:nheader[0]]
- for (header, nheader) in zip(headers, headers[1:])
- ][1:]
-
- sections = [
- [element for element in section if not element.is_empty()]
- for section in raw_sections
- if isinstance(section[0], LTTextContainer)
- and not "SECTION" in section[0].get_text()
- ]
-
- pprint(sections)
+ parsingelements = [convert_to_parsing_element(element) for element in elements]
+ parsingelements.reverse()
+ print(parsingelements)
+
+ # Data Structures
+ docudata = {} # nested dictionaries, represents the parsing structure
+ levelstack = [] # stack of dictionaries, used to remember higher level dictionaries
+ existingnames = [] # stack of sets, used to remember reused names in each scope
+ xstack = [] # stack of x coordinates
+
+ # Append and update base dictionary
+ levelstack.append(docudata)
+ existingnames.append(set())
+
+ # Append and update initial dictionary
+ heldelement = parsingelements.pop()
+ xstack.append(heldelement.x0)
+ levelstack.append({})
+ existingnames.append(set())
+ docudata[generateUniqueName(heldelement.name, existingnames[-1])] = levelstack[-1]
+
+ while len(parsingelements) > 0:
+ # Pop all stacks, get next element
+ heldDictionary = levelstack.pop()
+ heldElement = parsingelements.pop()
+ heldNames = existingnames.pop()
+ heldX = xstack.pop()
+ print("======================================\nTesting Element:", heldElement.name.strip())
+ # If the element is further to the right, push what we just popped back on the stack
+ # Create a new dictionary underneath the dictionary we popped
+ if heldElement.x0 > heldX:
+ print("Decision: push dict")
+ # Push stuff back onto stack
+ levelstack.append(heldDictionary)
+ existingnames.append(heldNames)
+ xstack.append(heldX)
+
+ # Add new dictionary one level down
+ newDictionary = {}
+ heldDictionary[generateUniqueName(heldElement.name, existingnames[-1])] = newDictionary
+ levelstack.append(newDictionary)
+ existingnames.append(set())
+ # Push new x level, which is further to the right
+ xstack.append(heldElement.x0)
+ # If the element is at the same level,
+ # create a new dictionary at the same level as the dictionary we popped
+ elif heldElement.x0 == heldX:
+ print("Decision: push element")
+ # The x level remains the same
+ xstack.append(heldX)
+
+ # Add new dictionary at the same level
+ newDictionary = {}
+ levelstack[-1][generateUniqueName(heldElement.name, existingnames[-1])] = newDictionary
+ levelstack.append(newDictionary)
+ existingnames.append(set())
+        # If the element is further to the left, re-queue it and let the loop
+        # unwind one stack level per iteration until an earlier, matching x level is reached
+ elif heldElement.x0 < heldX:
+ print("Decision: pop and wait")
+ parsingelements.append(heldElement)
+        # Unreachable: the >, ==, < comparisons above are exhaustive for real x coordinates
+ else:
+ raise Exception
+ print("X coordinate stack:", xstack)
+
+ myFile = open("output.json", "w")
+ myFile.write(json.dumps(docudata, sort_keys=False, indent=2))
+ myFile.close()