diff --git a/.gitignore b/.gitignore index b6e4761..8ee2ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,88 @@ +# Tungsten output +output.json + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +# idea folder, uncomment if you don't need it +# .idea + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..ff07f5d --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,17 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..0d7d3b4 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/tungsten.iml 
"""Parser for Sigma-Aldrich SDS PDFs.

Reconstructs a nested-dictionary outline of the document from the x-coordinates
of pdfminer layout elements and dumps it to ``output.json``.
"""
import json
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Type-checking-only import: keeps the module importable without pdfminer
    # installed; runtime pdfminer imports are function-local below.
    from pdfminer.layout import LTItem


class ParsingElementType(Enum):
    """Broad categories of PDF layout objects relevant to parsing."""
    TEXT = 1
    VECTOR = 2
    RASTER = 3


@dataclass
class ParsingElement:
    """Class used to abstract PDF objects into parsing objects.

    Ordering compares the bottom y-coordinate only, so sorting (and the
    reverse/pop pattern in ``parse_sigma_aldrich``) walks elements in
    approximate top-to-bottom page order.
    """
    x0: float
    y0: float
    x1: float
    y1: float
    type: Optional[ParsingElementType]  # classification not assigned yet (always None)
    element: "LTItem"
    name: str

    def __lt__(self, other: "ParsingElement") -> bool:
        return self.y0 < other.y0


def convert_to_parsing_element(lt_item: "LTItem") -> ParsingElement:
    """Wrap a pdfminer ``LTItem`` in a :class:`ParsingElement`.

    The name is the item's text when it carries text, otherwise the class
    name of the layout object.
    """
    # Local import so the module itself does not require pdfminer at import time.
    from pdfminer.layout import LTText

    return ParsingElement(
        lt_item.x0, lt_item.y0, lt_item.x1, lt_item.y1,
        None,  # type classification is not implemented yet
        lt_item,
        lt_item.get_text() if isinstance(lt_item, LTText) else type(lt_item).__name__,
    )


def generateUniqueName(proposedName: str, nameSet: set) -> str:
    """Return a name unique within *nameSet* and register it in the set.

    Appends apostrophes until the name no longer collides.  (The previous
    implementation neither recorded the chosen name in the set nor
    re-checked the primed name, so duplicate sibling names silently
    overwrote each other in the output dictionary.)
    """
    name = proposedName
    while name in nameSet:
        name += "'"
    nameSet.add(name)
    return name


def parse_sigma_aldrich(filename) -> None:
    """Parse an SDS PDF into a nested dict and dump it to ``output.json``.

    Elements are processed in top-to-bottom order; their left x-coordinate
    drives a stack machine: further right opens a nested dictionary, the
    same column creates a sibling, and further left unwinds one level and
    retries the element.

    Known limitation (unchanged): an element further left than the very
    first element underflows the stacks.
    """
    from pdfminer.high_level import extract_pages

    elements = [element for page in extract_pages(filename) for element in page]

    parsing_elements = [convert_to_parsing_element(element) for element in elements]
    # Reverse so pop() yields elements in top-to-bottom page order.
    parsing_elements.reverse()
    print(parsing_elements)

    # Data structures
    docudata = {}               # nested dictionaries, represents the parsing structure
    level_stack = [docudata]    # stack of dicts, remembers higher-level dictionaries
    existing_names = [set()]    # stack of sets, remembers names used in each scope
    x_stack = []                # stack of x coordinates, one per open level

    # Open the first level with the first element.  The name is registered
    # in the *parent* (root) scope before the child scope is pushed.
    first = parsing_elements.pop()
    x_stack.append(first.x0)
    child = {}
    docudata[generateUniqueName(first.name, existing_names[-1])] = child
    level_stack.append(child)
    existing_names.append(set())

    while parsing_elements:
        # Pop all stacks, get next element
        held_dict = level_stack.pop()
        held_element = parsing_elements.pop()
        held_names = existing_names.pop()
        held_x = x_stack.pop()
        print("======================================\nTesting Element:", held_element.name.strip())

        if held_element.x0 > held_x:
            # Further right: restore what we popped, then open a nested
            # dictionary underneath the dictionary we just popped.
            print("Decision: push dict")
            level_stack.append(held_dict)
            existing_names.append(held_names)
            x_stack.append(held_x)

            child = {}
            held_dict[generateUniqueName(held_element.name, existing_names[-1])] = child
            level_stack.append(child)
            existing_names.append(set())
            # Push new x level, which is further to the right.
            x_stack.append(held_element.x0)
        elif held_element.x0 == held_x:
            # Same column: create a sibling dictionary at the same level as
            # the dictionary we popped (level_stack[-1] is now the parent).
            print("Decision: push element")
            x_stack.append(held_x)

            child = {}
            level_stack[-1][generateUniqueName(held_element.name, existing_names[-1])] = child
            level_stack.append(child)
            existing_names.append(set())
        elif held_element.x0 < held_x:
            # Further left: keep the stacks popped (unwind one level) and
            # retry this element against the previous level's x coordinate.
            print("Decision: pop and wait")
            parsing_elements.append(held_element)
        else:
            # Trichotomy is exhausted; reachable only with NaN coordinates.
            raise Exception("Non-comparable x coordinate: %r" % held_element.x0)
        print("X coordinate stack:", x_stack)

    # Context manager guarantees the file is closed even on a write error.
    with open("output.json", "w") as out_file:
        out_file.write(json.dumps(docudata, sort_keys=False, indent=2))