This repository has been archived by the owner on Jun 15, 2024. It is now read-only.

Commit

initial working hierarchy
GreenCappuccino committed Sep 29, 2022
1 parent 415e3be commit 7dab07f
Showing 8 changed files with 254 additions and 19 deletions.
85 changes: 85 additions & 0 deletions .gitignore
@@ -1,3 +1,88 @@
# Tungsten output
output.json

### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

# idea folder, uncomment if you don't need it
# .idea

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
8 changes: 8 additions & 0 deletions .idea/.gitignore

17 changes: 17 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

8 changes: 8 additions & 0 deletions .idea/tungsten.iml

6 changes: 6 additions & 0 deletions .idea/vcs.xml

135 changes: 116 additions & 19 deletions tungsten/parsers/sigma_aldrich.py
@@ -1,28 +1,125 @@
import json
from dataclasses import dataclass
from enum import Enum
from pprint import pprint

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTItem, LTText, LTTextContainer


class ParsingElementType(Enum):
    TEXT = 1
    VECTOR = 2
    RASTER = 3


@dataclass
class ParsingElement:
    """Class used to abstract PDF objects into parsing objects."""
    x0: float
    y0: float
    x1: float
    y1: float
    type: ParsingElementType
    element: LTItem
    name: str

    def __lt__(self, other):
        return self.y0 < other.y0


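# Wraps a pdfminer layout item as a ParsingElement; the ParsingElementType is left as
# None for now, and the name is the item's text (or its class name for non-text items).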
def convert_to_parsing_element(lt_item: LTItem):
    return ParsingElement(lt_item.x0, lt_item.y0, lt_item.x1, lt_item.y1,
                          None,
                          lt_item,
                          lt_item.get_text() if isinstance(lt_item, LTText) else type(lt_item).__name__)


def generateUniqueName(proposedName: str, nameSet: set) -> str:
    # Append apostrophes until the name is unique within this scope, then record it in
    # the scope's name set so later duplicates do not overwrite this dictionary entry.
    uniqueName = proposedName
    while uniqueName in nameSet:
        uniqueName += "'"
    nameSet.add(uniqueName)
    return uniqueName


def parse_sigma_aldrich(filename) -> None:
    # Currently this program does not handle this edge case:
    # - If an element lies further to the left than the first element,
    #   the x-coordinate stack underflows.

    # Flatten every page's layout items into a single list
    elements = [element for page in extract_pages(filename) for element in page]

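    # 53.88 appears to be the shared left margin (in PDF points) of the section headings
    # in the Sigma-Aldrich SDS layout; any non-empty element starting exactly there is
    # treated as a heading.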
    headers = [
        (index, element)
        for index, element in enumerate(elements)
        if element.x0 == 53.88 and not element.is_empty()
    ]

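    # Slice the flat element list between consecutive headings; the trailing [1:] drops
    # the first heading-to-heading slice (presumably front matter rather than a numbered
    # section).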
    raw_sections = [
        elements[header[0]:nheader[0]]
        for (header, nheader) in zip(headers, headers[1:])
    ][1:]

    # Keep sections whose first element is a text container whose text does not contain
    # "SECTION"; within each kept section, drop empty layout elements.
    sections = [
        [element for element in section if not element.is_empty()]
        for section in raw_sections
        if isinstance(section[0], LTTextContainer)
        and "SECTION" not in section[0].get_text()
    ]

    pprint(sections)
    parsingelements = [convert_to_parsing_element(element) for element in elements]
    parsingelements.reverse()
    print(parsingelements)

    # Data Structures
    docudata = {}       # nested dictionaries, represents the parsing structure
    levelstack = []     # stack of dictionaries, used to remember higher-level dictionaries
    existingnames = []  # stack of sets, used to remember reused names in each scope
    xstack = []         # stack of x coordinates

    # Append and update base dictionary
    levelstack.append(docudata)
    existingnames.append(set())

    # Append and update initial dictionary: the first element's name is registered in the
    # base scope before a fresh name scope is opened for its children
    heldelement = parsingelements.pop()
    xstack.append(heldelement.x0)
    levelstack.append({})
    docudata[generateUniqueName(heldelement.name, existingnames[-1])] = levelstack[-1]
    existingnames.append(set())

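    # Build the hierarchy by treating each element's x0 as an indentation level:
    # an element further right opens a child level under the previous element, an
    # element at the same x0 becomes a sibling, and an element further left closes
    # the current level and is retried against the enclosing level.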
    while len(parsingelements) > 0:
        # Pop all stacks, get the next element
        heldDictionary = levelstack.pop()
        heldElement = parsingelements.pop()
        heldNames = existingnames.pop()
        heldX = xstack.pop()
        print("======================================\nTesting Element:", heldElement.name.strip())
        if heldElement.x0 > heldX:
            # The element is further to the right: push everything we just popped back
            # onto the stacks, then open a new dictionary underneath the popped dictionary.
            print("Decision: push dict")
            levelstack.append(heldDictionary)
            existingnames.append(heldNames)
            xstack.append(heldX)

            # Add a new dictionary one level down
            newDictionary = {}
            heldDictionary[generateUniqueName(heldElement.name, existingnames[-1])] = newDictionary
            levelstack.append(newDictionary)
            existingnames.append(set())
            # Push the new x level, which is further to the right
            xstack.append(heldElement.x0)
        elif heldElement.x0 == heldX:
            # The element is at the same level: create a new dictionary as a sibling
            # of the dictionary we popped.
            print("Decision: push element")
            # The x level remains the same
            xstack.append(heldX)

            # Add a new dictionary at the same level
            newDictionary = {}
            levelstack[-1][generateUniqueName(heldElement.name, existingnames[-1])] = newDictionary
            levelstack.append(newDictionary)
            existingnames.append(set())
        elif heldElement.x0 < heldX:
            # The element is further to the left: close the current level and retry the
            # element against the enclosing level's x coordinate.
            print("Decision: pop and wait")
            parsingelements.append(heldElement)
        else:
            # Should never happen
            raise RuntimeError("Unreachable: x-coordinate comparison fell through")
        print("X coordinate stack:", xstack)

    with open("output.json", "w") as output_file:
        output_file.write(json.dumps(docudata, sort_keys=False, indent=2))

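A minimal usage sketch of the new parser (assumptions: the repository root is on PYTHONPATH, and the input filename below is a placeholder for a real Sigma-Aldrich SDS PDF; per the code above, the hierarchy is written to output.json in the working directory):

import json

from tungsten.parsers.sigma_aldrich import parse_sigma_aldrich

# Placeholder filename; point this at an actual downloaded SDS PDF.
parse_sigma_aldrich("example_sds.pdf")

# Inspect the nested hierarchy that the parser wrote out.
with open("output.json") as result_file:
    print(json.dumps(json.load(result_file), indent=2))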