Skip to content
This repository has been archived by the owner on Jun 15, 2024. It is now read-only.

Commit

Permalink
Merge branch 'main' into dependabot/pip/cryptography-39.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
Den4200 authored Feb 9, 2023
2 parents 13640ae + 273f125 commit 98deac1
Show file tree
Hide file tree
Showing 10 changed files with 272 additions and 95 deletions.
30 changes: 27 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,44 @@ pip install tungsten-sds
## Usage Example

```python
import json
from pathlib import Path

from tungsten import SigmaAldrichSdsParser
from tungsten import SigmaAldrichSdsParser, SdsQueryFieldName, \
SigmaAldrichFieldMapper

sds_parser = SigmaAldrichSdsParser()
sds_path = Path("sigma_aldrich_w4502.pdf")
sds_path = Path("CERILLIAN_L-001.pdf")

# Convert PDF file to parsed data
with open(sds_path, "rb") as f:
sds = sds_parser.parse_to_ghs_sds(f, sds_name=sds_path.stem)
sds = sds_parser.parse_to_ghs_sds(f)

field_mapper = SigmaAldrichFieldMapper()

fields = [
SdsQueryFieldName.PRODUCT_NAME,
SdsQueryFieldName.PRODUCT_NUMBER,
SdsQueryFieldName.PRODUCT_BRAND,
SdsQueryFieldName.RECOMMENDED_USE_AND_RESTRICTIONS,
SdsQueryFieldName.SUPPLIER_ADDRESS,
SdsQueryFieldName.SUPPLIER_TELEPHONE,
SdsQueryFieldName.SUPPLIER_FAX,
SdsQueryFieldName.EMERGENCY_TELEPHONE,
SdsQueryFieldName.IDENTIFICATION_OTHER,
SdsQueryFieldName.SUBSTANCE_CLASSIFICATION,
SdsQueryFieldName.PICTOGRAM,
SdsQueryFieldName.SIGNAL_WORD,
SdsQueryFieldName.HNOC_HAZARD,
]

# Serialize parsed data to JSON and dump to a file
with open(sds_path.stem + ".json", "w") as f:
sds.dump(f)
# Also print out mapped fields
for field in fields:
print(field.name, field_mapper.getField(field, json.loads(sds.dumps())))

```

## License
Expand Down
7 changes: 6 additions & 1 deletion tungsten/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
from tungsten.globally_harmonized_system.safety_data_sheet import (
GhsSdsJsonEncoder
)
from tungsten.parsers.field_parse import SdsQueryFieldName
from tungsten.parsers.supplier.sigma_aldrich.field_parse import (
SigmaAldrichFieldMapper
)
from tungsten.parsers.supplier.sigma_aldrich.sds_parser import (
SigmaAldrichSdsParser
)

os.environ["TABULA_JAR"] = str(
(Path(__file__).parent.parent / "tabula-1.0.6-SNAPSHOT-jar-with-dependencies.jar").resolve())

__all__ = ("GhsSdsJsonEncoder", "SigmaAldrichSdsParser")
__all__ = ("GhsSdsJsonEncoder", "SigmaAldrichSdsParser", "SigmaAldrichFieldMapper",
"SdsQueryFieldName")
7 changes: 7 additions & 0 deletions tungsten/globally_harmonized_system/safety_data_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ def dumps(self, **kwargs: dict) -> str:
return json.dumps(self.to_dict(), cls=GhsSdsJsonEncoder, **kwargs)


class GhsSdsMetaTitle(Enum):
    """Titles of document-level metadata entries of a safety data sheet.

    NOTE(review): presumably these label the SDS version, the revision date and
    the print date -- confirm against the parser that produces them.
    """
    VERSION = enum.auto()
    REVISION_DATE = enum.auto()
    PRINT_DATE = enum.auto()


@dataclass
class GhsSdsSection:
"""Representation of a GHS SDS section in :class:`GhsSafetyDataSheet`"""
Expand Down Expand Up @@ -68,6 +74,7 @@ class GhsSdsItem:
Note that the UN GHS SDS structure is a representation of the SDS document itself, and not
necessarily a structured representation of all fields and data."""
type: GhsSdsItemType
name: str
data: any

def __str__(self):
Expand Down
89 changes: 0 additions & 89 deletions tungsten/parse.py

This file was deleted.

76 changes: 76 additions & 0 deletions tungsten/parsers/field_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import annotations

import abc
import enum
from dataclasses import dataclass
from enum import Enum
from re import Pattern
from typing import Callable


class FieldMapper(metaclass=abc.ABCMeta):
    """Base class that resolves a query field against a parsed SDS structure.

    Subclasses supply, per field, a list of selector commands and a
    post-processing callable via :meth:`getFieldMappings`.
    """

    def getField(self, field: SdsQueryFieldName, target: dict):
        """Look up ``field`` in ``target`` and return the post-processed value.

        The selector commands returned by :meth:`getFieldMappings` are applied
        in order, each one narrowing ``target``; the final selection is then
        handed to the post-processing callable.

        :param field: the field to look up
        :param target: the structure to search (e.g. a deserialized SDS dict)
        :return: the post-processed value, or ``None`` when the field is not
            present or its data does not have the shape the post-processor
            expects
        :raises KeyError: propagated from the selector commands when a
            dictionary on the path lacks the requested key
        """
        commands, post_process = self.getFieldMappings(field)
        for command in commands:
            target = command.match(target)
        # A selector that finds no match yields None; there is nothing to
        # post-process in that case.
        if target is None:
            return None
        try:
            return post_process(target)
        except (KeyError, AttributeError, IndexError, TypeError):
            # The data is present but not in the expected shape: missing key,
            # regex that did not match (``None.group``), empty list, or a
            # non-subscriptable / non-string value.
            return None

    @abc.abstractmethod
    def getFieldMappings(self, field: SdsQueryFieldName) -> tuple[list[SelectCommand], Callable]:
        """Return ``(commands, post_process)`` describing how to resolve ``field``."""


@dataclass
class SelectCommand:
    """One step of a selection path through nested dicts and lists.

    Applied to a dict, the command returns the value stored under ``key``.
    Applied to a list, it returns the first element whose ``key`` entry
    matches ``where_value``.
    """
    key: str
    where_value: any | Pattern

    def __init__(self, key, where_value=None):
        self.key = key
        self.where_value = where_value

    def match(self, targets: dict | list):
        """Apply this command to ``targets``.

        :param targets: a dict, an iterable of dicts/objects, or ``None``
        :return: ``targets[key]`` for a dict; ``None`` for ``None``; otherwise
            the first element whose ``key`` value matches ``where_value``
            (regex ``match`` when ``where_value`` is a compiled pattern,
            equality otherwise), or ``None`` when no element matches
        """
        if isinstance(targets, dict):
            return targets[self.key]
        if targets is None:
            return None

        use_regex = isinstance(self.where_value, Pattern)
        for candidate in targets:
            # Elements may be plain dicts or objects exposing attributes.
            record = candidate if isinstance(candidate, dict) else candidate.__dict__
            value = record[self.key]
            if use_regex:
                found = self.where_value.match(value) is not None
            else:
                found = self.where_value == value
            if found:
                return candidate
        return None


class SdsQueryFieldName(Enum):
    """Names of the fields that can be requested from a ``FieldMapper``.

    The trailing comments name the SDS subsection a field is taken from.
    """
    META_VERSION = enum.auto()  # Document meta version
    META_REVISION_DATE = enum.auto()  # Document supplier-provided revision date
    META_PRINT_DATE = enum.auto()  # Document supplier-provided print date

    PRODUCT_NAME = enum.auto()
    PRODUCT_NUMBER = enum.auto()
    PRODUCT_BRAND = enum.auto()
    RECOMMENDED_USE_AND_RESTRICTIONS = enum.auto()  # Ref. RECOMMENDED_USE_AND_RESTRICTIONS

    SUPPLIER_ADDRESS = enum.auto()  # One entry inside SUPPLIER_DETAILS, not the whole subsection
    SUPPLIER_TELEPHONE = enum.auto()  # One entry inside SUPPLIER_DETAILS, not the whole subsection
    SUPPLIER_FAX = enum.auto()  # One entry inside SUPPLIER_DETAILS, not the whole subsection

    EMERGENCY_TELEPHONE = enum.auto()  # Ref. EMERGENCY_PHONE_NUMBER

    # Ref. OTHER_MEANS_OF_IDENTIFICATION, IDENTIFICATION_OTHER.
    # Unstructured data dump; no schema.
    IDENTIFICATION_OTHER = enum.auto()

    SUBSTANCE_CLASSIFICATION = enum.auto()  # Ref. GHS_SUBSTANCE_CLASSIFICATION

    PICTOGRAM = enum.auto()  # One entry inside GHS_LABEL_ELEMENTS, not the whole subsection
    SIGNAL_WORD = enum.auto()  # One entry inside GHS_LABEL_ELEMENTS, not the whole subsection

    # Ref. OTHER_HAZARDS, HAZARDS_OTHER. Unstructured data dump; no schema.
    HNOC_HAZARD = enum.auto()
2 changes: 1 addition & 1 deletion tungsten/parsers/parsing_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,5 +85,5 @@ def __lt__(self, other: HierarchyElement):

def __str__(self):
s = f"{self.text_content.strip() if self.text_content.strip() != '' else self.class_name}"\
f"(x{self.page_x0},y{self.page_y0}),(x{self.page_x1},y{self.page_y1})"
# f"(x{self.page_x0},y{self.page_y0}),(x{self.page_x1},y{self.page_y1})"
return s
Empty file.
131 changes: 131 additions & 0 deletions tungsten/parsers/supplier/sigma_aldrich/field_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import re
from typing import Callable

from tungsten.parsers.field_parse import (
FieldMapper,
SdsQueryFieldName,
SelectCommand
)


def _first_line_value(lines):
    """Return the first data line with its optional leading colon and whitespace removed."""
    return re.match(r"\:?\s*(.*)", lines[0]).group(1)


def _joined_value(lines):
    """Concatenate all data lines, then drop the optional leading colon and whitespace."""
    return re.match(r"\:?\s*(.*)", "".join(lines), re.DOTALL).group(1)


def _item_names(items):
    """Return the names of all items, one per line."""
    return "\n".join(item["name"] for item in items)


def _signal_word(lines):
    """Return the first occurrence of "danger" or "warning" (any case) in the data lines."""
    return re.search(r"(danger|warning)", "".join(lines), re.IGNORECASE).group(1)


def _unchanged(value):
    """Return the selection as-is (fields without a schema)."""
    return value


def _subsection_items(section, subsection):
    """Build the selector path to the item list of ``subsection`` within ``section``."""
    return [
        SelectCommand(key="sections"),
        SelectCommand(key="title", where_value=section),
        SelectCommand(key="subsections"),
        SelectCommand(key="title", where_value=subsection),
        SelectCommand(key="items"),
    ]


def _item_data(section, subsection, item_name):
    """Build the selector path to the data of the item whose name matches ``item_name``.

    ``item_name`` is a regular expression, matched case-insensitively against
    the start of each item's name.
    """
    return _subsection_items(section, subsection) + [
        SelectCommand(key="name", where_value=re.compile(item_name, re.IGNORECASE)),
        SelectCommand(key="data"),
    ]


# Built once at import time instead of on every lookup.
# The META_VERSION, META_REVISION_DATE and META_PRINT_DATE fields are not mapped yet.
_FIELD_MAPPINGS = {
    SdsQueryFieldName.PRODUCT_NAME: (
        _item_data("IDENTIFICATION", "GHS_PRODUCT_IDENTIFIER", r"Product\sname"),
        _first_line_value,
    ),
    SdsQueryFieldName.PRODUCT_NUMBER: (
        _item_data("IDENTIFICATION", "GHS_PRODUCT_IDENTIFIER", r"Product\sNumber"),
        _first_line_value,
    ),
    SdsQueryFieldName.PRODUCT_BRAND: (
        _item_data("IDENTIFICATION", "GHS_PRODUCT_IDENTIFIER", r"Brand"),
        _first_line_value,
    ),
    SdsQueryFieldName.RECOMMENDED_USE_AND_RESTRICTIONS: (
        _item_data("IDENTIFICATION", "RECOMMENDED_USE_AND_RESTRICTIONS", r"Identified\suses"),
        _first_line_value,
    ),
    SdsQueryFieldName.SUPPLIER_ADDRESS: (
        _item_data("IDENTIFICATION", "SUPPLIER_DETAILS", r"Company"),
        _joined_value,
    ),
    SdsQueryFieldName.SUPPLIER_TELEPHONE: (
        _item_data("IDENTIFICATION", "SUPPLIER_DETAILS", r"Telephone"),
        _first_line_value,
    ),
    SdsQueryFieldName.SUPPLIER_FAX: (
        _item_data("IDENTIFICATION", "SUPPLIER_DETAILS", r"Fax"),
        _first_line_value,
    ),
    SdsQueryFieldName.EMERGENCY_TELEPHONE: (
        _item_data("IDENTIFICATION", "EMERGENCY_PHONE_NUMBER", r"Emergency\sPhone"),
        _joined_value,
    ),
    SdsQueryFieldName.IDENTIFICATION_OTHER: (
        _subsection_items("IDENTIFICATION", "IDENTIFICATION_OTHER"),
        _unchanged,
    ),
    SdsQueryFieldName.SUBSTANCE_CLASSIFICATION: (
        _subsection_items("HAZARDS", "GHS_SUBSTANCE_CLASSIFICATION"),
        _item_names,
    ),
    SdsQueryFieldName.PICTOGRAM: (
        _item_data("HAZARDS", "GHS_LABEL_ELEMENTS", r"Pictogram"),
        _unchanged,
    ),
    SdsQueryFieldName.SIGNAL_WORD: (
        _item_data("HAZARDS", "GHS_LABEL_ELEMENTS", r"Signal\sword"),
        _signal_word,
    ),
    SdsQueryFieldName.HNOC_HAZARD: (
        _subsection_items("HAZARDS", "OTHER_HAZARDS"),
        _unchanged,
    ),
}


class SigmaAldrichFieldMapper(FieldMapper):
    """Field mapper for safety data sheets in the Sigma-Aldrich layout."""

    def getFieldMappings(self, field: SdsQueryFieldName) -> tuple[list[SelectCommand], Callable]:
        """Return the selector path and post-processor for ``field``.

        :param field: the field to resolve
        :return: ``(commands, post_process)``; ``commands`` is a fresh list so
            callers may modify it without affecting later lookups
        :raises KeyError: if ``field`` has no mapping
        """
        commands, post_process = _FIELD_MAPPINGS[field]
        return list(commands), post_process
Loading

0 comments on commit 98deac1

Please sign in to comment.