Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions docs/dd/how-to/ligands.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,11 @@ from deeporigin.drug_discovery import Ligand
from rdkit import Chem

# Create an RDKit molecule
mol = Chem.MolFromSmiles("CCO") # Ethanol
mol = Chem.MolFromSmiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O")

# Convert to a Ligand
ligand = Ligand.from_rdkit_mol(
mol=mol,
name="Ethanol", # Optional name for the ligand
)
```

Expand All @@ -146,7 +145,7 @@ You can also create a `LigandSet` from a list of RDKit molecules:
from deeporigin.drug_discovery import LigandSet
from rdkit import Chem

mols = [Chem.MolFromSmiles("CCO"), Chem.MolFromSmiles("CCCO")]
mols = [Chem.MolFromSmiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O"), Chem.MolFromSmiles("CCCC")]
ligands = LigandSet.from_rdkit_mols(mols)
```

Expand Down
2 changes: 1 addition & 1 deletion docs/dd/ref/ligand.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Example:
```python
from deeporigin.drug_discovery.structures import Ligand

lig = Ligand.from_smiles("CCO", name="Ethanol")
lig = Ligand.from_smiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O")
lig.prepare() # Preserves hydrogens by default
lig.prepare(remove_hydrogens=True) # Remove hydrogens from SMILES
```
127 changes: 127 additions & 0 deletions src/drug_discovery/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,130 @@
Ligands containing atoms outside this set should be rejected by preparation
utilities.
"""


# Symbols accepted inside SMILES bracket atoms (``[...]``).
#
# Contains the chemical element symbols plus the lower-case aromatic symbols.
# Elements 113, 115, 117 and 118 are listed under both their current IUPAC
# symbols (Nh, Mc, Ts, Og) and the retired systematic placeholders
# (Uut, Uup, Uus, Uuo); the placeholders are kept for backward compatibility.
ELEMENT_SYMBOLS = {
    "Ac",
    "Ag",
    "Al",
    "Am",
    "Ar",
    "As",
    "At",
    "Au",
    "B",
    "Ba",
    "Be",
    "Bh",
    "Bi",
    "Bk",
    "Br",
    "C",
    "Ca",
    "Cd",
    "Ce",
    "Cf",
    "Cl",
    "Cm",
    "Cn",
    "Co",
    "Cr",
    "Cs",
    "Cu",
    "Db",
    "Ds",
    "Dy",
    "Er",
    "Es",
    "Eu",
    "F",
    "Fe",
    "Fl",
    "Fm",
    "Fr",
    "Ga",
    "Gd",
    "Ge",
    "H",
    "He",
    "Hf",
    "Hg",
    "Ho",
    "Hs",
    "I",
    "In",
    "Ir",
    "K",
    "Kr",
    "La",
    "Li",
    "Lr",
    "Lu",
    "Lv",
    "Mc",
    "Md",
    "Mg",
    "Mn",
    "Mo",
    "Mt",
    "N",
    "Na",
    "Nb",
    "Nd",
    "Ne",
    "Nh",
    "Ni",
    "No",
    "Np",
    "O",
    "Og",
    "Os",
    "P",
    "Pa",
    "Pb",
    "Pd",
    "Pm",
    "Po",
    "Pr",
    "Pt",
    "Pu",
    "Ra",
    "Rb",
    "Re",
    "Rf",
    "Rg",
    "Rh",
    "Rn",
    "Ru",
    "S",
    "Sb",
    "Sc",
    "Se",
    "Sg",
    "Si",
    "Sm",
    "Sn",
    "Sr",
    "Ta",
    "Tb",
    "Tc",
    "Te",
    "Th",
    "Ti",
    "Tl",
    "Tm",
    "Ts",
    "U",
    # Retired systematic placeholders for Og, Mc, Ts and Nh respectively.
    "Uuo",
    "Uup",
    "Uus",
    "Uut",
    "V",
    "W",
    "Xe",
    "Y",
    "Yb",
    "Zn",
    "Zr",
    # Lower-case aromatic atom symbols.
    "b",
    "c",
    "n",
    "o",
    "p",
    "s",
}
49 changes: 49 additions & 0 deletions src/drug_discovery/structures/ligand.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@

from deeporigin.drug_discovery.constants import LIGANDS_DIR, SUPPORTED_ATOM_SYMBOLS
from deeporigin.drug_discovery.utilities.visualize import jupyter_visualization
from deeporigin.drug_discovery.validation import (
is_smiles_like,
is_valid_smiles,
matches_mol_rules,
)
from deeporigin.exceptions import DeepOriginException
from deeporigin.platform.client import DeepOriginClient
from deeporigin.utils.constants import number
Expand Down Expand Up @@ -593,9 +598,31 @@ def __post_init__(self):
"mol must be provided when initializing from an identifier, file path, SMILES string, or block content."
)

# Validate SMILES if provided before processing
if self.smiles is not None and not is_valid_smiles(self.smiles):
raise DeepOriginException(
f"Invalid SMILES string provided: {self.smiles}"
) from None

self.process_mol()
self.smiles = Chem.MolToSmiles(Chem.RemoveHs(self.mol), canonical=True)

# Validate the generated SMILES
if not is_valid_smiles(self.smiles):
raise DeepOriginException(
f"Generated SMILES string is invalid: {self.smiles}"
) from None

if not is_smiles_like(self.smiles):
raise DeepOriginException(
f"Generated SMILES string does not match SMILES pattern: {self.smiles}"
) from None

if not matches_mol_rules(self.smiles):
raise DeepOriginException(
f"SMILES string does not match basic molecular rules: {self.smiles}"
) from None

if not self.mol.GetConformers():
AllChem.Compute2DCoords(self.mol)

Expand Down Expand Up @@ -1088,6 +1115,28 @@ class LigandSet:
ligands: list[Ligand] = field(default_factory=list)
network: dict = field(default_factory=dict)

def __post_init__(self):
    """Validate every entry of ``self.ligands`` after construction.

    Each entry must be a ``Ligand``. When an entry carries a SMILES string
    (``smiles`` is not ``None``), that string must pass ``is_valid_smiles``,
    ``is_smiles_like`` and ``matches_mol_rules``, checked in that order.

    Raises:
        DeepOriginException: On the first entry that fails a check; the
            message names the offending index.
    """
    for position, entry in enumerate(self.ligands):
        if not isinstance(entry, Ligand):
            raise DeepOriginException(
                f"All items in LigandSet must be Ligand instances, "
                f"but found {type(entry).__name__} at index {position}"
            )

        smiles = entry.smiles
        if smiles is None:
            # Nothing to validate for ligands without a SMILES string.
            continue

        if not is_valid_smiles(smiles):
            raise DeepOriginException(
                f"Ligand at index {position} has invalid SMILES: {smiles}"
            )

        if not is_smiles_like(smiles):
            raise DeepOriginException(
                f"Ligand at index {position} SMILES does not match SMILES pattern: {smiles}"
            )

        if not matches_mol_rules(smiles):
            raise DeepOriginException(
                f"Ligand at index {position} SMILES does not match basic molecular rules: {smiles}"
            )

def __len__(self):
    """Return the number of ligands held in ``self.ligands``."""
    return len(self.ligands)

Expand Down
141 changes: 141 additions & 0 deletions src/drug_discovery/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""This module contains functions for validating Ligands and Proteins."""

import re
from typing import Any

from beartype import beartype
from rdkit import Chem

from deeporigin.drug_discovery.constants import ELEMENT_SYMBOLS

# https://github.dev/mcs07/ChemDataExtractor
#
# Anchored pattern used as a cheap textual test for SMILES-looking strings.
#
# * First group (required): the string must begin with an atom, i.e. one of
#   the single-letter symbols ``B C N O P S F I b c n o s p *``, ``Cl``,
#   ``Br``, or a bracket atom ``[...]``.
# * Bracket atom: optional leading digits, then a symbol taken from
#   ELEMENT_SYMBOLS (or ``se`` / ``as`` / ``*``), then optionally an ``@``-style
#   tag, an ``H`` count, a ``+``/``-`` charge and a ``:digits`` suffix.
# * Second group (repeated): further atoms of the same form, or any single
#   character from ``- = # $ : \ / ( ) % . +`` or a digit.
#
# ``%(e)s`` is filled in by the ``%`` formatting at the end with the
# ``|``-joined element symbols; ``%%`` is the escape that yields a literal
# ``%`` in the final character class.
# NOTE(review): ELEMENT_SYMBOLS is a set, so the order of the alternatives in
# the compiled pattern follows set iteration order — this should not change
# what matches, but the pattern text itself is not stable across runs.
SMILES_RE = re.compile(
r"^([BCNOPSFIbcnosp*]|Cl|Br|\[\d*(%(e)s|se|as|\*)(@+([THALSPBO]\d+)?)?(H\d?)?([\-+]+\d*)?(:\d+)?\])"
r"([BCNOPSFIbcnosp*]|Cl|Br|\[\d*(%(e)s|se|as|\*)(@+([THALSPBO]\d+)?)?(H\d?)?([\-+]+\d*)?(:\d+)?\]|"
r"[\-=#$:\\/\(\)%%\.+\d])*$" % {"e": "|".join(ELEMENT_SYMBOLS)}
)


def check_brackets(text: str) -> tuple[int, list[tuple[int, str]]]:
    """Scan ``text`` for balanced ``()``, ``[]`` and ``{}`` pairs.

    Args:
        text: String to scan.

    Returns:
        A ``(level, unmatched)`` tuple:
            - If a closing bracket appears that does not pair with the most
              recent still-open bracket, scanning stops and the result is
              ``(-1, [(index, char)])`` for that closing bracket.
            - Otherwise ``level`` is the number of opening brackets left
              unclosed and ``unmatched`` lists them as ``(index, char)`` in
              order of appearance (``(0, [])`` when fully balanced).
    """
    partner_of = {")": "(", "]": "[", "}": "{"}
    openers = frozenset(partner_of.values())

    pending: list[tuple[int, str]] = []
    for position, symbol in enumerate(text):
        if symbol in openers:
            pending.append((position, symbol))
            continue

        expected_opener = partner_of.get(symbol)
        if expected_opener is None:
            # Not a bracket character at all.
            continue

        if not pending or pending[-1][1] != expected_opener:
            # Closing bracket with nothing (or the wrong thing) to close.
            return -1, [(position, symbol)]
        pending.pop()

    return len(pending), pending


@beartype
def is_valid_smiles(smiles: Any) -> bool:
    """Check if a value is a valid, non-empty SMILES string.

    Surrounding whitespace, then single quotes, then double quotes are
    stripped from the input. What remains must be non-empty and must be
    parseable by RDKit into a molecule object.

    Args:
        smiles: Input to validate. Can be any type; anything that is not a
            string is reported as invalid.

    Returns:
        True if the input is a string that, after stripping, is non-empty and
        can be parsed by RDKit; False otherwise.
    """
    if not isinstance(smiles, str):
        return False

    candidate = smiles.strip().strip("'").strip('"')
    if not candidate:
        # Whitespace-only or quote-only input must be rejected here: RDKit may
        # hand back an empty (zero-atom) molecule instead of None for "".
        return False

    try:
        return Chem.MolFromSmiles(candidate) is not None
    except Exception:
        # Deliberately broad: any failure inside the parser means "not valid".
        return False


@beartype
def is_smiles_like(string: str) -> bool:
    """Cheap textual test for whether a string looks like SMILES.

    The input is stripped of surrounding whitespace, must be non-empty, must
    have balanced brackets according to ``check_brackets``, and must match
    ``SMILES_RE``. No chemistry toolkit is involved, so this is a pattern
    check only.

    Args:
        string: Input string to check.

    Returns:
        True if the string appears to be SMILES-like, False otherwise.
    """
    if not isinstance(string, str):
        return False

    candidate = string.strip()
    if not candidate:
        return False

    depth, _unmatched = check_brackets(candidate)
    if depth != 0:
        # Either unclosed openers (> 0) or a stray closer (-1).
        return False

    return SMILES_RE.match(candidate) is not None


@beartype
def matches_mol_rules(smiles_like_str: str) -> bool:
    """Check if a SMILES-like string matches basic molecular rules.

    This is a purely textual heuristic. The string is lower-cased and every
    ASCII letter except ``j`` is counted as one "atom" (so a two-letter symbol
    such as ``Cl`` contributes two counts, one of them to ``c``). Three rules
    are then evaluated:

    - Has at least 4 atoms
    - Has at least 2 carbon atoms
    - Common organic elements (C, N, O, S, F) make up at least 40% of atoms

    The string is accepted when it violates at most one of these rules.

    Args:
        smiles_like_str: Input SMILES-like string to validate.

    Returns:
        True if fewer than two rules are violated, False otherwise.
    """
    letters = re.findall(r"[a-ik-z]", smiles_like_str.lower())

    n_atoms = len(letters)
    n_carbon = letters.count("c")
    n_organic = sum(letters.count(symbol) for symbol in "cnosf")

    # Each entry is True when the corresponding rule is *violated*.
    violations = (
        n_atoms < 4,
        n_carbon < 2,
        # Too few organic atoms is the violation; a mostly-organic string
        # must not be penalised.
        n_organic < 0.4 * n_atoms,
    )

    return sum(violations) < 2
Loading