deeporiginbio · sg-s · Jan 21, 2026 · Jan 21, 2026
@@ -129,12 +129,11 @@ from deeporigin.drug_discovery import Ligand
 from rdkit import Chem
 
 # Create an RDKit molecule
-mol = Chem.MolFromSmiles("CCO")  # Ethanol
+mol = Chem.MolFromSmiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O")  
 
 # Convert to a Ligand
 ligand = Ligand.from_rdkit_mol(
     mol=mol,
-    name="Ethanol",  # Optional name for the ligand
 )
 ```
 
@@ -146,7 +145,7 @@ You can also create a `LigandSet` from a list of RDKit molecules:
 from deeporigin.drug_discovery import LigandSet
 from rdkit import Chem
 
-mols = [Chem.MolFromSmiles("CCO"), Chem.MolFromSmiles("CCCO")]
+mols = [Chem.MolFromSmiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O"), Chem.MolFromSmiles("CCCC")]
 ligands = LigandSet.from_rdkit_mols(mols)
 ```
 

@@ -28,7 +28,7 @@ Example:
 ```python
 from deeporigin.drug_discovery.structures import Ligand
 
-lig = Ligand.from_smiles("CCO", name="Ethanol")
+lig = Ligand.from_smiles("C/C=C/Cn1cc(-c2cccc(C(=O)N(C)C)c2)c2cc[nH]c2c1=O")
 lig.prepare()  # Preserves hydrogens by default
 lig.prepare(remove_hydrogens=True)  # Remove hydrogens from SMILES
 ```
@@ -152,3 +152,130 @@
 Ligands containing atoms outside this set should be rejected by preparation
 utilities.
 """
+
+
+# Atom symbols recognised inside SMILES bracket atoms: the 118 element
+# symbols plus the lowercase forms used for aromatic atoms.
+ELEMENT_SYMBOLS = set(
+    (
+        "Ac Ag Al Am Ar As At Au B Ba Be Bh Bi Bk Br C Ca Cd Ce Cf Cl Cm Cn Co "
+        "Cr Cs Cu Db Ds Dy Er Es Eu F Fe Fl Fm Fr Ga Gd Ge H He Hf Hg Ho Hs I "
+        "In Ir K Kr La Li Lr Lu Lv Mc Md Mg Mn Mo Mt N Na Nb Nd Ne Nh Ni No Np "
+        "O Og Os P Pa Pb Pd Pm Po Pr Pt Pu Ra Rb Re Rf Rg Rh Rn Ru S Sb Sc Se "
+        "Sg Si Sm Sn Sr Ta Tb Tc Te Th Ti Tl Tm Ts U V W Xe Y Yb Zn Zr "
+        # Pre-2016 placeholder symbols for elements 118, 115, 117 and 113
+        # (now Og, Mc, Ts, Nh); kept so existing callers keep working.
+        "Uuo Uup Uus Uut "
+        # Lowercase aromatic symbols.
+        "b c n o p s"
+    ).split()
+)
@@ -22,6 +22,11 @@
 
 from deeporigin.drug_discovery.constants import LIGANDS_DIR, SUPPORTED_ATOM_SYMBOLS
 from deeporigin.drug_discovery.utilities.visualize import jupyter_visualization
+from deeporigin.drug_discovery.validation import (
+    is_smiles_like,
+    is_valid_smiles,
+    matches_mol_rules,
+)
 from deeporigin.exceptions import DeepOriginException
 from deeporigin.platform.client import DeepOriginClient
 from deeporigin.utils.constants import number
@@ -593,9 +598,31 @@ def __post_init__(self):
                 "mol must be provided when initializing from an identifier, file path, SMILES string, or block content."
             )
 
+        # Validate SMILES if provided before processing
+        if self.smiles is not None and not is_valid_smiles(self.smiles):
+            raise DeepOriginException(
+                f"Invalid SMILES string provided: {self.smiles}"
+            ) from None
+
         self.process_mol()
         self.smiles = Chem.MolToSmiles(Chem.RemoveHs(self.mol), canonical=True)
 
+        # Validate the generated SMILES
+        if not is_valid_smiles(self.smiles):
+            raise DeepOriginException(
+                f"Generated SMILES string is invalid: {self.smiles}"
+            ) from None
+
+        if not is_smiles_like(self.smiles):
+            raise DeepOriginException(
+                f"Generated SMILES string does not match SMILES pattern: {self.smiles}"
+            ) from None
+
+        if not matches_mol_rules(self.smiles):
+            raise DeepOriginException(
+                f"SMILES string does not match basic molecular rules: {self.smiles}"
+            ) from None
+
         if not self.mol.GetConformers():
             AllChem.Compute2DCoords(self.mol)
 
@@ -1088,6 +1115,28 @@ class LigandSet:
     ligands: list[Ligand] = field(default_factory=list)
     network: dict = field(default_factory=dict)
 
+    def __post_init__(self):
+        """Validate every entry of ``self.ligands``.
+
+        Each entry must be a ``Ligand``. When its ``smiles`` attribute is not
+        None, the SMILES is checked with ``is_valid_smiles``,
+        ``is_smiles_like`` and ``matches_mol_rules``, in that order.
+
+        Raises:
+            DeepOriginException: On the first entry that is not a ``Ligand``
+                or whose SMILES fails one of the checks; the message carries
+                the entry's index.
+        """
+        for i, ligand in enumerate(self.ligands):
+            if not isinstance(ligand, Ligand):
+                raise DeepOriginException(
+                    f"All items in LigandSet must be Ligand instances, "
+                    f"but found {type(ligand).__name__} at index {i}"
+                )
+
+            if ligand.smiles is None:
+                # No SMILES string to check for this ligand.
+                continue
+
+            if not is_valid_smiles(ligand.smiles):
+                raise DeepOriginException(
+                    f"Ligand at index {i} has invalid SMILES: {ligand.smiles}"
+                )
+            elif not is_smiles_like(ligand.smiles):
+                raise DeepOriginException(
+                    f"Ligand at index {i} SMILES does not match SMILES pattern: {ligand.smiles}"
+                )
+            elif not matches_mol_rules(ligand.smiles):
+                raise DeepOriginException(
+                    f"Ligand at index {i} SMILES does not match basic molecular rules: {ligand.smiles}"
+                )
+
+
     def __len__(self):
         return len(self.ligands)
 

@@ -0,0 +1,141 @@
+"""This module contains functions for validating Ligands and Proteins."""
+
+import re
+from typing import Any
+
+from beartype import beartype
+from rdkit import Chem
+
+from deeporigin.drug_discovery.constants import ELEMENT_SYMBOLS
+
+# https://github.dev/mcs07/ChemDataExtractor
+# Symbols allowed inside a bracket atom. They are sorted (longest first, then
+# alphabetically) so the compiled pattern text is the same on every run;
+# joining the set directly would follow its hash-dependent iteration order.
+# "s", "se" and "as" are lowercase aromatic symbols that may appear in brackets.
+_BRACKET_SYMBOLS = "|".join(
+    sorted(
+        set(ELEMENT_SYMBOLS) | {"s", "se", "as"},
+        key=lambda symbol: (-len(symbol), symbol),
+    )
+)
+
+# One atom: an unbracketed organic/aromatic symbol or "*", or a bracket atom
+# written as [isotope symbol chirality hcount charge class].
+_ATOM = (
+    r"[BCNOPSFIbcnosp*]|Cl|Br|"
+    r"\[\d*(%s|\*)(@+([THALSPBO]\d+)?)?(H\d?)?([\-+]+\d*)?(:\d+)?\]" % _BRACKET_SYMBOLS
+)
+
+# A SMILES-like string is one atom followed by any mix of atoms, bond symbols,
+# branch parentheses, ring-closure digits, "%" and "." separators.
+SMILES_RE = re.compile(r"^(%s)(%s|[\-=#$:\\/\(\)%%\.+\d])*$" % (_ATOM, _ATOM))
+
+
+def check_brackets(text: str) -> tuple[int, list[tuple[int, str]]]:
+    """Scan ``text`` for unbalanced ``()``, ``[]`` and ``{}`` pairs.
+
+    Args:
+        text: String to scan.
+
+    Returns:
+        A ``(nesting_level, unmatched)`` tuple:
+            - If a closing bracket does not pair with the most recent open
+              bracket, scanning stops and the result is ``-1`` together with
+              a one-item list holding that bracket's ``(index, char)``.
+            - Otherwise the number of opening brackets still open at the end
+              of the text, together with their ``(index, char)`` entries in
+              order of appearance.
+    """
+    opener_for = {")": "(", "]": "[", "}": "{"}
+    openers = frozenset(opener_for.values())
+
+    still_open: list[tuple[int, str]] = []
+    for position, symbol in enumerate(text):
+        if symbol in openers:
+            still_open.append((position, symbol))
+            continue
+
+        expected = opener_for.get(symbol)
+        if expected is None:
+            # Not a bracket character at all.
+            continue
+
+        if not still_open or still_open[-1][1] != expected:
+            # Closing bracket with nothing (or the wrong thing) to close.
+            return -1, [(position, symbol)]
+        still_open.pop()
+
+    return len(still_open), still_open
+
+
+@beartype
+def is_valid_smiles(smiles: Any) -> bool:
+    """Check if a string is a valid SMILES representation.
+
+    Surrounding whitespace and quote characters are stripped, then RDKit is
+    asked to parse what remains. Non-string input, empty strings, and strings
+    that are empty once stripped are all reported as invalid.
+
+    Args:
+        smiles: Input to validate. Can be any type, but only strings are checked.
+
+    Returns:
+        True if the input is a non-empty string that RDKit parses into a
+        molecule, False otherwise.
+    """
+    if not isinstance(smiles, str):
+        return False
+
+    cleaned = smiles.strip().strip("'").strip('"').strip()
+    if not cleaned:
+        # The emptiness check has to happen after stripping: RDKit parses an
+        # empty string into an empty molecule instead of returning None, so
+        # whitespace-only or quote-only input would otherwise look valid.
+        return False
+
+    try:
+        mol = Chem.MolFromSmiles(cleaned)
+    except Exception:
+        # Defensive: any parser-level error means "not a valid SMILES".
+        return False
+    return mol is not None
+
+
+@beartype
+def is_smiles_like(string: str) -> bool:
+    """Cheap syntactic test for whether a string resembles SMILES.
+
+    The stripped string must be non-empty, have balanced brackets according
+    to ``check_brackets``, and match ``SMILES_RE``. No molecule is built, so
+    this is a weaker check than ``is_valid_smiles``.
+
+    Args:
+        string: Input string to check.
+
+    Returns:
+        True if the string appears to be SMILES-like, False otherwise.
+    """
+    candidate = string.strip() if isinstance(string, str) else ""
+    if not candidate:
+        return False
+
+    depth, _unmatched = check_brackets(candidate)
+    # depth is 0 only when every bracket was opened and closed in order.
+    return depth == 0 and bool(SMILES_RE.match(candidate))
+
+
+@beartype
+def matches_mol_rules(smiles_like_str: str) -> bool:
+    """Check if a SMILES-like string matches basic molecular rules.
+
+    The string is lower-cased and every ASCII letter except "j" is counted as
+    one atom. This is a character count, not a parse: a two-letter symbol such
+    as "Cl" contributes two counts, and hydrogens written inside brackets are
+    counted as well.
+
+    Three rules are evaluated on those counts:
+    - Has at least 4 atoms
+    - Has at least 2 carbon atoms
+    - Common organic elements (C, N, O, S, F) make up at least 40% of atoms
+
+    A single violated rule is tolerated; two or more reject the string.
+
+    Args:
+        smiles_like_str: Input SMILES-like string to validate.
+
+    Returns:
+        True if fewer than two of the rules are violated, False otherwise.
+    """
+    letters = [
+        char for char in smiles_like_str.lower() if "a" <= char <= "z" and char != "j"
+    ]
+    n_atoms = len(letters)
+    n_carbon = letters.count("c")
+    n_organic = sum(char in "cnosf" for char in letters)
+
+    # Each term is True (adds 1) when the corresponding rule is VIOLATED, so
+    # the organic-fraction term must fire when the share is below 40%.
+    violations = (n_atoms < 4) + (n_carbon < 2) + (n_organic < 0.4 * n_atoms)
+    return violations < 2