Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/dd/how-to/proteins.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,13 @@ ligand = protein.extract_ligand()
!!! warning "Method mutates the `Protein` object"
The `extract_ligand()` method not only extracts the ligand but also **mutates the protein object** by removing the ligand from the protein structure. This means that after calling this method, the protein will no longer contain the ligand atoms.

!!! note "Water molecules are automatically excluded"
The `extract_ligand()` method automatically filters out water molecules (HOH, WAT, H2O) and other excluded residue names. You can customize which residues to exclude by passing the `exclude_resnames` parameter:

```python
# Exclude custom residue names
ligand = protein.extract_ligand(exclude_resnames={"HOH", "CUSTOM_RES"})
```

!!! note "Note about atom counts"
In this example, the atom count might not change significantly because the ligand atoms are typically a small fraction of the total protein structure. However, the protein's internal structure and `block_content` are updated to exclude the ligand. Additionally, the PDB file's MASTER record is automatically updated to reflect the new atom and CONECT record counts.
Expand Down
97 changes: 64 additions & 33 deletions src/drug_discovery/structures/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,49 +677,80 @@ def extract_ligand(self, exclude_resnames: Optional[set[str]] = None) -> Ligand:
from rdkit import Chem

if exclude_resnames is None:
exclude_resnames = {"HOH"}
exclude_resnames = {"HOH", "WAT", "H2O"}
else:
# Normalize to uppercase for comparison
exclude_resnames = {resname.upper() for resname in exclude_resnames}

ligand_lines = []
conect_lines = []
ligand_resnames = set()
ligand_atom_serials = set()

# For CIF files, we need to convert to PDB format first to extract HETATM records
if self.block_type == "cif":
# Convert CIF to PDB format temporarily
with tempfile.NamedTemporaryFile(
mode="w", suffix=".pdb", delete=False
) as temp_file:
temp_pdb_path = temp_file.name
try:
self.to_pdb(temp_pdb_path)
with open(temp_pdb_path, "r") as f:
content_lines = list(f)
finally:
# Clean up temporary file
if os.path.exists(temp_pdb_path):
os.remove(temp_pdb_path)
elif self.block_content:
# Split by newline and add newlines back to match file reading behavior
content_lines = [line + "\n" for line in self.block_content.split("\n")]
elif self.file_path:
# Read file line by line to preserve newlines
with open(self.file_path, "r") as f:
content_lines = list(f)
else:
raise ValueError(
"No block_content or file_path available to extract ligand from."
)

# First pass: collect HETATM lines and their residue names
with open(self.file_path, "r") as f:
for line in f:
if line.startswith("HETATM"):
resname = line[17:20].strip()
altloc = line[16].strip()
if resname in exclude_resnames:
continue
for line in content_lines:
if line.startswith("HETATM"):
# Extract resname and normalize to uppercase for comparison
resname = line[17:20].strip().upper()
altloc = line[16].strip()

if altloc not in ("", "A"): # skip altLocs other than primary
continue
ligand_lines.append(line)
ligand_resnames.add(resname)
# Store atom serial for later removal from structure
try:
atom_serial = int(line[6:11].strip())
ligand_atom_serials.add(atom_serial)
except ValueError:
continue
# Skip excluded residue names (e.g., water)
if resname in exclude_resnames:
continue

if altloc not in ("", "A"): # skip altLocs other than primary
continue
ligand_lines.append(line)
ligand_resnames.add(resname)
# Store atom serial for later removal from structure
try:
atom_serial = int(line[6:11].strip())
ligand_atom_serials.add(atom_serial)
except ValueError:
continue

# Second pass: collect CONECT records for the ligand atoms
with open(self.file_path, "r") as f:
for line in f:
if line.startswith("CONECT"):
try:
atom1 = int(line[6:11].strip())
# Check if this CONECT involves any ligand atoms
# We'll need to check against the atom serial numbers in our HETATM records
for hetatm_line in ligand_lines:
hetatm_atom_serial = int(hetatm_line[6:11].strip())
if atom1 == hetatm_atom_serial:
conect_lines.append(line)
break
except ValueError:
# Skip malformed CONECT records
continue
for line in content_lines:
if line.startswith("CONECT"):
try:
atom1 = int(line[6:11].strip())
# Check if this CONECT involves any ligand atoms
# We'll need to check against the atom serial numbers in our HETATM records
for hetatm_line in ligand_lines:
hetatm_atom_serial = int(hetatm_line[6:11].strip())
if atom1 == hetatm_atom_serial:
conect_lines.append(line)
break
except ValueError:
# Skip malformed CONECT records
continue

if not ligand_lines:
raise ValueError("No ligand HETATM records found in the PDB.")
Expand Down
Loading