deeporiginbio · sg-s · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
@@ -214,6 +214,13 @@ ligand = protein.extract_ligand()
 !!! warning "Method mutates the `Protein` object"
     The `extract_ligand()` method not only extracts the ligand but also **mutates the protein object** by removing the ligand from the protein structure. This means that after calling this method, the protein will no longer contain the ligand atoms.
 
+!!! note "Water molecules are automatically excluded"
+    The `extract_ligand()` method automatically filters out water molecules (HOH, WAT, H2O) and other excluded residue names. You can customize which residues to exclude by passing the `exclude_resnames` parameter:
+
+    ```python
+    # Exclude custom residue names
+    ligand = protein.extract_ligand(exclude_resnames={"HOH", "CUSTOM_RES"})
+    ```
 
 !!! note "Note about atom counts"
     In this example, the atom count might not change significantly because the ligand atoms are typically a small fraction of the total protein structure. However, the protein's internal structure and `block_content` are updated to exclude the ligand. Additionally, the PDB file's MASTER record is automatically updated to reflect the new atom and CONECT record counts.

@@ -677,49 +677,80 @@ def extract_ligand(self, exclude_resnames: Optional[set[str]] = None) -> Ligand:
         from rdkit import Chem
 
         if exclude_resnames is None:
-            exclude_resnames = {"HOH"}
+            exclude_resnames = {"HOH", "WAT", "H2O"}
+        else:
+            # Normalize to uppercase for comparison
+            exclude_resnames = {resname.upper() for resname in exclude_resnames}
 
         ligand_lines = []
         conect_lines = []
         ligand_resnames = set()
         ligand_atom_serials = set()
 
+        # For CIF files, we need to convert to PDB format first to extract HETATM records
+        if self.block_type == "cif":
+            # Convert CIF to PDB format temporarily
+            with tempfile.NamedTemporaryFile(
+                mode="w", suffix=".pdb", delete=False
+            ) as temp_file:
+                temp_pdb_path = temp_file.name
+            try:
+                self.to_pdb(temp_pdb_path)
+                with open(temp_pdb_path, "r") as f:
+                    content_lines = list(f)
+            finally:
+                # Clean up temporary file
+                if os.path.exists(temp_pdb_path):
+                    os.remove(temp_pdb_path)
+        elif self.block_content:
+            # Split by newline and add newlines back to match file reading behavior
+            content_lines = [line + "\n" for line in self.block_content.split("\n")]
+        elif self.file_path:
+            # Read file line by line to preserve newlines
+            with open(self.file_path, "r") as f:
+                content_lines = list(f)
+        else:
+            raise ValueError(
+                "No block_content or file_path available to extract ligand from."
+            )
+
         # First pass: collect HETATM lines and their residue names
-        with open(self.file_path, "r") as f:
-            for line in f:
-                if line.startswith("HETATM"):
-                    resname = line[17:20].strip()
-                    altloc = line[16].strip()
-                    if resname in exclude_resnames:
-                        continue
+        for line in content_lines:
+            if line.startswith("HETATM"):
+                # Extract resname and normalize to uppercase for comparison
+                resname = line[17:20].strip().upper()
+                altloc = line[16].strip()
 
-                    if altloc not in ("", "A"):  # skip altLocs other than primary
-                        continue
-                    ligand_lines.append(line)
-                    ligand_resnames.add(resname)
-                    # Store atom serial for later removal from structure
-                    try:
-                        atom_serial = int(line[6:11].strip())
-                        ligand_atom_serials.add(atom_serial)
-                    except ValueError:
-                        continue
+                # Skip excluded residue names (e.g., water)
+                if resname in exclude_resnames:
+                    continue
+
+                if altloc not in ("", "A"):  # skip altLocs other than primary
+                    continue
+                ligand_lines.append(line)
+                ligand_resnames.add(resname)
+                # Store atom serial for later removal from structure
+                try:
+                    atom_serial = int(line[6:11].strip())
+                    ligand_atom_serials.add(atom_serial)
+                except ValueError:
+                    continue
 
         # Second pass: collect CONECT records for the ligand atoms
-        with open(self.file_path, "r") as f:
-            for line in f:
-                if line.startswith("CONECT"):
-                    try:
-                        atom1 = int(line[6:11].strip())
-                        # Check if this CONECT involves any ligand atoms
-                        # We'll need to check against the atom serial numbers in our HETATM records
-                        for hetatm_line in ligand_lines:
-                            hetatm_atom_serial = int(hetatm_line[6:11].strip())
-                            if atom1 == hetatm_atom_serial:
-                                conect_lines.append(line)
-                                break
-                    except ValueError:
-                        # Skip malformed CONECT records
-                        continue
+        for line in content_lines:
+            if line.startswith("CONECT"):
+                try:
+                    atom1 = int(line[6:11].strip())
+                    # Check if this CONECT involves any ligand atoms
+                    # We'll need to check against the atom serial numbers in our HETATM records
+                    for hetatm_line in ligand_lines:
+                        hetatm_atom_serial = int(hetatm_line[6:11].strip())
+                        if atom1 == hetatm_atom_serial:
+                            conect_lines.append(line)
+                            break
+                except ValueError:
+                    # Skip malformed CONECT records
+                    continue
 
         if not ligand_lines:
             raise ValueError("No ligand HETATM records found in the PDB.")