forked from ishanigan/REINVENT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
debug_data_structs.py
44 lines (38 loc) · 1.34 KB
/
debug_data_structs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# %%
import numpy as np
import random
import re
import pickle
from rdkit import Chem
import sys
import time
import torch
from torch.utils.data import Dataset
from utils import Variable
# %%
def filter_mol(mol, max_heavy_atoms=50, min_heavy_atoms=10,
               element_list=(6, 7, 8, 9, 16, 17, 35)):
    """Filters molecules on number of heavy atoms and atom types.

    Args:
        mol: Molecule object exposing ``GetNumHeavyAtoms()`` and
            ``GetAtoms()`` (each atom exposing ``GetAtomicNum()``), or
            ``None`` when the input could not be parsed.
        max_heavy_atoms: Exclusive upper bound on the heavy-atom count.
        min_heavy_atoms: Exclusive lower bound on the heavy-atom count.
        element_list: Atomic numbers that are allowed in the molecule
            (default: C, N, O, F, S, Cl, Br).

    Returns:
        ``True`` when the heavy-atom count lies strictly between the two
        bounds and every atom's atomic number is in ``element_list``;
        ``False`` otherwise, including when ``mol`` is ``None``.
    """
    if mol is None:
        # Reject explicitly so the function always returns a real bool
        # instead of silently falling through with an implicit None.
        return False

    heavy_count = mol.GetNumHeavyAtoms()
    num_heavy = min_heavy_atoms < heavy_count < max_heavy_atoms
    elements = all(atom.GetAtomicNum() in element_list for atom in mol.GetAtoms())

    # Debug trace of the two criteria for each molecule inspected.
    print('heavy: ' + str(heavy_count))
    print('elements' + str(elements))

    return num_heavy and elements
def canonicalize_smiles_from_file(fname):
    """Reads a SMILES file and returns a list of RDKIT SMILES.

    The first whitespace-delimited token of every line is taken as the
    SMILES string, parsed with ``Chem.MolFromSmiles``, and kept (re-written
    through ``Chem.MolToSmiles``) only when ``filter_mol`` accepts the
    resulting molecule. Progress and every candidate SMILES are printed.

    Args:
        fname: Path of the text file to read, one molecule per line.

    Returns:
        List of the SMILES strings produced by ``Chem.MolToSmiles`` for the
        molecules that passed ``filter_mol``, in file order.
    """
    smiles_list = []
    with open(fname, 'r') as f:
        for i, line in enumerate(f):
            if i % 100000 == 0:
                print("{} lines processed.".format(i))
            # Split on any whitespace (space, tab, newline) so the token never
            # carries a trailing newline or a tab-separated molecule name.
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            smiles = fields[0]
            print(smiles)
            mol = Chem.MolFromSmiles(smiles)
            if filter_mol(mol):
                smiles_list.append(Chem.MolToSmiles(mol))
    print("{} SMILES retrieved".format(len(smiles_list)))
    return smiles_list
# Debug driver: run the canonicalisation over the local 'mols.smi' file so the
# per-molecule trace is printed; the returned list is not stored.
canonicalize_smiles_from_file(fname='mols.smi')