Skip to content

Commit 2d565d9

Browse files
authored
Merge pull request #3 from yuxuanliao/main
Add the multidimensional filtering and the CCS prediction for the same molecule with different coordinates
2 parents b9500fd + ed94a8d commit 2d565d9

5 files changed

+2137
-1
lines changed

README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ The following files are in the [others](others) folder:
8686
- [UMAPDataset.py](others/UMAPDataset.py). for generating graph datasets.
8787
- [theoretical calculation.ipynb](others/theoretical%20calculation.ipynb). investigation of the relationship between SigmaCCS and theoretical calculation
8888
- [Filtering.ipynb](others/Filtering.ipynb). Filtering of target unknown molecules based on the CCS and *mz* of the molecules
89+
- [90 and 180 degrees rotations.ipynb](others/90%20and%20180%20degrees%20rotations.ipynb). Performance on the test set with 90- and 180-degree rotations
90+
- [Completely random rotations.ipynb](others/Completely%20random%20rotations.ipynb). Performance on the test set with completely random rotations
91+
- [Visualization of the 3D conformers using different seeds of ETKDG.ipynb](others/Visualization%20of%20the%203D%20conformers%20using%20different%20seeds%20of%20ETKDG.ipynb)
92+
- [RotationMatrix.py](others/RotationMatrix.py). defines the rotation matrices used to rotate the molecules
8993
- *[CFM-ID4](others/CFM-ID4)*. the code for generating MS/MS spectra with CFM-ID 4.0.
9094
- *[GNN-RT](others/GNN-RT)*:
9195
- [README.md](others/GNN-RT/README.md)
@@ -105,7 +109,12 @@ The following files are in the [others](others) folder:
105109
- LJ_data.csv (Provides the LJ interaction parameters of the different elements)
106110
- *Coordinate data* (Stores the 3D coordinate data of all molecules in data.csv)
107111
- *Filtering data*
108-
- data.csv
112+
- data.csv
113+
- *[MultidimensionalFiltering](others/MultidimensionalFiltering)*:
114+
- *[Example.ipynb](others/MultidimensionalFiltering/Example.ipynb)*. Takes the lipid (PubChem CID: 114944) as an example to demonstrate multidimensional filtering assisted by SigmaCCS
115+
- *[LipidBlastPredCCS.xlsx](others/MultidimensionalFiltering/LipidBlastPredCCS.xlsx)*. the predicted CCS values of the lipids in LipidBlast
116+
- *[MListWithPredictedRTs_329.xlsx](others/MultidimensionalFiltering/MListWithPredictedRTs_329.xlsx)*. predicted retention times of the molecules in the MList of the lipid (PubChem CID: 114944). Please refer to the GNN-RT repository on GitHub for details of the RT prediction
117+
- *[Mouse_lung_adduct_negative.xlsx](others/MultidimensionalFiltering/Mouse_lung_adduct_negative.xlsx)*. the mouse lung dataset with 761 lipids in negative ion mode after the removal of unpredictable adducts and empty SMILES strings
109118

110119
### Package required:
111120
- [UMAP](https://github.com/lmcinnes/umap) 0.5.1
@@ -121,3 +130,4 @@ The following files are in the [`slurm`](slurm) folder
121130
- zmzhang@csu.edu.cn
122131
- youjiazhang126@163.com
123132
- renfengguo05@foxmail.com
133+
- 212311021@csu.edu.cn
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "cdedc3a0",
6+
"metadata": {},
7+
"source": [
8+
"# 90 and 180 degrees rotations"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": 1,
14+
"id": "64dcf84e",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"from RotationMatrix import *\n",
19+
"import sys\n",
20+
"sys.path.append(\"..\")\n",
21+
"from sigma.GraphData import *\n",
22+
"from sigma.model import *\n",
23+
"import pandas as pd\n",
24+
"import random\n",
25+
"from pandas import Series,DataFrame\n",
26+
"import numpy as np\n",
27+
"import math\n",
28+
"import copy\n",
29+
"from tqdm import tqdm\n",
30+
"import statistics"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 2,
36+
"id": "c1e1129f",
37+
"metadata": {},
38+
"outputs": [],
39+
"source": [
40+
"ifile = '../data/TestData.csv'\n",
41+
"ParameterPath = '../parameter/parameter.pkl'\n",
42+
"mfileh5 = '../model/model_new.h5'"
43+
]
44+
},
45+
{
46+
"cell_type": "markdown",
47+
"id": "abffdfa3",
48+
"metadata": {},
49+
"source": [
50+
"## Performance on the test set with initial position"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 3,
56+
"id": "d62bc948",
57+
"metadata": {},
58+
"outputs": [
59+
{
60+
"name": "stdout",
61+
"output_type": "stream",
62+
"text": [
63+
"The generation of 3d conformers: 100%|█████████████████▉| 558/559 [02:19<00:00, 3.99it/s]\n"
64+
]
65+
},
66+
{
67+
"name": "stderr",
68+
"output_type": "stream",
69+
"text": [
70+
"..\\sigma\\GraphData.py:187: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
71+
" features = np.asarray(features)\n",
72+
"..\\sigma\\GraphData.py:188: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
73+
" edge_features = np.asarray(edge_features)\n"
74+
]
75+
},
76+
{
77+
"name": "stdout",
78+
"output_type": "stream",
79+
"text": [
80+
"Predictions: 100%|█████████████████████████████████████▉| 558/559 [00:07<00:00, 75.44it/s]\n",
81+
"The number of molecules in the test set : 559 \n",
82+
"\n",
83+
"R2 Score : 0.9937075292150549 \n",
84+
"\n",
85+
"Median Relative Error : 1.2091052412873495 %\n"
86+
]
87+
}
88+
],
89+
"source": [
90+
"smiles, adduct, ccs = read_data(ifile)\n",
91+
"param = parameter.Parameter()\n",
92+
"with open(ParameterPath,'rb') as file:\n",
93+
" param = pickle.loads(file.read()) \n",
94+
"smiles, adduct, ccs, Coordinate = Generating_coordinates(smiles, adduct, ccs, param.All_Atoms) #559\n",
95+
"\n",
96+
"CoordinateStd = copy.deepcopy(Coordinate)\n",
97+
"for z in range(len(CoordinateStd)):\n",
98+
" CoordinateStd[z] = (np.array(CoordinateStd[z]) - param.Min_Coor) / (param.Max_Coor - param.Min_Coor)\n",
99+
" \n",
100+
"adj, features, edge_features = convertToGraph(smiles, CoordinateStd, param.All_Atoms)\n",
101+
"DataSet = MyDataset(features, adj, edge_features, ccs)\n",
102+
"\n",
103+
"ECC_Model = load_Model_from_file(mfileh5)\n",
104+
"\n",
105+
"re = predict(ECC_Model,param.adduct_SET,DataSet,adduct,)\n",
106+
"print(\"The number of molecules in the test set :\", len(re), '\\n')\n",
107+
"\n",
108+
"Sigma = re\n",
109+
"CCS = ccs\n",
110+
"SigmaPer = R2_MRE(CCS, Sigma)"
111+
]
112+
},
113+
{
114+
"cell_type": "markdown",
115+
"id": "5fe0a744",
116+
"metadata": {},
117+
"source": [
118+
"## Performance on the test set with 90 degree rotation"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": 4,
124+
"id": "852caa4a",
125+
"metadata": {},
126+
"outputs": [
127+
{
128+
"name": "stderr",
129+
"output_type": "stream",
130+
"text": [
131+
"..\\sigma\\GraphData.py:187: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
132+
" features = np.asarray(features)\n",
133+
"..\\sigma\\GraphData.py:188: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
134+
" edge_features = np.asarray(edge_features)\n"
135+
]
136+
},
137+
{
138+
"name": "stdout",
139+
"output_type": "stream",
140+
"text": [
141+
"Predictions: 100%|█████████████████████████████████████▉| 558/559 [00:07<00:00, 75.13it/s]\n",
142+
"R2 Score : 0.9936971125028244 \n",
143+
"\n",
144+
"Median Relative Error : 1.1653698978192344 %\n"
145+
]
146+
}
147+
],
148+
"source": [
149+
"CoordinateRotate = copy.deepcopy(Coordinate)\n",
150+
"for k in range(len(CoordinateRotate)):\n",
151+
" thetax = 0.25* math.pi\n",
152+
" thetay = 0.25* math.pi\n",
153+
" thetaz = 0.25* math.pi\n",
154+
" for ii in range(len(CoordinateRotate[k])):\n",
155+
" CoordinateRotate[k][ii] = CoordinateRotate[k][ii] @Rx(thetax) @Ry(thetay) @Rz(thetaz)\n",
156+
"for i in range(len(CoordinateRotate)):\n",
157+
" CoordinateRotate[i] = (np.array(CoordinateRotate[i]) - param.Min_Coor) / (param.Max_Coor - param.Min_Coor)\n",
158+
"\n",
159+
"adj, features, edge_features = convertToGraph(smiles, CoordinateRotate, param.All_Atoms)\n",
160+
"DataSet = MyDataset(features, adj, edge_features, ccs)\n",
161+
"\n",
162+
"ECC_Model = load_Model_from_file(mfileh5)\n",
163+
"\n",
164+
"re = predict(ECC_Model,param.adduct_SET,DataSet,adduct,)\n",
165+
"\n",
166+
"Sigma = re\n",
167+
"CCS = ccs\n",
168+
"SigmaPer = R2_MRE(CCS, Sigma)"
169+
]
170+
},
171+
{
172+
"cell_type": "markdown",
173+
"id": "8a30e39c",
174+
"metadata": {},
175+
"source": [
176+
"## Performance on the test set with 180 degree rotation"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": 5,
182+
"id": "ef24f01e",
183+
"metadata": {},
184+
"outputs": [
185+
{
186+
"name": "stderr",
187+
"output_type": "stream",
188+
"text": [
189+
"..\\sigma\\GraphData.py:187: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
190+
" features = np.asarray(features)\n",
191+
"..\\sigma\\GraphData.py:188: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
192+
" edge_features = np.asarray(edge_features)\n"
193+
]
194+
},
195+
{
196+
"name": "stdout",
197+
"output_type": "stream",
198+
"text": [
199+
"Predictions: 100%|█████████████████████████████████████▉| 558/559 [00:07<00:00, 75.23it/s]\n",
200+
"R2 Score : 0.9935965379786625 \n",
201+
"\n",
202+
"Median Relative Error : 1.2743489126141125 %\n"
203+
]
204+
}
205+
],
206+
"source": [
207+
"CoordinateRotate = copy.deepcopy(Coordinate)\n",
208+
"for k in range(len(CoordinateRotate)):\n",
209+
" thetax = 0.5*math.pi\n",
210+
" thetay = 0.5*math.pi\n",
211+
" thetaz = 0.5*math.pi\n",
212+
" for ii in range(len(CoordinateRotate[k])):\n",
213+
" CoordinateRotate[k][ii] = CoordinateRotate[k][ii] @Rx(thetax) @Ry(thetay) @Rz(thetaz)\n",
214+
"for i in range(len(CoordinateRotate)):\n",
215+
" CoordinateRotate[i] = (np.array(CoordinateRotate[i]) - param.Min_Coor) / (param.Max_Coor - param.Min_Coor)\n",
216+
"\n",
217+
"adj, features, edge_features = convertToGraph(smiles, CoordinateRotate, param.All_Atoms)\n",
218+
"DataSet = MyDataset(features, adj, edge_features, ccs)\n",
219+
"\n",
220+
"ECC_Model = load_Model_from_file(mfileh5)\n",
221+
"\n",
222+
"re = predict(ECC_Model,param.adduct_SET,DataSet,adduct,)\n",
223+
"\n",
224+
"Sigma = re\n",
225+
"CCS = ccs\n",
226+
"SigmaPer = R2_MRE(CCS, Sigma)"
227+
]
228+
}
229+
],
230+
"metadata": {
231+
"kernelspec": {
232+
"display_name": "Python 3 (ipykernel)",
233+
"language": "python",
234+
"name": "python3"
235+
},
236+
"language_info": {
237+
"codemirror_mode": {
238+
"name": "ipython",
239+
"version": 3
240+
},
241+
"file_extension": ".py",
242+
"mimetype": "text/x-python",
243+
"name": "python",
244+
"nbconvert_exporter": "python",
245+
"pygments_lexer": "ipython3",
246+
"version": "3.7.7"
247+
}
248+
},
249+
"nbformat": 4,
250+
"nbformat_minor": 5
251+
}

0 commit comments

Comments
 (0)