-
Notifications
You must be signed in to change notification settings - Fork 26
/
ProteinToDNADriver.java
399 lines (340 loc) · 18.5 KB
/
ProteinToDNADriver.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package org.twentyn.proteintodna;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.Seq;
import com.act.biointerpretation.metadata.ProteinMetadata;
import com.act.biointerpretation.metadata.RankPathway;
import com.act.reachables.Cascade;
import com.act.reachables.ReactionPath;
import com.act.utils.CLIUtil;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoInternalException;
import com.mongodb.ServerAddress;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.mongojack.DBCursor;
import org.mongojack.DBQuery;
import org.mongojack.JacksonDBCollection;
import org.mongojack.WriteResult;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class ProteinToDNADriver {
private static final Logger LOGGER = LogManager.getFormatterLogger(ProteinToDNADriver.class);
private static final String DEFAULT_DB_HOST = "localhost";
private static final String DEFAULT_DB_PORT = "27017";
private static final String DEFAULT_OUTPUT_DB_NAME = "wiki_reachables";
private static final String DEFAULT_INPUT_DB_NAME = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09";
public static final String DEFAULT_INPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_jarvis";
public static final String DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "pathways_vijay";
public static final String DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME = "SHOULD_COME_FROM_CMDLINE"; // "dna_designs";
private static final String OPTION_DB_HOST = "H";
private static final String OPTION_DB_PORT = "p";
private static final String OPTION_OUTPUT_DB_NAME = "o";
private static final String OPTION_INPUT_DB_NAME = "i";
private static final String OPTION_INPUT_PATHWAY_COLLECTION_NAME = "c";
private static final String OPTION_OUTPUT_PATHWAY_COLLECTION_NAME = "d";
private static final String OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME = "e";
private static final String OPTION_DESIGN_SOME = "m";
private static final Integer HIGHEST_SCORING_INFERRED_SEQ_INDEX = 0;
private static final Set<String> BLACKLISTED_WORDS_IN_INFERRED_SEQ = new HashSet<>(Arrays.asList("Fragment"));
private static final Pattern REGEX_ID = Pattern.compile("^\\d+$");
private static final Pattern REGEX_INCHI = Pattern.compile("^InChI=1S?/");
// Based on https://en.wikipedia.org/wiki/International_Chemical_Identifier#InChIKey
private static final Pattern REGEX_INCHI_KEY = Pattern.compile("^[A-Z]{14}-[A-Z]{10}-[A-Z]$");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_DB_HOST)
.argName("hostname")
.desc(String.format("The DB host to which to connect (default: %s)", DEFAULT_DB_HOST))
.hasArg()
.longOpt("db-host")
);
add(Option.builder(OPTION_DB_PORT)
.argName("port")
.desc(String.format("The DB port to which to connect (default: %s)", DEFAULT_DB_PORT))
.hasArg()
.longOpt("db-port")
);
add(Option.builder(OPTION_OUTPUT_DB_NAME)
.argName("output-db-name")
.desc(String.format("The name of the database to read pathways and write seqs to (default: %s)", DEFAULT_OUTPUT_DB_NAME))
.hasArg()
.longOpt("output-db-name")
);
add(Option.builder(OPTION_INPUT_DB_NAME)
.argName("input-db-name")
.desc(String.format("The name of the database to read reactions from (default: %s)", DEFAULT_INPUT_DB_NAME))
.hasArg()
.required()
.longOpt("source-db-name")
);
add(Option.builder(OPTION_INPUT_PATHWAY_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the input pathway collection to read from (default: %s)", DEFAULT_INPUT_PATHWAY_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("input-pathway-collection")
);
add(Option.builder(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the output pathway collection to write to (default: %s)", DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("output-pathway-collection")
);
add(Option.builder(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME)
.argName("collection-name")
.desc(String.format("The name of the output dna seq collection to write to (default: %s)", DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME))
.hasArg()
.required()
.longOpt("output-dna-seq-collection")
);
add(Option.builder(OPTION_DESIGN_SOME)
.argName("molecule")
.desc("Generate designs for specified molecules; can be a numeric ids, InChIs, or InChI Keys, *separated by '|'* " +
"for InChI compatibility. Molecules must exist in chemical source DB if InChI or InChI Key is used.")
.hasArgs().valueSeparator('|')
.longOpt("design-this")
);
}};
public static final String HELP_MESSAGE =
"This class is the driver to extract protein sequences from pathways and construct DNA designs from these proteins.";
private static final CLIUtil CLI_UTIL = new CLIUtil(ProteinToDNADriver.class, HELP_MESSAGE, OPTION_BUILDERS);
/**
* This function gets all protein combinations of a pathway from candidate protein sequences from each reaction on
* the pathway.
* @param listOfSetOfProteinSequences A list of sets of candidate protein sequences in the pathway
* @return A set of all possible combinations of proteins from all the reactions in the pathway.
*/
public static Set<List<String>> makePermutations(List<Set<String>> listOfSetOfProteinSequences) {
Set<List<String>> accum = new HashSet<>();
makePermutationsHelper(accum, listOfSetOfProteinSequences, new ArrayList<>());
return accum;
}
private static void makePermutationsHelper(Set<List<String>> accum, List<Set<String>> input, List<String> prefix) {
// Base case: no more sequences to add. Accumulate and return.
if (input.isEmpty()) {
accum.add(prefix);
return;
}
// Recursive case: iterate through next level of input sequences, appending each to a prefix and recurring.
Set<String> head = input.get(0);
// Avoid index out of bounds exception.
List<Set<String>> rest = input.size() > 1 ? input.subList(1, input.size()) : Collections.emptyList();
for (String next : head) {
List<String> newPrefix = new ArrayList<>(prefix);
newPrefix.add(next);
makePermutationsHelper(accum, rest, newPrefix);
}
}
public static void main(String[] args) throws Exception {
CommandLine cl = CLI_UTIL.parseCommandLine(args);
String reactionDbName = cl.getOptionValue(OPTION_INPUT_DB_NAME, DEFAULT_INPUT_DB_NAME);
String dbHost = cl.getOptionValue(OPTION_DB_HOST, DEFAULT_DB_HOST);
Integer dbPort = Integer.valueOf(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_DB_PORT));
MongoDB reactionDB = new MongoDB(dbHost, dbPort, reactionDbName);
MongoClient inputClient = new MongoClient(new ServerAddress(dbHost, dbPort));
String outDB = cl.getOptionValue(OPTION_OUTPUT_DB_NAME, DEFAULT_OUTPUT_DB_NAME);
DB db = inputClient.getDB(outDB);
String inputPathwaysCollectionName = cl.getOptionValue(OPTION_INPUT_PATHWAY_COLLECTION_NAME, DEFAULT_INPUT_PATHWAY_COLLECTION_NAME);
String outputPathwaysCollectionName = cl.getOptionValue(OPTION_OUTPUT_PATHWAY_COLLECTION_NAME, DEFAULT_OUTPUT_PATHWAY_COLLECTION_NAME);
String outputDnaDeqCollectionName = cl.getOptionValue(OPTION_OUTPUT_DNA_SEQ_COLLECTION_NAME, DEFAULT_OUTPUT_DNA_SEQ_COLLECTION_NAME);
JacksonDBCollection<ReactionPath, String> inputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(inputPathwaysCollectionName), ReactionPath.class, String.class);
JacksonDBCollection<DNADesign, String> dnaDesignCollection = JacksonDBCollection.wrap(db.getCollection(outputDnaDeqCollectionName), DNADesign.class, String.class);
JacksonDBCollection<ReactionPath, String> outputPathwayCollection = JacksonDBCollection.wrap(db.getCollection(outputPathwaysCollectionName), ReactionPath.class, String.class);
Map<String, Set<ProteinInformation>> proteinSeqToOrgInfo = new HashMap<>();
ProteinsToDNA2 p2d = ProteinsToDNA2.initiate();
List<Long> reachableIds = cl.hasOption(OPTION_DESIGN_SOME) ?
Arrays.stream(cl.getOptionValues(OPTION_DESIGN_SOME)).
map(m -> lookupMolecule(reactionDB, m)).collect(Collectors.toList()) :
Collections.emptyList();
// If there is a list of molecules for which to create designs, only consider their pathways. Otherwise, find all.
DBObject query = reachableIds.isEmpty() ? new BasicDBObject() : DBQuery.in("target", reachableIds);
/* Extract all pathway ids, then read pathways one id at a time. This will reduce the likelihood of the cursor
* timing out, which has a tendency to happen when doing expensive operations before advancing (as done here). */
DBCursor<ReactionPath> cursor = inputPathwayCollection.find(query, new BasicDBObject("_id", true));
List<String> pathwayIds = new ArrayList<>();
while (cursor.hasNext()) {
pathwayIds.add(cursor.next().getId());
}
for (String pathwayId : pathwayIds) {
ReactionPath reactionPath = inputPathwayCollection.findOne(DBQuery.is("_id", pathwayId));
List<List<Pair<ProteinMetadata, Integer>>> processedP = RankPathway.processSinglePathAsJava(reactionPath, reactionDbName, outDB);
if (processedP == null) {
LOGGER.info(String.format("Process pathway was filtered out possibly because there were more than %s seqs for a given pathway",
RankPathway.MAX_PROTEINS_PER_PATH()));
outputPathwayCollection.insert(reactionPath);
continue;
}
Boolean atleastOneSeqMissingInPathway = false;
List<Set<String>> proteinPaths = new ArrayList<>();
for (Cascade.NodeInformation nodeInformation :
reactionPath.getPath().stream().filter(nodeInfo -> nodeInfo.getIsReaction()).collect(Collectors.toList())) {
Set<String> proteinSeqs = new HashSet<>();
for (Long id : nodeInformation.getReactionIds()) {
// If the id is negative, it is a reaction in the reverse direction. Moreover, the enzyme for this reverse
// reaction is the same, so can use the actual positive reaction id's protein seq reference.
// TODO: Add a preference for the positive forward direction compared to the negative backward direction seq.
if (id < 0) {
LOGGER.info("Found a negative reaction id", id);
id = Reaction.reverseID(id);
}
Reaction reaction = reactionDB.getReactionFromUUID(id);
for (JSONObject data : reaction.getProteinData()) {
// Get the sequences
if (data.has("sequences")) {
JSONArray seqs = data.getJSONArray("sequences");
for (int i = 0; i < seqs.length(); i++) {
Long s = seqs.getLong(i);
if (s != null) {
Seq sequenceInfo = reactionDB.getSeqFromID(s);
String dnaSeq = sequenceInfo.getSequence();
if (dnaSeq == null) {
LOGGER.info(String.format("Sequence string for seq id %d, reaction id %d and reaction path %s is null",
s, id, reactionPath.getId()));
continue;
}
// odd sequence
if (dnaSeq.length() <= 80 || dnaSeq.charAt(0) != 'M') {
JSONObject metadata = sequenceInfo.getMetadata();
if (!metadata.has("inferred_sequences") || metadata.getJSONArray("inferred_sequences").length() == 0) {
continue;
}
JSONArray inferredSequences = metadata.getJSONArray("inferred_sequences");
// get the first inferred sequence since it has the highest hmmer score
JSONObject object = inferredSequences.getJSONObject(HIGHEST_SCORING_INFERRED_SEQ_INDEX);
for (String blacklistWord : BLACKLISTED_WORDS_IN_INFERRED_SEQ) {
if (object.getString("fasta_header").contains(blacklistWord)) {
continue;
}
}
dnaSeq = object.getString("sequence");
}
proteinSeqs.add(dnaSeq);
ProteinInformation proteinInformation = new ProteinInformation(sequenceInfo.getOrgName(), sequenceInfo.getEc(),
sequenceInfo.getSequence(), reaction.getReactionName());
if (!proteinSeqToOrgInfo.containsKey(dnaSeq)) {
proteinSeqToOrgInfo.put(dnaSeq, new HashSet<>());
}
proteinSeqToOrgInfo.get(dnaSeq).add(proteinInformation);
}
}
}
}
}
if (proteinSeqs.size() == 0) {
LOGGER.info("The reaction does not have any viable protein sequences");
atleastOneSeqMissingInPathway = true;
break;
}
// Now we select two representative protein seqs from the reaction. In order to do this deterministically,
// we sort and pick the first and middle index protein seqs.
List<String> proteinSeqArray = new ArrayList<>(proteinSeqs);
Collections.sort(proteinSeqArray);
int firstIndex = 0;
int middleIndex = proteinSeqs.size() / 2;
// get first seq
Set<String> combination = new HashSet<>();
combination.add(proteinSeqArray.get(firstIndex));
// get middle index of the protein seq array
if (proteinSeqs.size() > 1) {
combination.add(proteinSeqArray.get(middleIndex));
}
proteinPaths.add(combination);
}
if (atleastOneSeqMissingInPathway) {
LOGGER.info(String.format("There is at least one reaction with no sequence in reaction path id: %s", reactionPath.getId()));
} else {
LOGGER.info(String.format("All reactions in reaction path have at least one viable seq: %s", reactionPath.getId()));
// We only compute the dna design if we can find at least one sequence for each reaction in the pathway.
Set<List<String>> pathwayProteinCombinations = makePermutations(proteinPaths);
Set<DNAOrgECNum> dnaDesigns = new HashSet<>();
for (List<String> proteinsInPathway : pathwayProteinCombinations) {
try {
Construct dna = p2d.computeDNA(proteinsInPathway, Host.Ecoli);
List<Set<ProteinInformation>> seqMetadata = new ArrayList<>();
for (String protein : proteinsInPathway) {
seqMetadata.add(proteinSeqToOrgInfo.get(protein));
}
DNAOrgECNum instance = new DNAOrgECNum(dna.toSeq(), seqMetadata, proteinsInPathway.size());
dnaDesigns.add(instance);
} catch (Exception ex) {
LOGGER.error("The error thrown while trying to call computeDNA", ex.getMessage());
}
}
DNADesign dnaDesignSeq = new DNADesign(dnaDesigns);
try {
WriteResult<DNADesign, String> result = dnaDesignCollection.insert(dnaDesignSeq);
String id = result.getSavedId();
reactionPath.setDnaDesignRef(id);
} catch (MongoInternalException e) {
// This condition happens whent the protein designs are too big and cannot be inserted in to the JSON object.
// TODO: Handle this case without dropping the record
LOGGER.error(String.format("Mongo internal exception caught while inserting dna design: %s", e.getMessage()));
}
}
outputPathwayCollection.insert(reactionPath);
}
}
private static Long lookupMolecule(MongoDB db, String someKey) {
if (REGEX_ID.matcher(someKey).find()) {
// Note: this doesn't verify that the chemical id is valid. Maybe we should do that?
return Long.valueOf(someKey);
} else if (REGEX_INCHI.matcher(someKey).find()) {
Chemical c = db.getChemicalFromInChI(someKey);
if (c != null) {
return c.getUuid();
}
} else if (REGEX_INCHI_KEY.matcher(someKey).find()) {
Chemical c = db.getChemicalFromInChIKey(someKey);
if (c != null) {
return c.getUuid();
}
} else {
String msg = String.format("Unable to find key type for query '%s'", someKey);
LOGGER.error(msg);
throw new IllegalArgumentException(msg);
}
String msg = String.format("Unable to find matching chemical for query %s", someKey);
LOGGER.error(msg);
throw new IllegalArgumentException(msg);
}
}