Skip to content

Commit d36486c

Browse files
authored
Support Exporting Codon Analysis to MrBayes Block Files (#266)
* Codon Model * Fix Compiler Error due to Merge Conflicts * Fix Codon Model `NucModel` Parameter * Fix Empirical Warning + Indentation of Warnings * Improve Start-Of-File Warnings * Fix Indentation in alignment.cpp
1 parent 02df654 commit d36486c

17 files changed

+193
-60
lines changed

alignment/alignment.cpp

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include <Eigen/LU>
2525
#ifdef USE_BOOST
26+
#include <boost/bimap.hpp>
2627
#include <boost/math/distributions/binomial.hpp>
2728
#endif
2829

@@ -62,6 +63,8 @@ char genetic_code23[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSS
6263
char genetic_code24[] = "KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Pterobranchia mitochondrial
6364
char genetic_code25[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF"; // Candidate Division SR1 and Gracilibacteria
6465

66+
boost::bimap<int, char*> genetic_code_map;
67+
6568
Alignment::Alignment()
6669
: vector<Pattern>()
6770
{
@@ -1603,7 +1606,32 @@ void Alignment::convertStateStr(string &str, SeqType seq_type) {
16031606
(*it) = convertState(*it, seq_type);
16041607
}
16051608
*/
1606-
1609+
1610+
boost::bimap<int, char*> getGeneticCodeMap() {
1611+
if (!genetic_code_map.empty()) return genetic_code_map;
1612+
1613+
genetic_code_map.insert({1, genetic_code1});
1614+
genetic_code_map.insert({2, genetic_code2});
1615+
genetic_code_map.insert({3, genetic_code3});
1616+
genetic_code_map.insert({4, genetic_code4});
1617+
genetic_code_map.insert({5, genetic_code5});
1618+
genetic_code_map.insert({6, genetic_code6});
1619+
genetic_code_map.insert({9, genetic_code9});
1620+
genetic_code_map.insert({10, genetic_code10});
1621+
genetic_code_map.insert({11, genetic_code11});
1622+
genetic_code_map.insert({12, genetic_code12});
1623+
genetic_code_map.insert({13, genetic_code13});
1624+
genetic_code_map.insert({14, genetic_code14});
1625+
genetic_code_map.insert({16, genetic_code16});
1626+
genetic_code_map.insert({21, genetic_code21});
1627+
genetic_code_map.insert({22, genetic_code22});
1628+
genetic_code_map.insert({23, genetic_code23});
1629+
genetic_code_map.insert({24, genetic_code24});
1630+
genetic_code_map.insert({25, genetic_code25});
1631+
1632+
return genetic_code_map;
1633+
}
1634+
16071635
void Alignment::initCodon(char *gene_code_id) {
16081636
// build index from 64 codons to non-stop codons
16091637
int transl_table = 1;
@@ -1613,30 +1641,13 @@ void Alignment::initCodon(char *gene_code_id) {
16131641
} catch (string &str) {
16141642
outError("Wrong genetic code ", gene_code_id);
16151643
}
1616-
switch (transl_table) {
1617-
case 1: genetic_code = genetic_code1; break;
1618-
case 2: genetic_code = genetic_code2; break;
1619-
case 3: genetic_code = genetic_code3; break;
1620-
case 4: genetic_code = genetic_code4; break;
1621-
case 5: genetic_code = genetic_code5; break;
1622-
case 6: genetic_code = genetic_code6; break;
1623-
case 9: genetic_code = genetic_code9; break;
1624-
case 10: genetic_code = genetic_code10; break;
1625-
case 11: genetic_code = genetic_code11; break;
1626-
case 12: genetic_code = genetic_code12; break;
1627-
case 13: genetic_code = genetic_code13; break;
1628-
case 14: genetic_code = genetic_code14; break;
1629-
case 15: genetic_code = genetic_code15; break;
1630-
case 16: genetic_code = genetic_code16; break;
1631-
case 21: genetic_code = genetic_code21; break;
1632-
case 22: genetic_code = genetic_code22; break;
1633-
case 23: genetic_code = genetic_code23; break;
1634-
case 24: genetic_code = genetic_code24; break;
1635-
case 25: genetic_code = genetic_code25; break;
1636-
default:
1644+
auto codeMap = getGeneticCodeMap();
1645+
auto found = codeMap.left.find(transl_table);
1646+
if (found == codeMap.left.end()) {
16371647
outError("Wrong genetic code ", gene_code_id);
1638-
break;
1648+
return;
16391649
}
1650+
genetic_code = found->second;
16401651
} else {
16411652
genetic_code = genetic_code1;
16421653
}
@@ -1669,6 +1680,15 @@ void Alignment::initCodon(char *gene_code_id) {
16691680
// cout << "num_states = " << num_states << endl;
16701681
}
16711682

1683+
int Alignment::getGeneticCodeId() {
1684+
if (seq_type != SEQ_CODON || genetic_code == nullptr) return 0;
1685+
1686+
auto code_map = getGeneticCodeMap();
1687+
auto found = code_map.right.find(genetic_code);
1688+
if (found == code_map.right.end()) return 0;
1689+
return found->second;
1690+
}
1691+
16721692
int getMorphStates(StrVector &sequences) {
16731693
char maxstate = 0;
16741694
for (StrVector::iterator it = sequences.begin(); it != sequences.end(); it++)

alignment/alignment.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,12 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
10101010
*/
10111011
void extractMapleFile(const std::string& aln_name, const InputType& format);
10121012

1013+
/**
1014+
* Get the numerical id of the genetic code
1015+
* @return id the genetic code id, or 0 if not a codon type
1016+
*/
1017+
int getGeneticCodeId();
1018+
10131019
protected:
10141020

10151021

main/phyloanalysis.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2608,11 +2608,21 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam
26082608
out.exceptions(ios::failbit | ios::badbit);
26092609
out.open(filename);
26102610

2611+
string provide = "basic models";
2612+
if (inclParams) provide = "optimized values";
2613+
else if (iqtree->isSuperTree()) provide = "basic partition structure and models";
2614+
26112615
// Write Warning
26122616
out << "#nexus" << endl << endl
2613-
<<"[This MrBayes Block Declaration provides the retrieved values from the IQTree Run.]" << endl
2614-
<< "[Note that MrBayes does not support a large collection of models, so defaults of 'nst=6' for DNA and 'wag' for Protein will be used if a model that does not exist in MrBayes is selected.]" << endl
2615-
<< "[Furthermore, the Model Parameter '+R' will be replaced by '+G'.]" << endl
2617+
<<"[This MrBayes Block Declaration provides the " << provide << " from the IQTree Run.]" << endl
2618+
<< "[Note that MrBayes does not support a large collection of models, so defaults of 'nst=6' for DNA and 'wag' for Protein will be used if a model that does not exist in MrBayes is used.]" << endl;
2619+
2620+
if (inclParams)
2621+
out << "[However, for those cases, there will still be a rate matrix provided.]" << endl
2622+
<< "[For DNA, this will still mean the rates may vary outside the restrictions of the model.]" << endl
2623+
<< "[For Protein, this is essentially a perfect replacement.]" << endl;
2624+
2625+
out << "[Furthermore, the Model Parameter '+R' will be replaced by '+G'.]" << endl
26162626
<< "[This should be used as a Template Only.]" << endl << endl;
26172627

26182628
// Begin File, Print Charsets
@@ -2626,7 +2636,7 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam
26262636
if (!iqtree->rooted) out << " outgroup " << iqtree->root->name << ";" << endl << endl;
26272637

26282638
out << " [Using Model '" << iqtree->getModelName() << "']" << endl;
2629-
iqtree->getModel()->printMrBayesModelText(iqtree->getRate(), out, "all", "", false, inclParams);
2639+
iqtree->getModel()->printMrBayesModelText(out, "all", "", false, inclParams);
26302640

26312641
out << endl << "end;" << endl;
26322642
out.close();
@@ -2670,7 +2680,7 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam
26702680

26712681
// MrBayes Partitions are 1-indexed
26722682
out << " [Partition No. " << convertIntToString(part + 1) << ", Using Model '" << currentTree->getModelName() << "']" << endl;
2673-
currentTree->getModel()->printMrBayesModelText(currentTree->getRate(), out,
2683+
currentTree->getModel()->printMrBayesModelText(out,
26742684
convertIntToString(part + 1), saln->partitions[part]->name, true, inclParams);
26752685
out << endl;
26762686
}

model/modelbin.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ string ModelBIN::getNameParams(bool show_fixed_params) {
7777
return retname.str();
7878
}
7979

80-
void ModelBIN::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
80+
void ModelBIN::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
81+
RateHeterogeneity* rate = phylo_tree->getRate();
82+
8183
// MrBayes does not support Invariable Modifier for Binary data
8284
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
8385
warnLogStream("MrBayes does not support Invariable Sites with Binary Data! +I has been ignored!", out);

model/modelbin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ class ModelBIN : public ModelMarkov
6565
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
6666
* @param inclParams whether to include IQTree optimized parameters for the model
6767
*/
68-
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
68+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
6969

7070
};
7171

model/modelcodon.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,72 @@ double ModelCodon::optimizeParameters(double gradient_epsilon) {
11421142
return score;
11431143
}
11441144

1145+
void ModelCodon::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
1146+
// NST should be 1 if fix_kappa is true (no ts/tv ratio), else it should be 2
1147+
int nst = fix_kappa ? 1 : 2;
1148+
int code = phylo_tree->aln->getGeneticCodeId();
1149+
string mrBayesCode = getMrBayesGeneticCode(code);
1150+
RateHeterogeneity* rate = phylo_tree->getRate();
1151+
1152+
if (rate->isFreeRate() || rate->getGammaShape() > 0.0 || rate->getPInvar() > 0.0) {
1153+
warnLogStream("MrBayes Codon Models do not support any Heterogenity Rate Modifiers! (+I, +G, +R) They have been ignored!", out);
1154+
}
1155+
1156+
if (mrBayesCode.empty()) {
1157+
warnLogStream("MrBayes Does Not Support Codon Code " + convertIntToString(code) + "! Defaulting to Universal (Code 1).", out);
1158+
mrBayesCode = "universal";
1159+
}
1160+
1161+
if (num_params == 0 || name.find('_') != string::npos) { // If this model is a Empirical / Semi-Empirical Model
1162+
warnLogStream("MrBayes does not support Empirical or Semi-Empirical Codon Models. State Frequency will still be set, but no rate matrix will be set.", out);
1163+
}
1164+
1165+
out << " lset applyto=(" << partition << ") nucmodel=codon omegavar=equal nst=" << nst << " code=" << mrBayesCode << ";" << endl;
1166+
1167+
if (!inclParams) return;
1168+
1169+
out << " prset applyto=(" << partition << ") statefreqpr=dirichlet(";
1170+
1171+
// State Frequency Prior
1172+
bool isFirst = true;
1173+
for (int i = 0; i < num_states; i++) {
1174+
if (phylo_tree->aln->isStopCodon(i)) continue;
1175+
1176+
if (!isFirst) out << ", ";
1177+
else isFirst = false;
1178+
1179+
out << state_freq[i];
1180+
}
1181+
1182+
out << ")";
1183+
1184+
/*
1185+
* TS/TV Ratio (Kappa)
1186+
* Ratio Should be:
1187+
* `beta(kappa, 1)` when `codon_kappa_style` is `CK_ONE_KAPPA` (kappa here represents the ts/tv rate ratio)
1188+
* `beta(kappa, 1)` when `codon_kappa_style` is `CK_ONE_KAPPA_TS` (kappa here represents the transition rate)
1189+
* `beta(1, kappa)` when `codon_kappa_style` is `CK_ONE_KAPPA_TV` (kappa here represents the transversion rate)
1190+
* `beta(kappa, kappa2)` when `codon_kappa_style` is `CK_TWO_KAPPA` (kappa here represents the transition rate, and kappa2 represents the transversion rate)
1191+
*/
1192+
if (!fix_kappa) {
1193+
double v1 = codon_kappa_style == CK_ONE_KAPPA_TV ? 1 : kappa;
1194+
double v2 = 1;
1195+
if (codon_kappa_style == CK_ONE_KAPPA_TV)
1196+
v2 = kappa;
1197+
else if (codon_kappa_style == CK_TWO_KAPPA)
1198+
v2 = kappa2;
1199+
1200+
out << " tratiopr=beta(" << v1 << ", " << v2 << ")";
1201+
}
1202+
1203+
// DN/DS Ratio (Omega)
1204+
// Ratio is set to `dirichlet(omega, 1)`
1205+
if (!fix_omega) {
1206+
out << " omegapr=dirichlet(" << omega << ", 1)";
1207+
}
1208+
1209+
out << ";" << endl;
1210+
}
11451211

11461212
void ModelCodon::writeInfo(ostream &out) {
11471213
if (name.find('_') == string::npos)

model/modelcodon.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,17 @@ class ModelCodon: public ModelMarkov {
210210
*/
211211
virtual bool getVariables(double *variables);
212212

213+
/**
214+
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
215+
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
216+
* @param out the ofstream to print the result to
217+
* @param partition the partition to apply lset and prset to
218+
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
219+
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
220+
* @param inclParams whether to include IQTree optimized parameters for the model
221+
*/
222+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
223+
213224
};
214225

215226
#endif /* MODELCODON_H_ */

model/modeldna.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -565,9 +565,10 @@ void ModelDNA::setVariables(double *variables) {
565565
// }
566566
}
567567

568-
void ModelDNA::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
568+
void ModelDNA::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
569569
bool equalFreq = freq_type == FREQ_EQUAL;
570570
short nst = 6;
571+
RateHeterogeneity* rate = phylo_tree->getRate();
571572

572573
// Find NST value (1 for JC/JC69/F81, 2 for K80/K2P/HKY/HKY85, 6 for SYM/GTR)
573574
// NST is set to 6 by default (above), so check for JC/JC69/F81 and K80/K2P/HKY/HKY85

model/modeldna.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,13 @@ class ModelDNA : public ModelMarkov
117117
/**
118118
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
119119
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
120-
* @param rate the rate information
121120
* @param out the ofstream to print the result to
122121
* @param partition the partition to apply lset and prset to
123122
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
124123
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
125124
* @param inclParams whether to include IQTree optimized parameters for the model
126125
*/
127-
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
126+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
128127

129128
protected:
130129

model/modelmorphology.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,13 @@ void ModelMorphology::writeInfo(ostream &out) {
151151
}
152152
}
153153

154-
void ModelMorphology::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
154+
void ModelMorphology::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
155155
warnLogStream("MrBayes only supports Morphological Data with states from {0-9}!", out);
156156
warnLogStream("Morphological Data with states {A-Z} may cause errors!", out);
157157
warnLogStream("Use Morphological Models in MrBayes with Caution!", out);
158158

159+
RateHeterogeneity* rate = phylo_tree->getRate();
160+
159161
// MrBayes does not support Invariable Modifier for Morph data
160162
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
161163
warnLogStream("MrBayes does not support Invariable Sites with Morphological Data! +I has been ignored!", out);

model/modelmorphology.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,13 @@ class ModelMorphology: public ModelMarkov {
8282
/**
8383
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
8484
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
85-
* @param rate the rate information
8685
* @param out the ofstream to print the result to
8786
* @param partition the partition to apply lset and prset to
8887
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
8988
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
9089
* @param inclParams whether to include IQTree optimized parameters for the model
9190
*/
92-
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
91+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
9392
};
9493

9594
#endif /* MODELMORPHOLOGY_H_ */

model/modelprotein.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1162,10 +1162,12 @@ void ModelProtein::computeTipLikelihood(PML::StateType state, double *state_lk)
11621162
state_lk[ambi_aa[cstate*2+1]] = 1.0;
11631163
}
11641164

1165-
void ModelProtein::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
1165+
void ModelProtein::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
11661166
// Lset Parameters
11671167
out << " lset applyto=(" << partition << ") nucmodel=protein rates=";
11681168

1169+
RateHeterogeneity* rate = phylo_tree->getRate();
1170+
11691171
// RHAS Specification
11701172
// Free Rate should be substituted by +G+I
11711173
bool hasGamma = rate->getGammaShape() != 0.0 || rate->isFreeRate();

model/modelprotein.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,13 @@ class ModelProtein : public ModelMarkov
8282
/**
8383
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
8484
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
85-
* @param rate the rate information
8685
* @param out the ofstream to print the result to
8786
* @param partition the partition to apply lset and prset to
8887
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
8988
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
9089
* @param inclParams whether to include IQTree optimized parameters for the model
9190
*/
92-
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
91+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
9392

9493
private:
9594
ModelsBlock *models_block;

model/modelsubst.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ void ModelSubst::printMrBayesFreeRateReplacement(bool isSuperTree, string &chars
246246
out << " pinvarpr=fixed(" << minValueCheckMrBayes(p_invar) << ")";
247247
}
248248

249-
void ModelSubst::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
249+
void ModelSubst::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
250250
out << " [MrBayes Block Output is not supported by this model!]" << endl;
251251
outWarning("MrBayes Block Output is not supported by this model of name '" + name + "'!");
252252
}

model/modelsubst.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -405,14 +405,13 @@ class ModelSubst: public Optimization, public CheckpointFactory
405405
/**
406406
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
407407
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
408-
* @param rate the rate information
409408
* @param out the ofstream to print the result to
410409
* @param partition the partition to apply lset and prset to
411410
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
412411
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
413412
* @param inclParams whether to include IQTree optimized parameters for the model
414413
*/
415-
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
414+
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
416415

417416
/*****************************************************
418417
Checkpointing facility

0 commit comments

Comments
 (0)