Skip to content

Commit 02df654

Browse files
authored
Support Exporting Protein, Binary and Morphological Data to MrBayes (#264)
* Protein Model * Morphological Models Support * Cleanup Move Model Specific Functions to each model class Move other functions from phylotree and phylosupertree to phyloanalysis * Cleanup Imports * Binary Model Support * Misc Cleanup Misc Cleanup * Output Files Readability, Default Warning & Help Message * Fix Edge Case: Importing Values < 0.01 into MrBayes * Fix Edge Case: Extra Characters in Charset * Fix +G+I or +R Inputs * Fix Issues with Binary Model * Fix Issues with Morphology Model
1 parent 777c238 commit 02df654

17 files changed

+546
-321
lines changed

main/phyloanalysis.cpp

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2602,6 +2602,82 @@ void printTrees(vector<string> trees, Params &params, string suffix) {
26022602
treesOut.close();
26032603
}
26042604

2605+
void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParams) {
2606+
ofstream out;
2607+
try {
2608+
out.exceptions(ios::failbit | ios::badbit);
2609+
out.open(filename);
2610+
2611+
// Write Warning
2612+
out << "#nexus" << endl << endl
2613+
<<"[This MrBayes Block Declaration provides the retrieved values from the IQTree Run.]" << endl
2614+
<< "[Note that MrBayes does not support a large collection of models, so defaults of 'nst=6' for DNA and 'wag' for Protein will be used if a model that does not exist in MrBayes is selected.]" << endl
2615+
<< "[Furthermore, the Model Parameter '+R' will be replaced by '+G'.]" << endl
2616+
<< "[This should be used as a Template Only.]" << endl << endl;
2617+
2618+
// Begin File, Print Charsets
2619+
out << "begin mrbayes;" << endl;
2620+
} catch (ios::failure &) {
2621+
outError(ERR_WRITE_OUTPUT, filename);
2622+
}
2623+
2624+
if (!iqtree->isSuperTree()) {
2625+
// Set Outgroup (if available)
2626+
if (!iqtree->rooted) out << " outgroup " << iqtree->root->name << ";" << endl << endl;
2627+
2628+
out << " [Using Model '" << iqtree->getModelName() << "']" << endl;
2629+
iqtree->getModel()->printMrBayesModelText(iqtree->getRate(), out, "all", "", false, inclParams);
2630+
2631+
out << endl << "end;" << endl;
2632+
out.close();
2633+
return;
2634+
}
2635+
2636+
auto superTree = (PhyloSuperTree*) iqtree;
2637+
auto saln = (SuperAlignment*) superTree->aln;
2638+
auto size = superTree->size();
2639+
2640+
for (int part = 0; part < size; part++) {
2641+
string name = saln->partitions[part]->name;
2642+
replace(name.begin(), name.end(), '+', '_');
2643+
out << " charset " << name << " = ";
2644+
2645+
string pos = saln->partitions[part]->position_spec;
2646+
replace(pos.begin(), pos.end(), ',' , ' ');
2647+
out << pos << ";" << endl;
2648+
}
2649+
2650+
// Create Partition
2651+
out << endl << " partition iqtree = " << size << ": ";
2652+
for (int part = 0; part < size; part++) {
2653+
if (part != 0) out << ", ";
2654+
2655+
string name = saln->partitions[part]->name;
2656+
replace(name.begin(), name.end(), '+', '_');
2657+
out << name;
2658+
}
2659+
out << ";" << endl;
2660+
2661+
// Set Partition for Use
2662+
out << " set partition = iqtree;" << endl;
2663+
2664+
// Set Outgroup (if available)
2665+
if (!superTree->rooted) out << " outgroup " << superTree->root->name << ";" << endl << endl;
2666+
2667+
// Partition-Specific Model Information
2668+
for (int part = 0; part < size; part++) {
2669+
PhyloTree* currentTree = superTree->at(part);
2670+
2671+
// MrBayes Partitions are 1-indexed
2672+
out << " [Partition No. " << convertIntToString(part + 1) << ", Using Model '" << currentTree->getModelName() << "']" << endl;
2673+
currentTree->getModel()->printMrBayesModelText(currentTree->getRate(), out,
2674+
convertIntToString(part + 1), saln->partitions[part]->name, true, inclParams);
2675+
out << endl;
2676+
}
2677+
out << "end;" << endl;
2678+
out.close();
2679+
}
2680+
26052681
/************************************************************
26062682
* MAIN TREE RECONSTRUCTION
26072683
***********************************************************/
@@ -3111,8 +3187,10 @@ void runTreeReconstruction(Params &params, IQTree* &iqtree) {
31113187
superTree->printBestPartitionParams((string(params.out_prefix) + ".best_model.nex").c_str());
31123188
}
31133189
if (params.mr_bayes_output) {
3114-
iqtree->printMrBayesBlock((string(params.out_prefix) + ".mr_bayes_scheme.nex").c_str(), false);
3115-
iqtree->printMrBayesBlock((string(params.out_prefix) + ".mr_bayes_model.nex").c_str(), true);
3190+
cout << endl << "Writing MrBayes Block Files..." << endl;
3191+
printMrBayesBlockFile((string(params.out_prefix) + ".mr_bayes_scheme.nex").c_str(), iqtree, false);
3192+
printMrBayesBlockFile((string(params.out_prefix) + ".mr_bayes_model.nex").c_str(), iqtree, true);
3193+
cout << endl;
31163194
}
31173195

31183196
cout << "BEST SCORE FOUND : " << iqtree->getCurScore() << endl;

model/modelbin.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,56 @@ string ModelBIN::getNameParams(bool show_fixed_params) {
7676
}
7777
return retname.str();
7878
}
79+
80+
void ModelBIN::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
81+
// MrBayes does not support Invariable Modifier for Binary data
82+
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
83+
warnLogStream("MrBayes does not support Invariable Sites with Binary Data! +I has been ignored!", out);
84+
}
85+
86+
// Lset Parameters
87+
out << " lset applyto=(" << partition << ") rates=";
88+
89+
// Free Rate should be substituted by +G (+I not supported)
90+
bool hasGamma = rate->getGammaShape() != 0.0 || rate->isFreeRate();
91+
if (hasGamma) {
92+
// Rate Categories + Gamma
93+
out << "gamma ngammacat=" << rate->getNRate();
94+
} else
95+
out << "equal";
96+
97+
out << ";" << endl;
98+
99+
if (!inclParams) {
100+
if (freq_type == FREQ_EQUAL) out << " prset applyto=(" << partition << ") statefreqpr=fixed(equal);" << endl;
101+
return;
102+
}
103+
104+
// Prset Parameters
105+
out << " prset applyto=(" << partition << ")";
106+
107+
// Freerate (+R)
108+
// Get replacement Gamma Shape
109+
if (rate->isFreeRate()) {
110+
printMrBayesFreeRateReplacement(rate, charset, out, false);
111+
}
112+
113+
// Gamma Distribution (+G/+R)
114+
// Dirichlet is not available here, use fixed
115+
if (rate->getGammaShape() > 0.0)
116+
out << " shapepr=fixed(" << minValueCheckMrBayes(rate->getGammaShape()) << ")";
117+
118+
// State Frequencies
119+
if (freq_type == FREQ_EQUAL)
120+
out << " statefreqpr=fixed(equal)";
121+
else {
122+
out << " statefreqpr=dirichlet(";
123+
for (int i = 0; i < num_states; ++i) {
124+
if (i != 0) out << ", ";
125+
out << minValueCheckMrBayes(state_freq[i]);
126+
}
127+
out << ")";
128+
}
129+
130+
out << ";" << endl;
131+
}

model/modelbin.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,18 @@ class ModelBIN : public ModelMarkov
5555
*/
5656
virtual void startCheckpoint();
5757

58+
/**
59+
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
60+
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
61+
* @param rate the rate information
62+
* @param out the ofstream to print the result to
63+
* @param partition the partition to apply lset and prset to
64+
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
65+
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
66+
* @param inclParams whether to include IQTree optimized parameters for the model
67+
*/
68+
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
69+
5870
};
5971

6072
#endif

model/modeldna.cpp

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,100 @@ void ModelDNA::setVariables(double *variables) {
565565
// }
566566
}
567567

568-
string ModelDNA::getModelDNACode() {
569-
return param_spec;
568+
void ModelDNA::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
569+
bool equalFreq = freq_type == FREQ_EQUAL;
570+
short nst = 6;
571+
572+
// Find NST value (1 for JC/JC69/F81, 2 for K80/K2P/HKY/HKY85, 6 for SYM/GTR)
573+
// NST is set to 6 by default (above), so check for JC/JC69/F81 and K80/K2P/HKY/HKY85
574+
// If it returns a valid dna code, we can extract information from there (needed for user-defined models)
575+
if (!param_spec.empty()) {
576+
// if all equal
577+
if (param_spec.find_first_not_of(param_spec[0]) == string::npos) {
578+
nst = 1;
579+
// if A-G=C-T and everything else is equal to first part of dna code
580+
} else if (param_spec[1] == param_spec[4] &&
581+
param_spec[0] == param_spec[2] && param_spec[0] == param_spec[3] && param_spec[0] == param_spec[5]) {
582+
nst = 2;
583+
}
584+
} else if (!name.empty()) {
585+
// Check the name of the model
586+
if (strcmp(name.c_str(), "JC") == 0 || strcmp(name.c_str(), "JC69") == 0 || strcmp(name.c_str(), "F81") == 0) {
587+
nst = 1;
588+
} else if (strcmp(name.c_str(), "K80") == 0 || strcmp(name.c_str(), "K2P") == 0 || strcmp(name.c_str(), "HKY") == 0 || strcmp(name.c_str(), "HKY85") == 0) {
589+
nst = 2;
590+
}
591+
}
592+
593+
// NucModel = 4By4: DNA Nucleotides (4 Options, A, C, G, T)
594+
out << " lset applyto=(" << partition << ") nucmodel=4by4 nst=" << nst << " rates=";
595+
596+
// RHAS Specification
597+
// Free Rate should be substituted by +G+I
598+
bool hasGamma = rate->getGammaShape() != 0.0 || rate->isFreeRate();
599+
bool hasInvariable = rate->getPInvar() != 0.0 || rate->isFreeRate();
600+
if (hasGamma) {
601+
if (hasInvariable)
602+
out << "invgamma";
603+
else
604+
out << "gamma";
605+
} else if (hasInvariable)
606+
out << "propinv";
607+
else
608+
out << "equal";
609+
610+
// Rate Categories
611+
if (hasGamma)
612+
out << " ngammacat=" << rate->getNRate();
613+
614+
out << ";" << endl;
615+
616+
// Priors (apart from Fixed Freq)
617+
if (!inclParams) {
618+
// If not include other params, simply set fixed frequency and return
619+
if (!equalFreq) return;
620+
621+
out << " prset applyto=(" << partition << ") statefreqpr=fixed(equal);" << endl;
622+
return;
623+
}
624+
625+
out << " prset applyto=(" << partition << ")";
626+
627+
// Freerate (+R)
628+
// Get replacement Gamma Shape + Invariable Sites
629+
if (rate->isFreeRate()) {
630+
printMrBayesFreeRateReplacement(isSuperTree, charset, out);
631+
}
632+
633+
// Gamma Distribution (+G/+R)
634+
// Dirichlet is not available here, use fixed
635+
if (rate->getGammaShape() > 0.0)
636+
out << " shapepr=fixed(" << minValueCheckMrBayes(rate->getGammaShape()) << ")";
637+
638+
// Invariable Sites (+I)
639+
// Dirichlet is not available here, use fixed
640+
if (rate->getPInvar() > 0.0)
641+
out << " pinvarpr=fixed(" << minValueCheckMrBayes(rate->getPInvar()) << ")";
642+
643+
// Frequency of Nucleotides (+F)
644+
if (equalFreq)
645+
out << " statefreqpr=fixed(equal)";
646+
else {
647+
out << " statefreqpr=dirichlet(";
648+
for (int i = 0; i < num_states; ++i) {
649+
if (i != 0) out << ", ";
650+
out << minValueCheckMrBayes(state_freq[i]);
651+
}
652+
out << ")";
653+
}
654+
655+
// Reversible Rate Matrix
656+
out << " revmatpr=dirichlet(";
657+
for (int i = 0; i < getNumRateEntries(); ++i) {
658+
if (i != 0) out << ", ";
659+
out << minValueCheckMrBayes(rates[i]);
660+
}
661+
662+
// Close revmatpr + prset
663+
out << ");" << endl;
570664
}

model/modeldna.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,16 @@ class ModelDNA : public ModelMarkov
115115
virtual void computeTipLikelihood(PML::StateType state, double *state_lk);
116116

117117
/**
118-
* Get the Model DNA 'code', in form 'abcdef', used with ModelDNA model
119-
* Returns empty string by default (this is not a dna specific model)
118+
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
119+
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
120+
* @param rate the rate information
121+
* @param out the ofstream to print the result to
122+
* @param partition the partition to apply lset and prset to
123+
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
124+
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
125+
* @param inclParams whether to include IQTree optimized parameters for the model
120126
*/
121-
virtual string getModelDNACode();
127+
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
122128

123129
protected:
124130

model/modelmorphology.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,54 @@ void ModelMorphology::writeInfo(ostream &out) {
150150
out << endl;
151151
}
152152
}
153+
154+
void ModelMorphology::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
155+
warnLogStream("MrBayes only supports Morphological Data with states from {0-9}!", out);
156+
warnLogStream("Morphological Data with states {A-Z} may cause errors!", out);
157+
warnLogStream("Use Morphological Models in MrBayes with Caution!", out);
158+
159+
// MrBayes does not support Invariable Modifier for Morph data
160+
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
161+
warnLogStream("MrBayes does not support Invariable Sites with Morphological Data! +I has been ignored!", out);
162+
}
163+
// MrBayes does not support State Frequency for Morph data
164+
if (freq_type != FREQ_EQUAL) {
165+
warnLogStream("MrBayes does not support non-equal frequencies for Morphological Data! Frequencies are left as the default! (All equal)", out);
166+
}
167+
168+
// Lset Parameters
169+
out << " lset applyto=(" << partition << ") rates=";
170+
171+
// Free Rate should be substituted by +G (+I not supported)
172+
bool hasGamma = rate->getGammaShape() != 0.0 || rate->isFreeRate();
173+
if (hasGamma) {
174+
// Rate Categories + Gamma
175+
out << "gamma ngammacat=" << rate->getNRate();
176+
} else
177+
out << "equal";
178+
179+
out << ";" << endl;
180+
181+
// ctype (ordered or not)
182+
if (strcmp(name.c_str(), "ORDERED") == 0)
183+
out << " ctype ordered: " << charset << ";" << endl;
184+
185+
// Since morphological doesn't have state frequencies, if there are no parameters to prset, return
186+
if (!inclParams || !rate->isFreeRate() && rate->getGammaShape() <= 0.0) return;
187+
188+
// Prset Parameters
189+
out << " prset applyto=(" << partition << ")";
190+
191+
// Freerate (+R)
192+
// Get replacement Gamma Shape
193+
if (rate->isFreeRate()) {
194+
printMrBayesFreeRateReplacement(rate, charset, out, false);
195+
}
196+
197+
// Gamma Distribution (+G/+R)
198+
// Dirichlet is not available here, use fixed
199+
if (rate->getGammaShape() > 0.0)
200+
out << " shapepr=fixed(" << minValueCheckMrBayes(rate->getGammaShape()) << ")";
201+
202+
out << ";" << endl;
203+
}

model/modelmorphology.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ class ModelMorphology: public ModelMarkov {
7878
virtual void readRates(istream &in) noexcept(false);
7979

8080
virtual ~ModelMorphology();
81+
82+
/**
83+
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
84+
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
85+
* @param rate the rate information
86+
* @param out the ofstream to print the result to
87+
* @param partition the partition to apply lset and prset to
88+
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
89+
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
90+
* @param inclParams whether to include IQTree optimized parameters for the model
91+
*/
92+
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
8193
};
8294

8395
#endif /* MODELMORPHOLOGY_H_ */

0 commit comments

Comments
 (0)