Skip to content

Support Exporting Codon Analysis to MrBayes Block Files #266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 43 additions & 23 deletions alignment/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <Eigen/LU>
#ifdef USE_BOOST
#include <boost/bimap.hpp>
#include <boost/math/distributions/binomial.hpp>
#endif

Expand Down Expand Up @@ -62,6 +63,8 @@ char genetic_code23[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSS
char genetic_code24[] = "KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Pterobranchia mitochondrial
char genetic_code25[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF"; // Candidate Division SR1 and Gracilibacteria

boost::bimap<int, char*> genetic_code_map;

Alignment::Alignment()
: vector<Pattern>()
{
Expand Down Expand Up @@ -1603,7 +1606,32 @@ void Alignment::convertStateStr(string &str, SeqType seq_type) {
(*it) = convertState(*it, seq_type);
}
*/


boost::bimap<int, char*> getGeneticCodeMap() {
if (!genetic_code_map.empty()) return genetic_code_map;

genetic_code_map.insert({1, genetic_code1});
genetic_code_map.insert({2, genetic_code2});
genetic_code_map.insert({3, genetic_code3});
genetic_code_map.insert({4, genetic_code4});
genetic_code_map.insert({5, genetic_code5});
genetic_code_map.insert({6, genetic_code6});
genetic_code_map.insert({9, genetic_code9});
genetic_code_map.insert({10, genetic_code10});
genetic_code_map.insert({11, genetic_code11});
genetic_code_map.insert({12, genetic_code12});
genetic_code_map.insert({13, genetic_code13});
genetic_code_map.insert({14, genetic_code14});
genetic_code_map.insert({16, genetic_code16});
genetic_code_map.insert({21, genetic_code21});
genetic_code_map.insert({22, genetic_code22});
genetic_code_map.insert({23, genetic_code23});
genetic_code_map.insert({24, genetic_code24});
genetic_code_map.insert({25, genetic_code25});

return genetic_code_map;
}

void Alignment::initCodon(char *gene_code_id) {
// build index from 64 codons to non-stop codons
int transl_table = 1;
Expand All @@ -1613,30 +1641,13 @@ void Alignment::initCodon(char *gene_code_id) {
} catch (string &str) {
outError("Wrong genetic code ", gene_code_id);
}
switch (transl_table) {
case 1: genetic_code = genetic_code1; break;
case 2: genetic_code = genetic_code2; break;
case 3: genetic_code = genetic_code3; break;
case 4: genetic_code = genetic_code4; break;
case 5: genetic_code = genetic_code5; break;
case 6: genetic_code = genetic_code6; break;
case 9: genetic_code = genetic_code9; break;
case 10: genetic_code = genetic_code10; break;
case 11: genetic_code = genetic_code11; break;
case 12: genetic_code = genetic_code12; break;
case 13: genetic_code = genetic_code13; break;
case 14: genetic_code = genetic_code14; break;
case 15: genetic_code = genetic_code15; break;
case 16: genetic_code = genetic_code16; break;
case 21: genetic_code = genetic_code21; break;
case 22: genetic_code = genetic_code22; break;
case 23: genetic_code = genetic_code23; break;
case 24: genetic_code = genetic_code24; break;
case 25: genetic_code = genetic_code25; break;
default:
auto codeMap = getGeneticCodeMap();
auto found = codeMap.left.find(transl_table);
if (found == codeMap.left.end()) {
outError("Wrong genetic code ", gene_code_id);
break;
return;
}
genetic_code = found->second;
} else {
genetic_code = genetic_code1;
}
Expand Down Expand Up @@ -1669,6 +1680,15 @@ void Alignment::initCodon(char *gene_code_id) {
// cout << "num_states = " << num_states << endl;
}

int Alignment::getGeneticCodeId() {
if (seq_type != SEQ_CODON || genetic_code == nullptr) return 0;

auto code_map = getGeneticCodeMap();
auto found = code_map.right.find(genetic_code);
if (found == code_map.right.end()) return 0;
return found->second;
}

int getMorphStates(StrVector &sequences) {
char maxstate = 0;
for (StrVector::iterator it = sequences.begin(); it != sequences.end(); it++)
Expand Down
6 changes: 6 additions & 0 deletions alignment/alignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,12 @@ class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
*/
void extractMapleFile(const std::string& aln_name, const InputType& format);

/**
* Get the numerical id of the genetic code
* @return id the genetic code id, or 0 if not a codon type
*/
int getGeneticCodeId();

protected:


Expand Down
20 changes: 15 additions & 5 deletions main/phyloanalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2608,11 +2608,21 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam
out.exceptions(ios::failbit | ios::badbit);
out.open(filename);

string provide = "basic models";
if (inclParams) provide = "optimized values";
else if (iqtree->isSuperTree()) provide = "basic partition structure and models";

// Write Warning
out << "#nexus" << endl << endl
<<"[This MrBayes Block Declaration provides the retrieved values from the IQTree Run.]" << endl
<< "[Note that MrBayes does not support a large collection of models, so defaults of 'nst=6' for DNA and 'wag' for Protein will be used if a model that does not exist in MrBayes is selected.]" << endl
<< "[Furthermore, the Model Parameter '+R' will be replaced by '+G'.]" << endl
<<"[This MrBayes Block Declaration provides the " << provide << " from the IQTree Run.]" << endl
<< "[Note that MrBayes does not support a large collection of models, so defaults of 'nst=6' for DNA and 'wag' for Protein will be used if a model that does not exist in MrBayes is used.]" << endl;

if (inclParams)
out << "[However, for those cases, there will still be a rate matrix provided.]" << endl
<< "[For DNA, this will still mean the rates may vary outside the restrictions of the model.]" << endl
<< "[For Protein, this is essentially a perfect replacement.]" << endl;

out << "[Furthermore, the Model Parameter '+R' will be replaced by '+G'.]" << endl
<< "[This should be used as a Template Only.]" << endl << endl;

// Begin File, Print Charsets
Expand All @@ -2626,7 +2636,7 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam
if (!iqtree->rooted) out << " outgroup " << iqtree->root->name << ";" << endl << endl;

out << " [Using Model '" << iqtree->getModelName() << "']" << endl;
iqtree->getModel()->printMrBayesModelText(iqtree->getRate(), out, "all", "", false, inclParams);
iqtree->getModel()->printMrBayesModelText(out, "all", "", false, inclParams);

out << endl << "end;" << endl;
out.close();
Expand Down Expand Up @@ -2670,7 +2680,7 @@ void printMrBayesBlockFile(const char* filename, IQTree* &iqtree, bool inclParam

// MrBayes Partitions are 1-indexed
out << " [Partition No. " << convertIntToString(part + 1) << ", Using Model '" << currentTree->getModelName() << "']" << endl;
currentTree->getModel()->printMrBayesModelText(currentTree->getRate(), out,
currentTree->getModel()->printMrBayesModelText(out,
convertIntToString(part + 1), saln->partitions[part]->name, true, inclParams);
out << endl;
}
Expand Down
4 changes: 3 additions & 1 deletion model/modelbin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ string ModelBIN::getNameParams(bool show_fixed_params) {
return retname.str();
}

void ModelBIN::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
void ModelBIN::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
RateHeterogeneity* rate = phylo_tree->getRate();

// MrBayes does not support Invariable Modifier for Binary data
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
warnLogStream("MrBayes does not support Invariable Sites with Binary Data! +I has been ignored!", out);
Expand Down
2 changes: 1 addition & 1 deletion model/modelbin.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class ModelBIN : public ModelMarkov
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);

};

Expand Down
66 changes: 66 additions & 0 deletions model/modelcodon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1142,6 +1142,72 @@ double ModelCodon::optimizeParameters(double gradient_epsilon) {
return score;
}

void ModelCodon::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
// NST should be 1 if fix_kappa is true (no ts/tv ratio), else it should be 2
int nst = fix_kappa ? 1 : 2;
int code = phylo_tree->aln->getGeneticCodeId();
string mrBayesCode = getMrBayesGeneticCode(code);
RateHeterogeneity* rate = phylo_tree->getRate();

if (rate->isFreeRate() || rate->getGammaShape() > 0.0 || rate->getPInvar() > 0.0) {
warnLogStream("MrBayes Codon Models do not support any Heterogenity Rate Modifiers! (+I, +G, +R) They have been ignored!", out);
}

if (mrBayesCode.empty()) {
warnLogStream("MrBayes Does Not Support Codon Code " + convertIntToString(code) + "! Defaulting to Universal (Code 1).", out);
mrBayesCode = "universal";
}

if (num_params == 0 || name.find('_') != string::npos) { // If this model is a Empirical / Semi-Empirical Model
warnLogStream("MrBayes does not support Empirical or Semi-Empirical Codon Models. State Frequency will still be set, but no rate matrix will be set.", out);
}

out << " lset applyto=(" << partition << ") nucmodel=codon omegavar=equal nst=" << nst << " code=" << mrBayesCode << ";" << endl;

if (!inclParams) return;

out << " prset applyto=(" << partition << ") statefreqpr=dirichlet(";

// State Frequency Prior
bool isFirst = true;
for (int i = 0; i < num_states; i++) {
if (phylo_tree->aln->isStopCodon(i)) continue;

if (!isFirst) out << ", ";
else isFirst = false;

out << state_freq[i];
}

out << ")";

/*
* TS/TV Ratio (Kappa)
* Ratio Should be:
* `beta(kappa, 1)` when `codon_kappa_style` is `CK_ONE_KAPPA` (kappa here represents the ts/tv rate ratio)
* `beta(kappa, 1)` when `codon_kappa_style` is `CK_ONE_KAPPA_TS` (kappa here represents the transition rate)
* `beta(1, kappa)` when `codon_kappa_style` is `CK_ONE_KAPPA_TV` (kappa here represents the transversion rate)
* `beta(kappa, kappa2)` when `codon_kappa_style` is `CK_TWO_KAPPA` (kappa here represents the transition rate, and kappa2 represents the transversion rate)
*/
if (!fix_kappa) {
double v1 = codon_kappa_style == CK_ONE_KAPPA_TV ? 1 : kappa;
double v2 = 1;
if (codon_kappa_style == CK_ONE_KAPPA_TV)
v2 = kappa;
else if (codon_kappa_style == CK_TWO_KAPPA)
v2 = kappa2;

out << " tratiopr=beta(" << v1 << ", " << v2 << ")";
}

// DN/DS Ratio (Omega)
// Ratio is set to `dirichlet(omega, 1)`
if (!fix_omega) {
out << " omegapr=dirichlet(" << omega << ", 1)";
}

out << ";" << endl;
}

void ModelCodon::writeInfo(ostream &out) {
if (name.find('_') == string::npos)
Expand Down
11 changes: 11 additions & 0 deletions model/modelcodon.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,17 @@ class ModelCodon: public ModelMarkov {
*/
virtual bool getVariables(double *variables);

/**
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
* @param out the ofstream to print the result to
* @param partition the partition to apply lset and prset to
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);

};

#endif /* MODELCODON_H_ */
3 changes: 2 additions & 1 deletion model/modeldna.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -565,9 +565,10 @@ void ModelDNA::setVariables(double *variables) {
// }
}

void ModelDNA::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
void ModelDNA::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
bool equalFreq = freq_type == FREQ_EQUAL;
short nst = 6;
RateHeterogeneity* rate = phylo_tree->getRate();

// Find NST value (1 for JC/JC69/F81, 2 for K80/K2P/HKY/HKY85, 6 for SYM/GTR)
// NST is set to 6 by default (above), so check for JC/JC69/F81 and K80/K2P/HKY/HKY85
Expand Down
3 changes: 1 addition & 2 deletions model/modeldna.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,13 @@ class ModelDNA : public ModelMarkov
/**
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
* @param rate the rate information
* @param out the ofstream to print the result to
* @param partition the partition to apply lset and prset to
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);

protected:

Expand Down
4 changes: 3 additions & 1 deletion model/modelmorphology.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,13 @@ void ModelMorphology::writeInfo(ostream &out) {
}
}

void ModelMorphology::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
void ModelMorphology::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
warnLogStream("MrBayes only supports Morphological Data with states from {0-9}!", out);
warnLogStream("Morphological Data with states {A-Z} may cause errors!", out);
warnLogStream("Use Morphological Models in MrBayes with Caution!", out);

RateHeterogeneity* rate = phylo_tree->getRate();

// MrBayes does not support Invariable Modifier for Morph data
if (rate->isFreeRate() || rate->getPInvar() > 0.0) {
warnLogStream("MrBayes does not support Invariable Sites with Morphological Data! +I has been ignored!", out);
Expand Down
3 changes: 1 addition & 2 deletions model/modelmorphology.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,13 @@ class ModelMorphology: public ModelMarkov {
/**
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
* @param rate the rate information
* @param out the ofstream to print the result to
* @param partition the partition to apply lset and prset to
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
};

#endif /* MODELMORPHOLOGY_H_ */
4 changes: 3 additions & 1 deletion model/modelprotein.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1162,10 +1162,12 @@ void ModelProtein::computeTipLikelihood(PML::StateType state, double *state_lk)
state_lk[ambi_aa[cstate*2+1]] = 1.0;
}

void ModelProtein::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
void ModelProtein::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
// Lset Parameters
out << " lset applyto=(" << partition << ") nucmodel=protein rates=";

RateHeterogeneity* rate = phylo_tree->getRate();

// RHAS Specification
// Free Rate should be substituted by +G+I
bool hasGamma = rate->getGammaShape() != 0.0 || rate->isFreeRate();
Expand Down
3 changes: 1 addition & 2 deletions model/modelprotein.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,13 @@ class ModelProtein : public ModelMarkov
/**
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
* @param rate the rate information
* @param out the ofstream to print the result to
* @param partition the partition to apply lset and prset to
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);

private:
ModelsBlock *models_block;
Expand Down
2 changes: 1 addition & 1 deletion model/modelsubst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ void ModelSubst::printMrBayesFreeRateReplacement(bool isSuperTree, string &chars
out << " pinvarpr=fixed(" << minValueCheckMrBayes(p_invar) << ")";
}

void ModelSubst::printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
void ModelSubst::printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams) {
out << " [MrBayes Block Output is not supported by this model!]" << endl;
outWarning("MrBayes Block Output is not supported by this model of name '" + name + "'!");
}
Expand Down
3 changes: 1 addition & 2 deletions model/modelsubst.h
Original file line number Diff line number Diff line change
Expand Up @@ -405,14 +405,13 @@ class ModelSubst: public Optimization, public CheckpointFactory
/**
* Print the model information in a format that can be accepted by MrBayes, using lset and prset.<br>
* By default, it simply prints a warning to the log and to the stream, stating that this model is not supported by MrBayes.
* @param rate the rate information
* @param out the ofstream to print the result to
* @param partition the partition to apply lset and prset to
* @param charset the current partition's charset. Useful for getting information from the checkpoint file
* @param isSuperTree whether the tree is a super tree. Useful for retrieving information from the checkpoint file, which has different locations for PhyloTree and PhyloSuperTree
* @param inclParams whether to include IQTree optimized parameters for the model
*/
virtual void printMrBayesModelText(RateHeterogeneity* rate, ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);
virtual void printMrBayesModelText(ofstream& out, string partition, string charset, bool isSuperTree, bool inclParams);

/*****************************************************
Checkpointing facility
Expand Down
Loading