diff --git a/model/modeldna.cpp b/model/modeldna.cpp index b4b7593f3..85acca1f9 100644 --- a/model/modeldna.cpp +++ b/model/modeldna.cpp @@ -347,10 +347,8 @@ bool ModelDNA::setRateType(const char *rate_str) { int ModelDNA::getNDim() { assert(freq_type != FREQ_UNKNOWN); - int ndim = num_params; - if (freq_type == FREQ_ESTIMATE) - ndim += num_states-1; - return ndim; + // possible TO-DO: cache nFreqParams(freq_type) to avoid repeat calls. + return (num_params+nFreqParams(freq_type)); } void ModelDNA::writeParameters(ostream &out) { @@ -369,68 +367,68 @@ void ModelDNA::writeParameters(ostream &out) { } } - +/* + * getVariables *writes* the variables (i.e. model parameters). + * Returns true if any variables have changed, false if not. + */ bool ModelDNA::getVariables(double *variables) { - int i; + int i; bool changed = false; - if (num_params > 0) { - int num_all = param_spec.length(); - if (verbose_mode >= VB_MAX) { - for (i = 1; i <= num_params; i++) - cout << " estimated variables[" << i << "] = " << variables[i] << endl; - } - for (i = 0; i < num_all; i++) - if (!param_fixed[param_spec[i]]) { + if (num_params > 0) { + int num_all = param_spec.length(); + if (verbose_mode >= VB_MAX) { + for (i = 1; i <= num_params; i++) + cout << " estimated variables[" << i << "] = " << variables[i] << endl; + } + for (i = 0; i < num_all; i++) + if (!param_fixed[param_spec[i]]) { changed |= (rates[i] != variables[(int)param_spec[i]]); - rates[i] = variables[(int)param_spec[i]]; - } - } - if (freq_type == FREQ_ESTIMATE) { - // 2015-09-07: relax the sum of state_freq to be 1, this will be done at the end of optimization - int ndim = getNDim(); - changed |= memcmpcpy(state_freq, variables+(ndim-num_states+2), (num_states-1)*sizeof(double)); -// double sum = 0; -// for (i = 0; i < num_states-1; i++) -// sum += state_freq[i]; -// state_freq[num_states-1] = 1.0 - sum; + rates[i] = variables[(int)param_spec[i]]; + } + + } + changed |= freqsFromParams(state_freq,variables+num_params+1,freq_type); + return changed; +// double sum = 0; +// for (i = 0; i < num_states-1; i++) +// sum += state_freq[i]; +// state_freq[num_states-1] = 1.0 - sum; // BUG FIX 2015.08.28 // int nrate = getNDim(); // if (freq_type == FREQ_ESTIMATE) nrate -= (num_states-1); -// double sum = 1.0; -// int i, j; -// for (i = 1; i < num_states; i++) -// sum += variables[nrate+i]; -// for (i = 0, j = 1; i < num_states; i++) -// if (i != highest_freq_state) { -// state_freq[i] = variables[nrate+j] / sum; -// j++; -// } -// state_freq[highest_freq_state] = 1.0/sum; - } - return changed; +// double sum = 1.0; +// int i, j; +// for (i = 1; i < num_states; i++) +// sum += variables[nrate+i]; +// for (i = 0, j = 1; i < num_states; i++) +// if (i != highest_freq_state) { +// state_freq[i] = variables[nrate+j] / sum; +// j++; +// } +// state_freq[highest_freq_state] = 1.0/sum; } +/* + * setVariables *reads* the variables (i.e. model parameters). + */ void ModelDNA::setVariables(double *variables) { - if (num_params > 0) { - int num_all = param_spec.length(); - for (int i = 0; i < num_all; i++) - if (!param_fixed[param_spec[i]]) - variables[(int)param_spec[i]] = rates[i]; - } - if (freq_type == FREQ_ESTIMATE) { - // 2015-09-07: relax the sum of state_freq to be 1, this will be done at the end of optimization - int ndim = getNDim(); - memcpy(variables+(ndim-num_states+2), state_freq, (num_states-1)*sizeof(double)); + if (num_params > 0) { + int num_all = param_spec.length(); + for (int i = 0; i < num_all; i++) + if (!param_fixed[param_spec[i]]) + variables[(int)param_spec[i]] = rates[i]; + } + // and copy parameters for base frequencies + paramsFromFreqs(variables+num_params+1, state_freq, freq_type); // BUG FIX 2015.08.28 // int nrate = getNDim(); // if (freq_type == FREQ_ESTIMATE) nrate -= (num_states-1); -// int i, j; -// for (i = 0, j = 1; i < num_states; i++) -// if (i != highest_freq_state) { -// variables[nrate+j] = state_freq[i] / state_freq[highest_freq_state]; -// j++; -// } - } +// int i, j; +// for (i = 0, j = 1; i < num_states; i++) +// if (i != highest_freq_state) { +// variables[nrate+j] = state_freq[i] / state_freq[highest_freq_state]; +// j++; +// } } diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 1c33fd719..040aa0914 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -321,53 +321,77 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree, ModelsBlock *models_ if (freq_str.length() > 2 && freq_str[2] == OPEN_BRACKET) { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with user-defined frequency is not allowed"); - close_bracket = freq_str.find(CLOSE_BRACKET); - if (close_bracket == string::npos) - outError("Close bracket not found in ", freq_str); - if (close_bracket != freq_str.length()-1) - outError("Wrong close bracket position ", freq_str); - freq_type = FREQ_USER_DEFINED; - freq_params = freq_str.substr(3, close_bracket-3); - } else if (freq_str == "+FC" || freq_str == "+Fc" || freq_str == "+F") { + close_bracket = freq_str.find(CLOSE_BRACKET); + if (close_bracket == string::npos) + outError("Close bracket not found in ", freq_str); + if (close_bracket != freq_str.length()-1) + outError("Wrong close bracket position ", freq_str); + freq_type = FREQ_USER_DEFINED; + freq_params = freq_str.substr(3, close_bracket-3); + } else if (freq_str == "+FC" || freq_str == "+Fc" || freq_str == "+F") { if (freq_type == FREQ_MIXTURE) { freq_params = "empirical," + freq_params; optimize_mixmodel_weight = true; } else freq_type = FREQ_EMPIRICAL; - } else if (freq_str == "+FU" || freq_str == "+Fu") { + } else if (freq_str == "+FU" || freq_str == "+Fu") { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with user-defined frequency is not allowed"); else freq_type = FREQ_USER_DEFINED; - } else if (freq_str == "+FQ" || freq_str == "+Fq") { + } else if (freq_str == "+FQ" || freq_str == "+Fq") { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with equal frequency is not allowed"); else - freq_type = FREQ_EQUAL; - } else if (freq_str == "+FO" || freq_str == "+Fo") { + freq_type = FREQ_EQUAL; + } else if (freq_str == "+FO" || freq_str == "+Fo") { if (freq_type == FREQ_MIXTURE) { freq_params = "optimize," + freq_params; optimize_mixmodel_weight = true; } else freq_type = FREQ_ESTIMATE; - } else if (freq_str == "+F1x4" || freq_str == "+F1X4") { + } else if (freq_str == "+F1x4" || freq_str == "+F1X4") { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with " + freq_str + " is not allowed"); else freq_type = FREQ_CODON_1x4; - } else if (freq_str == "+F3x4" || freq_str == "+F3X4") { + } else if (freq_str == "+F3x4" || freq_str == "+F3X4") { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with " + freq_str + " is not allowed"); else freq_type = FREQ_CODON_3x4; - } else if (freq_str == "+F3x4C" || freq_str == "+F3x4c" || freq_str == "+F3X4C" || freq_str == "+F3X4c") { + } else if (freq_str == "+F3x4C" || freq_str == "+F3x4c" || freq_str == "+F3X4C" || freq_str == "+F3X4c") { if (freq_type == FREQ_MIXTURE) outError("Mixture frequency with " + freq_str + " is not allowed"); else freq_type = FREQ_CODON_3x4C; - } else outError("Unknown state frequency type ",freq_str); -// model_str = model_str.substr(0, posfreq); + } else if (freq_str == "+FRY") { + // MDW to Minh: I don't know how these should interact with FREQ_MIXTURE, + // so as nearly everything else treats it as an error, I do too. + if (freq_type == FREQ_MIXTURE) + outError("Mixture frequency with " + freq_str + " is not allowed"); + else + freq_type = FREQ_DNA_RY; + } else if (freq_str == "+FWS") { + if (freq_type == FREQ_MIXTURE) + outError("Mixture frequency with " + freq_str + " is not allowed"); + else + freq_type = FREQ_DNA_WS; + } else if (freq_str == "+FMK") { + if (freq_type == FREQ_MIXTURE) + outError("Mixture frequency with " + freq_str + " is not allowed"); + else + freq_type = FREQ_DNA_MK; + } else { + // might be "+F####" where # are digits + try { + freq_type = parseStateFreqDigits(freq_str.substr(2)); // throws an error if not in +F#### format + } catch (...) { + outError("Unknown state frequency type ",freq_str); + } } +// model_str = model_str.substr(0, posfreq); + } /******************** initialize model ****************************/ diff --git a/model/modelliemarkov.cpp b/model/modelliemarkov.cpp index 28adc0ae2..ff409afb3 100644 --- a/model/modelliemarkov.cpp +++ b/model/modelliemarkov.cpp @@ -344,6 +344,17 @@ void ModelLieMarkov::init(const char *model_name, string model_params, StateFreq } setBasis(); // sets basis and num_params + /* + * Check validity of freq_type. (After setBasis because that sets bdf, + * if needed could move before setBasis with minor changes) + */ + if (!validFreqType()) { + throw("Can't use base frequency type " + +freqTypeString(freq_type) + +" with Lie-Markov model " + +name); + } + if (model_parameters) delete[] model_parameters; model_parameters = new double [num_params]; @@ -381,6 +392,60 @@ ModelLieMarkov::~ModelLieMarkov() { // Do nothing, for now. model_parameters is reclaimed in ~ModelMarkov } +/* + * Return 'true' if freq type is compatible with this Lie-Markov model. + * NOTE: Any freq_type exept FREQ_USER_DEFINED, FREQ_EMPIRICAL or + * FREQ_ESTIMATE is at best redundant, and worst incompatible. + * The above three are really the only freq types which should be used + * with an LM model. + * Actually, the +F1123s are valid with a BDF=3 LM model, but + * I haven't coded for this possibility so reject it. Could be fixed. + * + * Also for FREQ_USER_DEFINED and FREQ_EMPIRICAL, for LM models with + * BDF<3, compatibility depends on the given base freqs. There + * is code elsewhere which prints a warning if incompatible base freqs + * (and actual model base freqs will be 'close to' the requested freqs.) + */ +bool ModelLieMarkov::validFreqType() { + int bdf=BDF[model_num]; + switch(getFreqType()) { + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + case FREQ_ESTIMATE: + return true; + case FREQ_UNKNOWN: + case FREQ_CODON_1x4: + case FREQ_CODON_3x4: + case FREQ_CODON_3x4C: + case FREQ_MIXTURE: + case FREQ_DNA_1112: + case FREQ_DNA_1121: + case FREQ_DNA_1211: + case FREQ_DNA_2111: + case FREQ_DNA_1123: + case FREQ_DNA_1213: + case FREQ_DNA_1231: + case FREQ_DNA_2113: + case FREQ_DNA_2131: + case FREQ_DNA_2311: + return false; + case FREQ_EQUAL: + return (bdf==0); + case FREQ_DNA_RY: return("+FRY"); + return (bdf==2 && symmetry==0); + case FREQ_DNA_WS: return("+FWS"); + return (bdf==2 && symmetry==1); + case FREQ_DNA_MK: return("+FMK"); + return (bdf==2 && symmetry==2); + case FREQ_DNA_1122: return("+F1122"); + return (bdf==1 && symmetry==2); + case FREQ_DNA_1212: return("+F1212"); + return (bdf==1 && symmetry==0); + case FREQ_DNA_1221: return("+F1221"); + return (bdf==1 && symmetry==1); + default: throw("Unrecoginzed freq_type in validFreqType - can't happen"); + } +} /* static */ bool ModelLieMarkov::validModelName(string model_name) { int model_num, symmetry; diff --git a/model/modelliemarkov.h b/model/modelliemarkov.h index 64ee6fd89..555572da5 100644 --- a/model/modelliemarkov.h +++ b/model/modelliemarkov.h @@ -62,6 +62,6 @@ class ModelLieMarkov: public ModelMarkov { const static int NUM_RATES; const static int NUM_LM_MODELS; */ + bool validFreqType(); }; - #endif /* MODELLIEMARKOV_H_ */ diff --git a/model/modelmarkov.cpp b/model/modelmarkov.cpp index 7d84822f3..f9eec59e0 100644 --- a/model/modelmarkov.cpp +++ b/model/modelmarkov.cpp @@ -154,6 +154,11 @@ void ModelMarkov::setTree(PhyloTree *tree) { } string ModelMarkov::getName() { + // MDW note to Minh for code review: I don't really understand what getName() + // is used for. I've tried to keep the old behaviour while adding + // the new freq_types, but give this change extra attention please. + return name+freqTypeString(getFreqType()); + /* if (getFreqType() == FREQ_EMPIRICAL) return name + "+F"; else if (getFreqType() == FREQ_CODON_1x4) @@ -168,6 +173,7 @@ string ModelMarkov::getName() { return name + "+FQ"; else return name; + */ } string ModelMarkov::getNameParams() { @@ -187,66 +193,83 @@ string ModelMarkov::getNameParams() { } void ModelMarkov::getNameParamsFreq(ostream &retname) { - if (getFreqType() == FREQ_EMPIRICAL || (getFreqType() == FREQ_USER_DEFINED && phylo_tree->aln->seq_type == SEQ_DNA)) { - retname << "+F"; + retname << freqTypeString(freq_type); // "+F..." but without {frequencies} + if (phylo_tree->aln->seq_type == SEQ_DNA && + freq_type != FREQ_UNKNOWN && + freq_type != FREQ_EQUAL && + freq_type != FREQ_MIXTURE) { + // Add "{}" retname << "{" << state_freq[0]; for (int i = 1; i < num_states; i++) retname << "," << state_freq[i]; retname << "}"; - } else if (getFreqType() == FREQ_CODON_1x4) - retname << "+F1X4"; - else if (getFreqType() == FREQ_CODON_3x4) - retname << "+F3X4"; - else if (getFreqType() == FREQ_CODON_3x4C) - name += "+F3X4C"; - else if (getFreqType() == FREQ_ESTIMATE) { - retname << "+FO"; + } + /* + if (getFreqType() == FREQ_EMPIRICAL || (getFreqType() == FREQ_USER_DEFINED && phylo_tree->aln->seq_type == SEQ_DNA)) { + retname << "+F"; + retname << "{" << state_freq[0]; + for (int i = 1; i < num_states; i++) + retname << "," << state_freq[i]; + retname << "}"; + } else if (getFreqType() == FREQ_CODON_1x4) + retname << "+F1X4"; + else if (getFreqType() == FREQ_CODON_3x4) + retname << "+F3X4"; + else if (getFreqType() == FREQ_CODON_3x4C) + name += "+F3X4C"; + else if (getFreqType() == FREQ_ESTIMATE) { + retname << "+FO"; retname << "{" << state_freq[0]; for (int i = 1; i < num_states; i++) retname << "," << state_freq[i]; retname << "}"; } else if (getFreqType() == FREQ_EQUAL && phylo_tree->aln->seq_type != SEQ_DNA) - retname << "+FQ"; + retname << "+FQ"; + */ } void ModelMarkov::init_state_freq(StateFreqType type) { - //if (type == FREQ_UNKNOWN) return; - int i; - freq_type = type; - assert(freq_type != FREQ_UNKNOWN); - switch (freq_type) { - case FREQ_EQUAL: - if (phylo_tree->aln->seq_type == SEQ_CODON) { - int nscodon = phylo_tree->aln->getNumNonstopCodons(); - double freq_codon = (1.0-(num_states-nscodon)*MIN_FREQUENCY)/(nscodon); - for (i = 0; i < num_states; i++) - if (phylo_tree->aln->isStopCodon(i)) - state_freq[i] = MIN_FREQUENCY; - else - state_freq[i] = freq_codon; - } else { + //if (type == FREQ_UNKNOWN) return; + int i; + freq_type = type; + assert(freq_type != FREQ_UNKNOWN); + switch (freq_type) { + case FREQ_EQUAL: + if (phylo_tree->aln->seq_type == SEQ_CODON) { + int nscodon = phylo_tree->aln->getNumNonstopCodons(); + double freq_codon = (1.0-(num_states-nscodon)*MIN_FREQUENCY)/(nscodon); + for (i = 0; i < num_states; i++) + if (phylo_tree->aln->isStopCodon(i)) + state_freq[i] = MIN_FREQUENCY; + else + state_freq[i] = freq_codon; + } else { double freq_state = 1.0/num_states; - for (i = 0; i < num_states; i++) - state_freq[i] = freq_state; - } - break; - case FREQ_ESTIMATE: - case FREQ_EMPIRICAL: - if (phylo_tree->aln->seq_type == SEQ_CODON) { - double ntfreq[12]; - phylo_tree->aln->computeCodonFreq(freq_type, state_freq, ntfreq); -// phylo_tree->aln->computeCodonFreq(state_freq); - } else if (phylo_tree->aln->seq_type != SEQ_POMO) - phylo_tree->aln->computeStateFreq(state_freq); - for (i = 0; i < num_states; i++) - if (state_freq[i] > state_freq[highest_freq_state]) - highest_freq_state = i; - break; - case FREQ_USER_DEFINED: - if (state_freq[0] == 0.0) outError("State frequencies not specified"); - break; - default: break; - } + for (i = 0; i < num_states; i++) + state_freq[i] = freq_state; + } + break; + case FREQ_ESTIMATE: + case FREQ_EMPIRICAL: + if (phylo_tree->aln->seq_type == SEQ_CODON) { + double ntfreq[12]; + phylo_tree->aln->computeCodonFreq(freq_type, state_freq, ntfreq); +// phylo_tree->aln->computeCodonFreq(state_freq); + } else if (phylo_tree->aln->seq_type != SEQ_POMO) + phylo_tree->aln->computeStateFreq(state_freq); + for (i = 0; i < num_states; i++) + if (state_freq[i] > state_freq[highest_freq_state]) + highest_freq_state = i; + break; + case FREQ_USER_DEFINED: + if (state_freq[0] == 0.0) outError("State frequencies not specified"); + break; + default: break; + } + if (phylo_tree->aln->seq_type == SEQ_DNA) { + // For complex DNA freq_types, adjust state_freq to conform to that freq_type. + forceFreqsConform(state_freq, freq_type); + } } void ModelMarkov::init(StateFreqType type) { @@ -542,7 +565,10 @@ int ModelMarkov::getNDimFreq() { else if (freq_type == FREQ_CODON_3x4 || freq_type == FREQ_CODON_3x4C) return 9; - return 0; + if (phylo_tree->aln->seq_type == SEQ_DNA) { + return nFreqParams(freq_type); + } + return 0; } void ModelMarkov::scaleStateFreq(bool sum_one) { @@ -668,28 +694,18 @@ bool ModelMarkov::isUnstableParameters() { } void ModelMarkov::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) { - assert(is_reversible && "setBounds should only be called on subclass of ModelMarkov"); - int i, ndim = getNDim(); + int i, ndim = getNDim(); - for (i = 1; i <= ndim; i++) { - //cout << variables[i] << endl; - lower_bound[i] = MIN_RATE; - upper_bound[i] = MAX_RATE; - bound_check[i] = false; - } + for (i = 1; i <= ndim; i++) { + //cout << variables[i] << endl; + lower_bound[i] = MIN_RATE; + upper_bound[i] = MAX_RATE; + bound_check[i] = false; + } - if (freq_type == FREQ_ESTIMATE) { - for (i = ndim-num_states+2; i <= ndim; i++) { -// lower_bound[i] = MIN_FREQUENCY/state_freq[highest_freq_state]; -// upper_bound[i] = state_freq[highest_freq_state]/MIN_FREQUENCY; - lower_bound[i] = MIN_FREQUENCY; -// upper_bound[i] = 100.0; - upper_bound[i] = 1.0; - bound_check[i] = false; - } - } + setBoundsForFreqType(&lower_bound[num_params+1],&upper_bound[num_params+1],&bound_check[num_params+1],MIN_FREQUENCY,freq_type); } double ModelMarkov::optimizeParameters(double gradient_epsilon) { diff --git a/tools.cpp b/tools.cpp index 6e4a687da..c68bf2ef3 100644 --- a/tools.cpp +++ b/tools.cpp @@ -31,6 +31,7 @@ #include "timeutil.h" #include "gzstream.h" #include "MPIHelper.h" +#include "alignment.h" VerboseMode verbose_mode; @@ -1962,7 +1963,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { if (strcmp(argv[cnt], "-f") == 0) { cnt++; if (cnt >= argc) - throw "Use -f "; + throw "Use -f >"; if (strcmp(argv[cnt], "q") == 0 || strcmp(argv[cnt], "EQ") == 0) params.freq_type = FREQ_EQUAL; else if (strcmp(argv[cnt], "c") == 0 @@ -1974,8 +1975,18 @@ void parseArg(int argc, char *argv[], Params ¶ms) { else if (strcmp(argv[cnt], "u") == 0 || strcmp(argv[cnt], "UD") == 0) params.freq_type = FREQ_USER_DEFINED; + else if (strcmp(argv[cnt], "ry") == 0 + || strcmp(argv[cnt], "RY") == 0) + params.freq_type = FREQ_DNA_RY; + else if (strcmp(argv[cnt], "ws") == 0 + || strcmp(argv[cnt], "WS") == 0) + params.freq_type = FREQ_DNA_WS; + else if (strcmp(argv[cnt], "mk") == 0 + || strcmp(argv[cnt], "MK") == 0) + params.freq_type = FREQ_DNA_MK; else - throw "Use -f "; + // throws error message if can't parse + params.freq_type = parseStateFreqDigits(argv[cnt]); continue; } if (strcmp(argv[cnt], "-fs") == 0) { @@ -3469,13 +3480,13 @@ void usage_iqtree(char* argv[], bool full_command) { << " Morphology/SNP: MK (default), ORDERED" << endl << " Lie Markov DNA: One of the following, optionally prefixed by RY, WS or MK:" << endl << " 1.1, 2.2b, 3.3a, 3.3b, 3.3c," << endl - << " 3.4, 4.4a, 4.4b, 4.5a, 4.5b," << endl - << " 5.6a, 5.6b, 5.7a, 5.7b, 5.7c," << endl - << " 5.11a,5.11b,5.11c,5.16, 6.6," << endl - << " 6.7a, 6.7b, 6.8a, 6.8b, 6.17a," << endl - << " 6.17b,8.8, 8.10a,8.10b, 8.16," << endl - << " 8.17, 8.18, 9.20a,9.20b,10.12," << endl - << " 10.34,12.12" << endl + << " 3.4, 4.4a, 4.4b, 4.5a, 4.5b," << endl + << " 5.6a, 5.6b, 5.7a, 5.7b, 5.7c," << endl + << " 5.11a,5.11b,5.11c,5.16, 6.6," << endl + << " 6.7a, 6.7b, 6.8a, 6.8b, 6.17a," << endl + << " 6.17b,8.8, 8.10a,8.10b, 8.16," << endl + << " 8.17, 8.18, 9.20a,9.20b,10.12," << endl + << " 10.34,12.12" << endl << " Non-reversible: STRSYM (strand symmetric model, synonymous with WS6.6)" << endl << " Non-reversible: UNREST (most general unrestricted model, functionally equivalent to 12.12)" << endl << " Otherwise: Name of file containing user-model parameters" << endl @@ -3485,6 +3496,11 @@ void usage_iqtree(char* argv[], bool full_command) { << " +F Empirically counted frequencies from alignment" << endl << " +FO (letter-O) Optimized frequencies by maximum-likelihood" << endl << " +FQ Equal frequencies" << endl + << " +FRY, +FWS, +FMK For DNA models only, +FRY is freq(A+G)=1/2=freq(C+T)," << endl + << " +FWS is freq(A+T)=1/2=freq(C+G), +FMK is freq(A+C)=1/2=freq(G+T)." << endl + << " +F#### where # are digits - for DNA models only, for basis in ACGT order," << endl + << " digits indicate which frequencies are constrained to be the same." << endl + << " E.g. +F1221 means freq(A)=freq(T), freq(C)=freq(G)." << endl << " +FU Amino-acid frequencies by the given protein matrix" << endl << " +F1x4 (codon model) Equal NT frequencies over three codon positions" << endl << " +F3x4 (codon model) Unequal NT frequencies over three codon positions" << endl @@ -4295,3 +4311,507 @@ int pairInteger(int int1, int int2) { return ((int1 + int2)*(int1 + int2 + 1)/2 + int1); } } + +/* + * Given a string of 4 digits, return a StateFreqType according to + * equality constraints expressed by those digits. + * E.g. "1233" constrains pi_G=pi_T (ACGT order, 3rd and 4th equal) + * which results in FREQ_DNA_2311. "5288" would give the same result. + */ + +StateFreqType parseStateFreqDigits(string digits) { + bool good = true; + if (digits.length()!=4) { + good = false; + } else { + // Convert digits to canonical form, first occuring digit becomes 1 etc. + int digit_order[] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}; + int first_found = 0; + for (int i=0; i<4; i++) { + int digit = digits[i]-'0'; + if (digit<0 || digit>9) { + good = false; // found a non-digit + break; + } + if (digit_order[digit]==-1) { + // haven't seen this digit before + digit_order[digit]=++first_found; + } + // rewrite digit in canonical form + digits[i] = '0'+digit_order[digit]; + } + // e.g. if digits was "5288", digit_order will end up as {-1,-1,2,-1,-1,1,-1,-1,3,-1} + } + if (!good) throw "Use -f >"; + StateFreqType freq_type = FREQ_UNKNOWN; + // Now just exhaustively list all canonical digit possibilities + if (digits.compare("1111")==0) { + freq_type = FREQ_EQUAL; + } else if (digits.compare("1112")==0) { + freq_type = FREQ_DNA_1112; + } else if (digits.compare("1121")==0) { + freq_type = FREQ_DNA_1121; + } else if (digits.compare("1211")==0) { + freq_type = FREQ_DNA_1211; + } else if (digits.compare("1222")==0) { + freq_type = FREQ_DNA_2111; + } else if (digits.compare("1122")==0) { + freq_type = FREQ_DNA_1122; + } else if (digits.compare("1212")==0) { + freq_type = FREQ_DNA_1212; + } else if (digits.compare("1221")==0) { + freq_type = FREQ_DNA_1221; + } else if (digits.compare("1123")==0) { + freq_type = FREQ_DNA_1123; + } else if (digits.compare("1213")==0) { + freq_type = FREQ_DNA_1213; + } else if (digits.compare("1231")==0) { + freq_type = FREQ_DNA_1231; + } else if (digits.compare("1223")==0) { + freq_type = FREQ_DNA_2113; + } else if (digits.compare("1232")==0) { + freq_type = FREQ_DNA_2131; + } else if (digits.compare("1233")==0) { + freq_type = FREQ_DNA_2311; + } else if (digits.compare("1234")==0) { + freq_type = FREQ_ESTIMATE; + } else + throw ("Unrecognized canonical digits - Can't happen"); // paranoia is good. + return freq_type; +} + +/* + * For freq_type, return a "+F" string specifying that freq_type. + * Note not all freq_types accomodated. + * Inverse of this occurs in ModelFactory::ModelFactory, + * where +F... suffixes on model names get parsed. + */ +string freqTypeString(StateFreqType freq_type) { + switch(freq_type) { + case FREQ_UNKNOWN: return(""); + case FREQ_USER_DEFINED: return("+FU"); + case FREQ_EQUAL: return("+FQ"); + case FREQ_EMPIRICAL: return("+F"); + case FREQ_ESTIMATE: return("+FO"); + case FREQ_CODON_1x4: return("+F1X4"); + case FREQ_CODON_3x4: return("+F3X4"); + case FREQ_CODON_3x4C: return("+F3X4C"); + case FREQ_MIXTURE: return(""); // no idea what to do here - MDW + case FREQ_DNA_RY: return("+FRY"); + case FREQ_DNA_WS: return("+FWS"); + case FREQ_DNA_MK: return("+FMK"); + case FREQ_DNA_1112: return("+F1112"); + case FREQ_DNA_1121: return("+F1121"); + case FREQ_DNA_1211: return("+F1211"); + case FREQ_DNA_2111: return("+F2111"); + case FREQ_DNA_1122: return("+F1122"); + case FREQ_DNA_1212: return("+F1212"); + case FREQ_DNA_1221: return("+F1221"); + case FREQ_DNA_1123: return("+F1123"); + case FREQ_DNA_1213: return("+F1213"); + case FREQ_DNA_1231: return("+F1231"); + case FREQ_DNA_2113: return("+F2113"); + case FREQ_DNA_2131: return("+F2131"); + case FREQ_DNA_2311: return("+F2311"); + default: throw("Unrecoginzed freq_type in freqTypeString - can't happen"); + } +} + +/* + * All params in range [0,1] + * returns true if base frequencies have changed as a result of this call + */ + +bool freqsFromParams(double *freq_vec, double *params, StateFreqType freq_type) { + double pA, pC, pG, pT; // base freqs + switch (freq_type) { + case FREQ_EQUAL: + pA=pC=pG=pT=0.25; + break; + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + pA=freq_vec[0]; // i.e. freq_vec will not be changed + pC=freq_vec[1]; + pG=freq_vec[2]; + pT=freq_vec[3]; + break; + case FREQ_ESTIMATE: // Minh: in code review, please pay extra attention to ensure my treadment of FREQ_ESTIMATE is equivalent to old treatment. + pA=params[0]; + pC=params[1]; + pG=params[2]; + pT=1-pA-pC-pG; + break; + case FREQ_DNA_RY: + pA = params[0]/2; + pG = 0.5-pA; + pC = params[1]/2; + pT = 0.5-pC; + break; + case FREQ_DNA_WS: + pA = params[0]/2; + pT = 0.5-pA; + pC = params[1]/2; + pG = 0.5-pC; + break; + case FREQ_DNA_MK: + pA = params[0]/2; + pC = 0.5-pA; + pG = params[1]/2; + pT = 0.5-pG; + break; + case FREQ_DNA_1112: + pA = pC = pG = params[0]/3; + pT = 1-3*pA; + break; + case FREQ_DNA_1121: + pA = pC = pT = params[0]/3; + pG = 1-3*pA; + break; + case FREQ_DNA_1211: + pA = pG = pT = params[0]/3; + pC = 1-3*pA; + break; + case FREQ_DNA_2111: + pC = pG = pT = params[0]/3; + pA = 1-3*pC; + break; + case FREQ_DNA_1122: + pA = params[0]/2; + pC = pA; + pG = 0.5-pA; + pT = pG; + break; + case FREQ_DNA_1212: + pA = params[0]/2; + pG = pA; + pC = 0.5-pA; + pT = pC; + break; + case FREQ_DNA_1221: + pA = params[0]/2; + pT = pA; + pC = 0.5-pA; + pG = pC; + break; + case FREQ_DNA_1123: + pA = params[0]/2; + pC = pA; + pG = params[1]*(1-2*pA); + pT = 1-pG-2*pA; + break; + case FREQ_DNA_1213: + pA = params[0]/2; + pG = pA; + pC = params[1]*(1-2*pA); + pT = 1-pC-2*pA; + break; + case FREQ_DNA_1231: + pA = params[0]/2; + pT = pA; + pC = params[1]*(1-2*pA); + pG = 1-pC-2*pA; + break; + case FREQ_DNA_2113: + pC = params[0]/2; + pG = pC; + pA = params[1]*(1-2*pC); + pT = 1-pA-2*pC; + break; + case FREQ_DNA_2131: + pC = params[0]/2; + pT = pC; + pA = params[1]*(1-2*pC); + pG = 1-pA-2*pC; + break; + case FREQ_DNA_2311: + pG = params[0]/2; + pT = pG; + pA = params[1]*(1-2*pG); + pC = 1-pA-2*pG; + break; + default: + throw("Unrecognized freq_type in freqsFromParams - can't happen"); + } + bool changed = freq_vec[0]!=pA || freq_vec[1]!=pC || freq_vec[2]!=pG || freq_vec[3]!=pT; + if (changed) { + freq_vec[0]=pA; + freq_vec[1]=pC; + freq_vec[2]=pG; + freq_vec[3]=pT; + } + return(changed); +} + +/* + * For given freq_type, derives frequency parameters from freq_vec + * All parameters are in range [0,1] (assuming freq_vec is valid) + */ + +void paramsFromFreqs(double *params, double *freq_vec, StateFreqType freq_type) { + double pA = freq_vec[0]; // These just improve code readability + double pC = freq_vec[1]; + double pG = freq_vec[2]; + double pT = freq_vec[3]; + switch (freq_type) { + case FREQ_EQUAL: + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + break; // freq_vec never changes + case FREQ_ESTIMATE: + params[0]=pA; + params[1]=pC; + params[2]=pG; + break; + case FREQ_DNA_RY: + params[0]=2*pA; + params[1]=2*pC; + break; + case FREQ_DNA_WS: + params[0]=2*pA; + params[1]=2*pC; + break; + case FREQ_DNA_MK: + params[0]=2*pA; + params[1]=2*pG; + break; + case FREQ_DNA_1112: + params[0]=3*pA; + break; + case FREQ_DNA_1121: + params[0]=3*pA; + break; + case FREQ_DNA_1211: + params[0]=3*pA; + break; + case FREQ_DNA_2111: + params[0]=3*pC; + break; + case FREQ_DNA_1122: + params[0]=2*pA; + break; + case FREQ_DNA_1212: + params[0]=2*pA; + break; + case FREQ_DNA_1221: + params[0]=2*pA; + break; + case FREQ_DNA_1123: + params[0]=2*pA; + params[1]=pG/(1-params[0]); + break; + case FREQ_DNA_1213: + params[0]=2*pA; + params[1]=pC/(1-params[0]); + break; + case FREQ_DNA_1231: + params[0]=2*pA; + params[1]=pC/(1-params[0]); + break; + case FREQ_DNA_2113: + params[0]=2*pC; + params[1]=pA/(1-params[0]); + break; + case FREQ_DNA_2131: + params[0]=2*pC; + params[1]=pA/(1-params[0]); + break; + case FREQ_DNA_2311: + params[0]=2*pG; + params[1]=pA/(1-params[0]); + break; + default: + throw("Unrecognized freq_type in paramsFromFreqs - can't happen"); + } +} + +/* + * Given a DNA freq_type and a base frequency vector, alter the + * base freq vector to conform with the constraints of freq_type + */ +void forceFreqsConform(double *base_freq, StateFreqType freq_type) { + double pA = base_freq[0]; // These just improve code readability + double pC = base_freq[1]; + double pG = base_freq[2]; + double pT = base_freq[3]; + double diff; + switch (freq_type) { + case FREQ_EQUAL: + base_freq[0] = base_freq[1] = base_freq[2] = base_freq[3] = 0.25; + break; + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + case FREQ_ESTIMATE: + break; // any base_freq is legal + case FREQ_DNA_RY: + diff = (pA+pG-pC-pT)/2; + base_freq[0] = pA-diff; + base_freq[1] = pC+diff; + base_freq[2] = pG-diff; + base_freq[3] = pT+diff; + break; + case FREQ_DNA_WS: + diff = (pA+pT-pC-pG)/2; + base_freq[0] = pA-diff; + base_freq[1] = pC+diff; + base_freq[2] = pG+diff; + base_freq[3] = pT-diff; + break; + case FREQ_DNA_MK: + diff = (pA+pC-pG-pT)/2; + base_freq[0] = pA-diff; + base_freq[1] = pC-diff; + base_freq[2] = pG+diff; + base_freq[3] = pT+diff; + break; + case FREQ_DNA_1112: + base_freq[0]=base_freq[1]=base_freq[2]=(pA+pC+pG)/3; + break; + case FREQ_DNA_1121: + base_freq[0]=base_freq[1]=base_freq[3]=(pA+pC+pT)/3; + break; + case FREQ_DNA_1211: + base_freq[0]=base_freq[2]=base_freq[3]=(pA+pG+pT)/3; + break; + case FREQ_DNA_2111: + base_freq[1]=base_freq[2]=base_freq[3]=(pC+pG+pT)/3; + break; + case FREQ_DNA_1122: + base_freq[0]=base_freq[1]=(pA+pC)/2; + base_freq[2]=base_freq[3]=(pG+pT)/2; + break; + case FREQ_DNA_1212: + base_freq[0]=base_freq[2]=(pA+pG)/2; + base_freq[1]=base_freq[3]=(pC+pT)/2; + break; + case FREQ_DNA_1221: + base_freq[0]=base_freq[3]=(pA+pT)/2; + base_freq[1]=base_freq[2]=(pC+pG)/2; + break; + case FREQ_DNA_1123: + base_freq[0]=base_freq[1]=(pA+pC)/2; + break; + case FREQ_DNA_1213: + base_freq[0]=base_freq[2]=(pA+pG)/2; + break; + case FREQ_DNA_1231: + base_freq[0]=base_freq[3]=(pA+pT)/2; + break; + case FREQ_DNA_2113: + base_freq[1]=base_freq[2]=(pC+pG)/2; + break; + case FREQ_DNA_2131: + base_freq[1]=base_freq[3]=(pC+pT)/2; + break; + case FREQ_DNA_2311: + base_freq[2]=base_freq[3]=(pG+pT)/2; + break; + default: + throw("Unrecognized freq_type in forceFreqsConform - can't happen"); + } +} + +/* + * For given freq_type, how many parameters are needed to + * determine frequenc vector? + * Currently, this is for DNA StateFreqTypes only. + */ + +int nFreqParams(StateFreqType freq_type) { + switch (freq_type) { + case FREQ_EQUAL: + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + return(0); + case FREQ_DNA_1112: + case FREQ_DNA_1121: + case FREQ_DNA_1211: + case FREQ_DNA_2111: + case FREQ_DNA_1122: + case FREQ_DNA_1212: + case FREQ_DNA_1221: + return(1); + case FREQ_DNA_RY: + case FREQ_DNA_WS: + case FREQ_DNA_MK: + case FREQ_DNA_1123: + case FREQ_DNA_1213: + case FREQ_DNA_1231: + case FREQ_DNA_2113: + case FREQ_DNA_2131: + case FREQ_DNA_2311: + return(2); + case FREQ_ESTIMATE: + return(3); + default: + throw("Unrecognized freq_type in freqsFromParams - can't happen"); + } +} + +/* + * For freq_type, and given every base must have frequency >= min_freq, set upper + * and lower bounds for parameters. + */ + void setBoundsForFreqType(double *lower_bound, + double *upper_bound, + bool *bound_check, + double min_freq, + StateFreqType freq_type) { + // Sanity check: if min_freq==0, lower_bound=0 and upper_bound=1 + // (except FREQ_ESTIMATE, which follows legacy code way of doing things.) + switch (freq_type) { + case FREQ_EQUAL: + case FREQ_USER_DEFINED: + case FREQ_EMPIRICAL: + break; // There are no frequency determining parameters + case FREQ_DNA_1112: + case FREQ_DNA_1121: + case FREQ_DNA_1211: + case FREQ_DNA_2111: + // one frequency determining parameter + lower_bound[0] = 3*min_freq; + upper_bound[0] = 1-min_freq; + bound_check[0] = true; + break; + case FREQ_DNA_1122: + case FREQ_DNA_1212: + case FREQ_DNA_1221: + // one frequency determining parameter + lower_bound[0] = 2*min_freq; + upper_bound[0] = 1-2*min_freq; + bound_check[0] = true; + break; + case FREQ_DNA_RY: + case FREQ_DNA_WS: + case FREQ_DNA_MK: + // two frequency determining parameters + lower_bound[0] = lower_bound[1] = 2*min_freq; + upper_bound[0] = upper_bound[1] = 1-2*min_freq; + bound_check[0] = bound_check[1] = true; + break; + case FREQ_DNA_1123: + case FREQ_DNA_1213: + case FREQ_DNA_1231: + case FREQ_DNA_2113: + case FREQ_DNA_2131: + case FREQ_DNA_2311: + // two frequency determining parameters + lower_bound[0] = 2*min_freq; + upper_bound[0] = 1-2*min_freq; + lower_bound[1] = min_freq/(1-2*min_freq); + upper_bound[1] = (1-3*min_freq)/(1-2*min_freq); + bound_check[0] = bound_check[1] = true; + break; + /* NOTE: + * upper_bound[1] and lower_bound[1] are not perfect. Some in-bounds parameters + * will give base freqs for '2' or '3' base below minimum. This is + * the best that can be done without passing min_freq to freqsFromParams + */ + case FREQ_ESTIMATE: + lower_bound[0] = lower_bound[1] = lower_bound[2] = min_freq; + upper_bound[0] = upper_bound[1] = upper_bound[2] = 1; + bound_check[0] = bound_check[1] = bound_check[2] = false; + break; + default: + throw("Unrecognized freq_type in setBoundsForFreqType - can't happen"); + } +} diff --git a/tools.h b/tools.h index 9bda0f3c5..71505a047 100644 --- a/tools.h +++ b/tools.h @@ -382,7 +382,16 @@ enum TestType { enum StateFreqType { FREQ_UNKNOWN, FREQ_USER_DEFINED, FREQ_EQUAL, FREQ_EMPIRICAL, FREQ_ESTIMATE, FREQ_CODON_1x4, FREQ_CODON_3x4, FREQ_CODON_3x4C, // special frequency for codon model - FREQ_MIXTURE // mixture-frequency model + FREQ_MIXTURE, // mixture-frequency model + // FREQ_DNA_RY has pi_A+pi_G = 0.5 = pi_C+pi_T. Similarly WS pairs (AT)(CG), + // MK pairs (AC)(GT) in same way. + FREQ_DNA_RY, FREQ_DNA_WS, FREQ_DNA_MK, + // in following, digits indicate which frequencies must equal each other + // (in ACGT order), e.g. 2131 means pi_C=pi_T (pi_A, pi_G unconstrained) + FREQ_DNA_1112, FREQ_DNA_1121, FREQ_DNA_1211, FREQ_DNA_2111, + FREQ_DNA_1122, FREQ_DNA_1212, FREQ_DNA_1221, + FREQ_DNA_1123, FREQ_DNA_1213, FREQ_DNA_1231, + FREQ_DNA_2113, FREQ_DNA_2131, FREQ_DNA_2311, }; /** @@ -2540,6 +2549,57 @@ bool memcmpcpy(void * destination, const void * source, size_t num); */ int pairInteger(int int1, int int2); +/* + * Given a string of 4 digits, return a StateFreqType according to + * equality constraints expressed by those digits. + * E.g. "1233" constrains pi_G=pi_T (ACGT order, 3rd and 4th equal) + * which results in FREQ_DNA_2311. "5288" would give the same result. + */ +StateFreqType parseStateFreqDigits(string digits); + +/* + * For freq_type, return a "+F" string specifying that freq_type. + * Note not all freq_types accomodated. + * Inverse of this occurs in ModelFactory::ModelFactory, + * where +F... suffixes on model names get parsed. + */ +string freqTypeString(StateFreqType freq_type); + +/* + * All params in range [0,1] + * returns true if base frequencies have changed as a result of this call + */ +bool freqsFromParams(double *freq_vec, double *params, StateFreqType freq_type); + +/* + * For given freq_type, derives frequency parameters from freq_vec + * All parameters are in range [0,1] (assuming freq_vec is valid) + */ +void paramsFromFreqs(double *params, double *freq_vec, StateFreqType freq_type); + +/* + * Given a DNA freq_type and a base frequency vector, alter the + * base freq vector to conform with the constraints of freq_type + */ +void forceFreqsConform(double *base_freq, StateFreqType freq_type); + +/* + * For given freq_type, how many parameters are needed to + * determine frequenc vector? + * Currently, this is for DNA StateFreqTypes only. + */ + int nFreqParams(StateFreqType freq_type); + +/* + * For freq_type, and given every base must have frequency >= min_freq, set upper + * and lower bounds for parameters. + */ + void setBoundsForFreqType(double *lower_bound, + double *upper_bound, + bool *bound_check, + double min_freq, + StateFreqType freq_type); + template string NumberToString ( T Number ) {