Skip to content

Commit

Permalink
Added a bunch of new DNA frequency classes. These are all like
Browse files Browse the repository at this point in the history
FREQ_ESTIMATE in that base frequencies will be optimized, but
they are constrained in various ways (e.g. pi_A==pi_T).
These are intended for use with time reversible models.

This commit is at the stage where code compiles but is completely untested.
  • Loading branch information
MichaelWoodhams committed Dec 2, 2016
1 parent d435c0b commit 26083b0
Show file tree
Hide file tree
Showing 7 changed files with 831 additions and 148 deletions.
106 changes: 52 additions & 54 deletions model/modeldna.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,8 @@ bool ModelDNA::setRateType(const char *rate_str) {

int ModelDNA::getNDim() {
assert(freq_type != FREQ_UNKNOWN);
int ndim = num_params;
if (freq_type == FREQ_ESTIMATE)
ndim += num_states-1;
return ndim;
// possible TO-DO: cache nFreqParams(freq_type) to avoid repeat calls.
return (num_params+nFreqParams(freq_type));
}

void ModelDNA::writeParameters(ostream &out) {
Expand All @@ -369,68 +367,68 @@ void ModelDNA::writeParameters(ostream &out) {
}
}


/*
* getVariables *writes* the variables (i.e. model parameters).
* Returns true if any variables have changed, false if not.
*/
bool ModelDNA::getVariables(double *variables) {
int i;
int i;
bool changed = false;
if (num_params > 0) {
int num_all = param_spec.length();
if (verbose_mode >= VB_MAX) {
for (i = 1; i <= num_params; i++)
cout << " estimated variables[" << i << "] = " << variables[i] << endl;
}
for (i = 0; i < num_all; i++)
if (!param_fixed[param_spec[i]]) {
if (num_params > 0) {
int num_all = param_spec.length();
if (verbose_mode >= VB_MAX) {
for (i = 1; i <= num_params; i++)
cout << " estimated variables[" << i << "] = " << variables[i] << endl;
}
for (i = 0; i < num_all; i++)
if (!param_fixed[param_spec[i]]) {
changed |= (rates[i] != variables[(int)param_spec[i]]);
rates[i] = variables[(int)param_spec[i]];
}
}
if (freq_type == FREQ_ESTIMATE) {
// 2015-09-07: relax the sum of state_freq to be 1, this will be done at the end of optimization
int ndim = getNDim();
changed |= memcmpcpy(state_freq, variables+(ndim-num_states+2), (num_states-1)*sizeof(double));
// double sum = 0;
// for (i = 0; i < num_states-1; i++)
// sum += state_freq[i];
// state_freq[num_states-1] = 1.0 - sum;
rates[i] = variables[(int)param_spec[i]];
}

}
changed |= freqsFromParams(state_freq,variables+num_params+1,freq_type);
return changed;
// double sum = 0;
// for (i = 0; i < num_states-1; i++)
// sum += state_freq[i];
// state_freq[num_states-1] = 1.0 - sum;

// BUG FIX 2015.08.28
// int nrate = getNDim();
// if (freq_type == FREQ_ESTIMATE) nrate -= (num_states-1);
// double sum = 1.0;
// int i, j;
// for (i = 1; i < num_states; i++)
// sum += variables[nrate+i];
// for (i = 0, j = 1; i < num_states; i++)
// if (i != highest_freq_state) {
// state_freq[i] = variables[nrate+j] / sum;
// j++;
// }
// state_freq[highest_freq_state] = 1.0/sum;
}
return changed;
// double sum = 1.0;
// int i, j;
// for (i = 1; i < num_states; i++)
// sum += variables[nrate+i];
// for (i = 0, j = 1; i < num_states; i++)
// if (i != highest_freq_state) {
// state_freq[i] = variables[nrate+j] / sum;
// j++;
// }
// state_freq[highest_freq_state] = 1.0/sum;
}

/*
* setVariables *reads* the variables (i.e. model parameters).
*/
void ModelDNA::setVariables(double *variables) {
if (num_params > 0) {
int num_all = param_spec.length();
for (int i = 0; i < num_all; i++)
if (!param_fixed[param_spec[i]])
variables[(int)param_spec[i]] = rates[i];
}
if (freq_type == FREQ_ESTIMATE) {
// 2015-09-07: relax the sum of state_freq to be 1, this will be done at the end of optimization
int ndim = getNDim();
memcpy(variables+(ndim-num_states+2), state_freq, (num_states-1)*sizeof(double));
if (num_params > 0) {
int num_all = param_spec.length();
for (int i = 0; i < num_all; i++)
if (!param_fixed[param_spec[i]])
variables[(int)param_spec[i]] = rates[i];
}
// and copy parameters for base frequencies
paramsFromFreqs(variables+num_params+1, state_freq, freq_type);

// BUG FIX 2015.08.28
// int nrate = getNDim();
// if (freq_type == FREQ_ESTIMATE) nrate -= (num_states-1);
// int i, j;
// for (i = 0, j = 1; i < num_states; i++)
// if (i != highest_freq_state) {
// variables[nrate+j] = state_freq[i] / state_freq[highest_freq_state];
// j++;
// }
}
// int i, j;
// for (i = 0, j = 1; i < num_states; i++)
// if (i != highest_freq_state) {
// variables[nrate+j] = state_freq[i] / state_freq[highest_freq_state];
// j++;
// }
}
58 changes: 41 additions & 17 deletions model/modelfactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,53 +321,77 @@ ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_
if (freq_str.length() > 2 && freq_str[2] == OPEN_BRACKET) {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with user-defined frequency is not allowed");
close_bracket = freq_str.find(CLOSE_BRACKET);
if (close_bracket == string::npos)
outError("Close bracket not found in ", freq_str);
if (close_bracket != freq_str.length()-1)
outError("Wrong close bracket position ", freq_str);
freq_type = FREQ_USER_DEFINED;
freq_params = freq_str.substr(3, close_bracket-3);
} else if (freq_str == "+FC" || freq_str == "+Fc" || freq_str == "+F") {
close_bracket = freq_str.find(CLOSE_BRACKET);
if (close_bracket == string::npos)
outError("Close bracket not found in ", freq_str);
if (close_bracket != freq_str.length()-1)
outError("Wrong close bracket position ", freq_str);
freq_type = FREQ_USER_DEFINED;
freq_params = freq_str.substr(3, close_bracket-3);
} else if (freq_str == "+FC" || freq_str == "+Fc" || freq_str == "+F") {
if (freq_type == FREQ_MIXTURE) {
freq_params = "empirical," + freq_params;
optimize_mixmodel_weight = true;
} else
freq_type = FREQ_EMPIRICAL;
} else if (freq_str == "+FU" || freq_str == "+Fu") {
} else if (freq_str == "+FU" || freq_str == "+Fu") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with user-defined frequency is not allowed");
else
freq_type = FREQ_USER_DEFINED;
} else if (freq_str == "+FQ" || freq_str == "+Fq") {
} else if (freq_str == "+FQ" || freq_str == "+Fq") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with equal frequency is not allowed");
else
freq_type = FREQ_EQUAL;
} else if (freq_str == "+FO" || freq_str == "+Fo") {
freq_type = FREQ_EQUAL;
} else if (freq_str == "+FO" || freq_str == "+Fo") {
if (freq_type == FREQ_MIXTURE) {
freq_params = "optimize," + freq_params;
optimize_mixmodel_weight = true;
} else
freq_type = FREQ_ESTIMATE;
} else if (freq_str == "+F1x4" || freq_str == "+F1X4") {
} else if (freq_str == "+F1x4" || freq_str == "+F1X4") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_CODON_1x4;
} else if (freq_str == "+F3x4" || freq_str == "+F3X4") {
} else if (freq_str == "+F3x4" || freq_str == "+F3X4") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_CODON_3x4;
} else if (freq_str == "+F3x4C" || freq_str == "+F3x4c" || freq_str == "+F3X4C" || freq_str == "+F3X4c") {
} else if (freq_str == "+F3x4C" || freq_str == "+F3x4c" || freq_str == "+F3X4C" || freq_str == "+F3X4c") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_CODON_3x4C;
} else outError("Unknown state frequency type ",freq_str);
// model_str = model_str.substr(0, posfreq);
} else if (freq_str == "+FRY") {
// MDW to Minh: I don't know how these should interact with FREQ_MIXTURE,
// so as nearly everything else treats it as an error, I do too.
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_DNA_RY;
} else if (freq_str == "+FWS") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_DNA_WS;
} else if (freq_str == "+FMK") {
if (freq_type == FREQ_MIXTURE)
outError("Mixture frequency with " + freq_str + " is not allowed");
else
freq_type = FREQ_DNA_MK;
} else {
// might be "+F####" where # are digits
try {
freq_type = parseStateFreqDigits(freq_str.substr(2)); // throws an error if not in +F#### format
} catch (...) {
outError("Unknown state frequency type ",freq_str);
}
}
// model_str = model_str.substr(0, posfreq);
}

/******************** initialize model ****************************/

Expand Down
65 changes: 65 additions & 0 deletions model/modelliemarkov.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,17 @@ void ModelLieMarkov::init(const char *model_name, string model_params, StateFreq
}
setBasis(); // sets basis and num_params

/*
* Check validity of freq_type. (After setBasis because that sets bdf,
* if needed could move before setBasis with minor changes)
*/
if (!validFreqType()) {
throw("Can't use base frequency type "
+freqTypeString(freq_type)
+" with Lie-Markov model "
+name);
}

if (model_parameters)
delete[] model_parameters;
model_parameters = new double [num_params];
Expand Down Expand Up @@ -381,6 +392,60 @@ ModelLieMarkov::~ModelLieMarkov() {
// Do nothing, for now. model_parameters is reclaimed in ~ModelMarkov
}

/*
* Return 'true' if freq type is compatible with this Lie-Markov model.
* NOTE: Any freq_type exept FREQ_USER_DEFINED, FREQ_EMPIRICAL or
* FREQ_ESTIMATE is at best redundant, and worst incompatible.
* The above three are really the only freq types which should be used
* with an LM model.
* Actually, the +F1123s are valid with a BDF=3 LM model, but
* I haven't coded for this possibility so reject it. Could be fixed.
*
* Also for FREQ_USER_DEFINED and FREQ_EMPIRICAL, for LM models with
* BDF<3, compatibility depends on the given base freqs. There
* is code elsewhere which prints a warning if incompatible base freqs
* (and actual model base freqs will be 'close to' the requested freqs.)
*/
bool ModelLieMarkov::validFreqType() {
int bdf=BDF[model_num];
switch(getFreqType()) {
case FREQ_USER_DEFINED:
case FREQ_EMPIRICAL:
case FREQ_ESTIMATE:
return true;
case FREQ_UNKNOWN:
case FREQ_CODON_1x4:
case FREQ_CODON_3x4:
case FREQ_CODON_3x4C:
case FREQ_MIXTURE:
case FREQ_DNA_1112:
case FREQ_DNA_1121:
case FREQ_DNA_1211:
case FREQ_DNA_2111:
case FREQ_DNA_1123:
case FREQ_DNA_1213:
case FREQ_DNA_1231:
case FREQ_DNA_2113:
case FREQ_DNA_2131:
case FREQ_DNA_2311:
return false;
case FREQ_EQUAL:
return (bdf==0);
case FREQ_DNA_RY: return("+FRY");
return (bdf==2 && symmetry==0);
case FREQ_DNA_WS: return("+FWS");
return (bdf==2 && symmetry==1);
case FREQ_DNA_MK: return("+FMK");
return (bdf==2 && symmetry==2);
case FREQ_DNA_1122: return("+F1122");
return (bdf==1 && symmetry==2);
case FREQ_DNA_1212: return("+F1212");
return (bdf==1 && symmetry==0);
case FREQ_DNA_1221: return("+F1221");
return (bdf==1 && symmetry==1);
default: throw("Unrecoginzed freq_type in validFreqType - can't happen");
}
}

/* static */ bool ModelLieMarkov::validModelName(string model_name) {
int model_num, symmetry;
Expand Down
2 changes: 1 addition & 1 deletion model/modelliemarkov.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,6 @@ class ModelLieMarkov: public ModelMarkov {
const static int NUM_RATES;
const static int NUM_LM_MODELS;
*/
bool validFreqType();
};

#endif /* MODELLIEMARKOV_H_ */
Loading

0 comments on commit 26083b0

Please sign in to comment.