From f3ea3d69fbbe64740da31f0c477f5d52e55ac7ce Mon Sep 17 00:00:00 2001 From: stamatak Date: Fri, 28 Nov 2014 15:06:22 +0100 Subject: [PATCH] -added two additional ways for correcting ascertainment bias. Both new methods can be used to correct for the absence of invariable sites, when the number of invariable sites of the dataset is known. --- README | 2 +- Release-Notes | 2 +- axml.c | 167 ++++++++++++++++++++++++++++++++++++++- axml.h | 20 ++++- evaluateGenericSpecial.c | 58 ++++++++++---- makenewzGenericSpecial.c | 108 +++++++++++++++++++++---- parsePartitions.c | 139 +++++++++++++++++++++++++------- 7 files changed, 432 insertions(+), 64 deletions(-) diff --git a/README b/README index f2f991a..7fc29ae 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -Standard RAxML version 8.1.6 +Standard RAxML version 8.1.7 Compiling under Linux: diff --git a/Release-Notes b/Release-Notes index 80a1bd2..0b42430 100644 --- a/Release-Notes +++ b/Release-Notes @@ -1,4 +1,4 @@ -RAxML v8.1.6 +RAxML v8.1.7 We view RAxML code development as permanent work in progress. diff --git a/axml.c b/axml.c index a9e74a0..db9ea76 100644 --- a/axml.c +++ b/axml.c @@ -1878,6 +1878,7 @@ static void getinput(analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr) tr->initialPartitionData[0].usePredefinedProtFreqs = TRUE; tr->initialPartitionData[0].optimizeBaseFrequencies = FALSE; strcpy(tr->initialPartitionData[0].proteinSubstitutionFileName, proteinModelFileName); + } for(i = 0; i <= rdta->sites; i++) @@ -1898,6 +1899,7 @@ static void getinput(analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr) tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(tr->initialPartitionData[i].partitionName) + 1) * sizeof(char)); strcpy(tr->extendedPartitionData[i].partitionName, tr->initialPartitionData[i].partitionName); strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, tr->initialPartitionData[i].proteinSubstitutionFileName); + strcpy(tr->extendedPartitionData[i].ascFileName, tr->initialPartitionData[i].ascFileName); tr->extendedPartitionData[i].dataType = tr->initialPartitionData[i].dataType; tr->extendedPartitionData[i].protModels = tr->initialPartitionData[i].protModels; tr->extendedPartitionData[i].usePredefinedProtFreqs = tr->initialPartitionData[i].usePredefinedProtFreqs; @@ -3182,6 +3184,9 @@ static void allocPartitions(tree *tr) tr->partitionData[i].freqExponents = (double*)rax_malloc(pl->frequenciesLength * sizeof(double)); tr->partitionData[i].tipVector = (double *)rax_malloc(pl->tipVectorLength * sizeof(double)); + tr->partitionData[i].invariableFrequencies = (double *)rax_malloc(pl->states * sizeof(double)); + + #ifdef _HET tr->partitionData[i].EIGN_TIP = (double*)rax_malloc(pl->eignLength * sizeof(double)); @@ -4692,6 +4697,7 @@ static void printREADME(void) #else printf(" [--mesquite][--silent][--no-seq-check][--no-bfgs]\n"); #endif + printf(" [--asc-corr=stamatakis|felsenstein|lewis]\n"); printf("\n"); printf(" -a Specify a column weight file name to assign individual weights to each column of \n"); printf(" the alignment. Those weights must be integers separated by any type and number \n"); @@ -5051,7 +5057,7 @@ static void printREADME(void) printf(" Bootstopping will only work in combination with \"-x\" or \"-b\"\n"); printf("\n"); printf(" DEFAULT: 1 single analysis\n"); - printf("\n"); + printf("\n"); #ifndef _WAYNE_MPI printf(" --mesquite Print output files that can be parsed by Mesquite.\n"); printf("\n"); @@ -5071,6 +5077,16 @@ static void printREADME(void) printf(" --no-bfgs Disables automatic usage of BFGS method to optimize GTR rates on unpartitioned DNA datasets\n"); printf("\n"); printf(" DEFAULT: BFGS on\n"); + printf("\n"); + printf(" --asc-corr Allows to specify the type of ascertainment bias correction you wish to use. There are three\n"); + printf(" types available:\n"); + printf(" --asc-corr=lewis: the standard correction by Paul Lewis\n"); + printf(" --asc-corr=felsenstein: a correction introduced by Joe Felsenstein that allows to explicitely specify\n"); + printf(" the number of invariable sites (if known) one wants to correct for.\n"); + printf(" --asc-corr=stamatakis: a correction introduced by myself that allows to explicitely specify\n"); + printf(" the number of invariable sites for each character (if known) one wants to correct for.\n"); + printf(" For further details please refer to the manual!\n"); + printf("\n"); printf("\n\n\n\n"); @@ -5177,6 +5193,7 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) tr->noRateHet = FALSE; tr->perPartitionEPA = FALSE; tr->useBrLenScaler = FALSE; + tr->ascertainmentCorrectionType = NOT_DEFINED; #ifdef _BASTIEN tr->doBastienStuff = FALSE; @@ -5189,13 +5206,14 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) while(1) { static struct - option long_options[5] = + option long_options[6] = { /* These options set a flag. */ {"mesquite", no_argument, &flag, 1}, {"silent", no_argument, &flag, 1}, {"no-seq-check", no_argument, &flag, 1}, {"no-bfgs", no_argument, &flag, 1}, + {"asc-corr", required_argument, &flag, 1}, /* These options don't set a flag. We distinguish them by their indices. */ //{"add", no_argument, 0, 'a'}, @@ -5240,6 +5258,45 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) case 3: adef->useBFGS = FALSE; break; + case 4: + { + char + *ascModels[3] = {"lewis", "felsenstein", "stamatakis"}; + + int + k; + + for(k = 0; k < 3; k++) + if(strcmp(optarg, ascModels[k]) == 0) + break; + + if(k == 3) + { + printf("\nError, unknown ascertainment correction type, you can specify one of the following corrections:\n\n"); + for(k = 0; k < 3; k++) + printf("--asc-corr=%s\n", ascModels[k]); + printf("\n"); + errorExit(-1); + } + else + { + switch(k) + { + case 0: + tr->ascertainmentCorrectionType = LEWIS_CORRECTION; + break; + case 1: + tr->ascertainmentCorrectionType = FELSENSTEIN_CORRECTION; + break; + case 2: + tr->ascertainmentCorrectionType = STAMATAKIS_CORRECTION; + break; + default: + assert(0); + } + } + } + break; default: assert(0); } @@ -8143,6 +8200,7 @@ static void initPartition(tree *tr, tree *localTree, int tid) localTree->cdta->patratStored = tr->cdta->patratStored; localTree->discreteRateCategories = tr->discreteRateCategories; + localTree->ascertainmentCorrectionType = tr->ascertainmentCorrectionType; for(model = 0; model < localTree->NumberOfModels; model++) { @@ -8603,8 +8661,11 @@ static void execFunction(tree *tr, tree *localTree, int tid, int n) localTree->partitionData[model].propInvariant = tr->partitionData[model].propInvariant; localTree->partitionData[model].lower = tr->partitionData[model].lower; localTree->partitionData[model].upper = tr->partitionData[model].upper; - + localTree->partitionData[model].numberOfCategories = tr->partitionData[model].numberOfCategories; + + localTree->partitionData[model].invariableWeight = tr->partitionData[model].invariableWeight; + memcpy(localTree->partitionData[model].invariableFrequencies, tr->partitionData[model].invariableFrequencies, pl->states * sizeof(double)); } } @@ -11988,6 +12049,91 @@ static void checkAscBias(tree *tr) } } + +static void readAscFiles(tree *tr) + { + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION || tr->ascertainmentCorrectionType == FELSENSTEIN_CORRECTION) + { + int + model; + + for(model = 0; model < tr->NumberOfModels; model++) + { + if(tr->partitionData[model].ascBias) + { + if(filexists(tr->partitionData[model].ascFileName)) + { + FILE + *f = myfopen(tr->partitionData[model].ascFileName, "rb"); + + switch(tr->ascertainmentCorrectionType) + { + case STAMATAKIS_CORRECTION: + { + boolean + errorDetected = FALSE; + + unsigned int + *weights = (unsigned int*)rax_malloc((size_t)tr->partitionData[model].states * sizeof(unsigned int)); + + int + i; + + for(i = 0; i < tr->partitionData[model].states; i++) + if(fscanf(f, "%u", &weights[i]) != 1) + { + errorDetected = TRUE; + break; + } + + if(errorDetected) + { + printf("\nProblem reading number of invariable sites in ascertainment correction file %s\n", tr->partitionData[model].ascFileName); + printf("for stamatakis ascertainment bias correction for partition %d with name %s.\n\n", model, tr->partitionData[model].partitionName); + errorExit(-1); + } + + for(i = 0; i < tr->partitionData[model].states; i++) + tr->partitionData[model].invariableFrequencies[i] = (double)weights[i]; + + rax_free(weights); + } + break; + case FELSENSTEIN_CORRECTION: + { + unsigned int + length; + + if(fscanf(f, "%u", &length) != 1) + { + printf("\nProblem reading number of invariable sites in ascertainment correction file %s\n", tr->partitionData[model].ascFileName); + printf("for felsenstein ascertainment bias correction for partition %d with name %s.\n\n", model, tr->partitionData[model].partitionName); + errorExit(-1); + } + + tr->partitionData[0].invariableWeight = (double)length; + } + break; + default: + assert(0); + } + + fclose(f); + } + else + { + printf("You specified that you want to use a stamatakis or felsenstein ascertainment bias correction for partition %d with name %s.\n", + model, tr->partitionData[model].partitionName); + printf("but did not specify a correction file for this partition in the partition file!\n\n"); + errorExit(-1); + } + } + } + } + } + + + /******* missing data prediction code -f k option **************************************************/ //function that makes the actual sequence guess estimate given a node of the tree p which is a tip @@ -13086,7 +13232,16 @@ int main (int argc, char *argv[]) setRateHetAndDataIncrement(tr, adef); if(countAscBias > 0 && !adef->readTaxaOnly) - checkAscBias(tr); + { + if(tr->ascertainmentCorrectionType == NOT_DEFINED) + { + printf("\nError, one or more of your partitions use an ascertainment bias correction,\n"); + printf("but you have not specified which correction type you want to use via \"--asc-corr\" \n\n"); + errorExit(-1); + } + + checkAscBias(tr); + } #ifdef _USE_PTHREADS startPthreads(tr); @@ -13098,6 +13253,10 @@ int main (int argc, char *argv[]) allocNodex(tr); #endif + + readAscFiles(tr); + + if(!adef->readTaxaOnly) { #ifdef _USE_PTHREADS diff --git a/axml.h b/axml.h index 467b4bd..6c5d88a 100644 --- a/axml.h +++ b/axml.h @@ -166,9 +166,9 @@ #define PointGamma(prob,alpha,beta) PointChi2(prob,2.0*(alpha))/(2.0*(beta)) #define programName "RAxML" -#define programVersion "8.1.6" -#define programVersionInt 816 -#define programDate "November 26 2014" +#define programVersion "8.1.7" +#define programVersionInt 817 +#define programDate "November 28 2014" #define TREE_EVALUATION 0 @@ -599,6 +599,7 @@ typedef struct { char *partitionName; char proteinSubstitutionFileName[2048]; + char ascFileName[2048]; double externalAAMatrix[420]; double *sumBuffer; @@ -611,6 +612,9 @@ typedef struct { double *left; double *right; + double *invariableFrequencies; + double invariableWeight; + #ifdef _HET /* heterotachy */ @@ -768,6 +772,14 @@ typedef struct +/***********************************************************/ + +#define NOT_DEFINED 0 +#define LEWIS_CORRECTION 1 +#define FELSENSTEIN_CORRECTION 2 +#define STAMATAKIS_CORRECTION 3 + + /**************************************************************/ @@ -879,6 +891,8 @@ typedef struct { double fracchange; double rawFracchange; + int ascertainmentCorrectionType; + #ifdef _BASTIEN double secondDerivative[NUM_BRANCHES]; boolean doBastienStuff; diff --git a/evaluateGenericSpecial.c b/evaluateGenericSpecial.c index 1c765bb..bae895e 100644 --- a/evaluateGenericSpecial.c +++ b/evaluateGenericSpecial.c @@ -254,7 +254,7 @@ static double evaluateCatFlex(int *ex1, int *ex2, int *cptr, int *wptr, static double evaluateCatAsc(int *ex1, int *ex2, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, int n, double *diagptable, const int numStates) + unsigned char *tipX1, int n, double *diagptable, const int numStates, double *accumulator, double *weightVector) { double exponent, @@ -300,6 +300,9 @@ static double evaluateCatAsc(int *ex1, int *ex2, } #endif + if(weightVector) + *accumulator += weightVector[i] * (LOG(FABS(term)) + (ex2[i] * LOG(minlikelihood))); + sum += unobserved; } } @@ -331,7 +334,10 @@ static double evaluateCatAsc(int *ex1, int *ex2, assert(exponent < 700.0); unobserved = FABS(term) * exp(exponent); - + + if(weightVector) + *accumulator += weightVector[i] * (LOG(FABS(term)) + ((ex1[i] + ex2[i]) * LOG(minlikelihood))); + sum += unobserved; } } @@ -343,7 +349,7 @@ static double evaluateCatAsc(int *ex1, int *ex2, static double evaluateGammaAsc(int *ex1, int *ex2, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, int n, double *diagptable, const int numStates) + unsigned char *tipX1, int n, double *diagptable, const int numStates, double *accumulator, double *weightVector) { double exponent, @@ -394,7 +400,9 @@ static double evaluateGammaAsc(int *ex1, int *ex2, assert(0); } #endif - + if(weightVector) + *accumulator += weightVector[i] * (LOG(0.25 * FABS(term)) + (ex2[i] * LOG(minlikelihood))); + sum += unobserved; } } @@ -428,7 +436,10 @@ static double evaluateGammaAsc(int *ex1, int *ex2, assert(exponent < 700.0); unobserved = 0.25 * FABS(term) * exp(exponent); - + + if(weightVector) + *accumulator += weightVector[i] * (LOG(0.25 * FABS(term)) + ((ex1[i] + ex2[i]) * LOG(minlikelihood))); + sum += unobserved; } } @@ -3458,9 +3469,15 @@ double evaluateIterative(tree *tr, boolean writeVector) int w = 0; - double + double + *weightVector = (double*)NULL, + accumulator = 0.0, correction; + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION) + weightVector = tr->partitionData[model].invariableFrequencies; + // invariableWeight; + switch(tr->rateHetModel) { case CAT: @@ -3473,23 +3490,36 @@ double evaluateIterative(tree *tr, boolean writeVector) correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, tr->partitionData[model].tipVector, - tip, ascWidth, diagptable, ascWidth); + tip, ascWidth, diagptable, ascWidth, &accumulator, weightVector); } break; case GAMMA: correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, tr->partitionData[model].tipVector, - tip, ascWidth, diagptable, ascWidth); + tip, ascWidth, diagptable, ascWidth, &accumulator, weightVector); break; default: assert(0); } - - - for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) - w += tr->cdta->aliaswgt[i]; - - partitionLikelihood = partitionLikelihood - (double)w * LOG(1.0 - correction); + switch(tr->ascertainmentCorrectionType) + { + case LEWIS_CORRECTION: + for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) + w += tr->cdta->aliaswgt[i]; + partitionLikelihood = partitionLikelihood - (double)w * LOG(1.0 - correction); + break; + case STAMATAKIS_CORRECTION: + //printf("accumulator %f\n", accumulator); + //exit(1); + partitionLikelihood += accumulator; + break; + case FELSENSTEIN_CORRECTION: + partitionLikelihood += tr->partitionData[model].invariableWeight * LOG(correction); + // printf("correction %f %f\n", partitionLikelihood, tr->partitionData[model].invariableWeight * LOG(correction)); + break; + default: + assert(0); + } #ifdef _DEBUG_ASC printf("E w: %f %f ARG %f ragu %f\n", partitionLikelihood, (double)w, 1.0 - correction, (double)w * LOG(1.0 - correction)); diff --git a/makenewzGenericSpecial.c b/makenewzGenericSpecial.c index ae8f0a5..5f9f727 100644 --- a/makenewzGenericSpecial.c +++ b/makenewzGenericSpecial.c @@ -1158,13 +1158,18 @@ static void coreGammaFlex(double *gammaRates, double *EIGN, double *sumtable, in } static double coreCatAsc(double *EIGN, double *sumtable, int upper, - volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates) + volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates, + volatile double *standard_ext_dlnLdlz, volatile double *standard_ext_d2lnLdlz2, + volatile double *felsenstein_ext_dlnLdlz, volatile double *felsenstein_ext_d2lnLdlz2, + double *weightVector) { double diagptable[1024], lh = 0.0, dlnLdlz = 0.0, d2lnLdlz2 = 0.0, + standard_dlnLdlz = 0.0, + standard_d2lnLdlz2 = 0.0, ki, kisqr; @@ -1202,28 +1207,55 @@ static double coreCatAsc(double *EIGN, double *sumtable, int upper, d2lnLidlz2 += tmp * diagptable[l * 4 + 2]; } + if(weightVector) + { + double + standard_inv_Li = inv_Li, + standard_dlnLidlz = dlnLidlz, + standard_d2lnLidlz2 = d2lnLidlz2; + + standard_inv_Li = 1.0 / FABS(standard_inv_Li); + + standard_dlnLidlz *= standard_inv_Li; + standard_d2lnLidlz2 *= standard_inv_Li; + + standard_dlnLdlz += weightVector[i] * standard_dlnLidlz; + standard_d2lnLdlz2 += weightVector[i] * (standard_d2lnLidlz2 - standard_dlnLidlz * standard_dlnLidlz); + } + inv_Li = FABS(inv_Li); lh += inv_Li; dlnLdlz += dlnLidlz; - d2lnLdlz2 += d2lnLidlz2; + d2lnLdlz2 += d2lnLidlz2; } *ext_dlnLdlz = (dlnLdlz / (lh - 1.0)); *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0))); + *felsenstein_ext_dlnLdlz = dlnLdlz / lh; + *felsenstein_ext_d2lnLdlz2 = ((d2lnLdlz2 * lh) - (dlnLdlz * dlnLdlz)) / (lh * lh); + + *standard_ext_dlnLdlz = standard_dlnLdlz; + *standard_ext_d2lnLdlz2 = standard_d2lnLdlz2; + return lh; } static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, int upper, - volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates) + volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates, + volatile double *standard_ext_dlnLdlz, volatile double *standard_ext_d2lnLdlz2, + volatile double *felsenstein_ext_dlnLdlz, volatile double *felsenstein_ext_d2lnLdlz2, + double *weightVector) { double diagptable[1024], lh = 0.0, dlnLdlz = 0.0, d2lnLdlz2 = 0.0, + standard_dlnLdlz = 0.0, + standard_d2lnLdlz2 = 0.0, ki, kisqr; @@ -1269,6 +1301,22 @@ static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, i } } + if(weightVector) + { + double + standard_inv_Li = inv_Li, + standard_dlnLidlz = dlnLidlz, + standard_d2lnLidlz2 = d2lnLidlz2; + + standard_inv_Li = 1.0 / FABS(standard_inv_Li); + + standard_dlnLidlz *= standard_inv_Li; + standard_d2lnLidlz2 *= standard_inv_Li; + + standard_dlnLdlz += weightVector[i] * standard_dlnLidlz; + standard_d2lnLdlz2 += weightVector[i] * (standard_d2lnLidlz2 - standard_dlnLidlz * standard_dlnLidlz); + } + inv_Li = 0.25 * FABS(inv_Li); dlnLidlz *= 0.25; d2lnLidlz2 *= 0.25; @@ -1280,6 +1328,12 @@ static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, i *ext_dlnLdlz = (dlnLdlz / (lh - 1.0)); *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0))); + + *felsenstein_ext_dlnLdlz = dlnLdlz / lh; + *felsenstein_ext_d2lnLdlz2 = ((d2lnLdlz2 * lh) - (dlnLdlz * dlnLdlz)) / (lh * lh); + + *standard_ext_dlnLdlz = standard_dlnLdlz; + *standard_ext_d2lnLdlz2 = standard_d2lnLdlz2; return lh; } @@ -4223,36 +4277,62 @@ void execCore(tree *tr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2) i; double - correction; + *weightVector = (double*)NULL; int w = 0; volatile double d1 = 0.0, - d2 = 0.0; + d2 = 0.0, + standard_d1 = 0.0, + standard_d2 = 0.0, + felsenstein_d1 = 0.0, + felsenstein_d2 = 0.0; - for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) - w += tr->cdta->aliaswgt[i]; + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION) + weightVector = tr->partitionData[model].invariableFrequencies; switch(tr->rateHetModel) { case CAT: - correction = coreCatAsc(tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, - &d1, &d2, lz, states); + coreCatAsc(tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, + &d1, &d2, lz, states, &standard_d1, &standard_d2, + &felsenstein_d1, &felsenstein_d2, + weightVector); break; case GAMMA: - correction = coreGammaAsc(tr->partitionData[model].gammaRates, tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, - &d1, &d2, lz, states); + coreGammaAsc(tr->partitionData[model].gammaRates, tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, + &d1, &d2, lz, states, &standard_d1, &standard_d2, + &felsenstein_d1, &felsenstein_d2, + weightVector); break; default: assert(0); } - correction = 1.0 - correction; + - _dlnLdlz[branchIndex] = _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1; - _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2- (double)w * d2; + switch(tr->ascertainmentCorrectionType) + { + case LEWIS_CORRECTION: + for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) + w += tr->cdta->aliaswgt[i]; + + _dlnLdlz[branchIndex] = _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1; + _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2- (double)w * d2; + break; + case STAMATAKIS_CORRECTION: + _dlnLdlz[branchIndex] += dlnLdlz + standard_d1; + _d2lnLdlz2[branchIndex] += d2lnLdlz2 + standard_d2; + break; + case FELSENSTEIN_CORRECTION: + _dlnLdlz[branchIndex] += dlnLdlz + tr->partitionData[model].invariableWeight * felsenstein_d1; + _d2lnLdlz2[branchIndex] += d2lnLdlz2 + tr->partitionData[model].invariableWeight * felsenstein_d2; + break; + default: + assert(0); + } } else diff --git a/parsePartitions.c b/parsePartitions.c index e277bed..4e5df81 100644 --- a/parsePartitions.c +++ b/parsePartitions.c @@ -95,7 +95,7 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) model[2048] = "", thisModel[2048] = ""; - int + int i = 0, n, j, @@ -111,22 +111,27 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) *ch = *ch + 1; } + ident[i] = '\0'; + n = i; i = 0; + + for(i = 0; i < n; i++) if(ident[i] == ',') containsComma = 1; if(!containsComma) { - printf("Error, model file must have format: DNA or AA model, then a comma, and then the partition name\n"); + printf("Error, model file must have format: Substitution model, then a comma, and then the partition name\n"); exit(-1); } else { boolean - useProteinSubstitutionFile = FALSE, + analyzeRest = TRUE, + useExternalFile = FALSE, found = FALSE; int @@ -156,59 +161,137 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) if(closeBracket > 0 || openBracket > 0) { if((closeBracket == 1) && (openBracket == 1) && (openPos < closePos)) - useProteinSubstitutionFile = TRUE; + useExternalFile = TRUE; else { - printf("\nError: Apparently you want to specify a user-defined protein substitution model that shall be read from file\n"); - printf("It must be enclosed in opening and closing bracktes like this: [fileName]\n\n"); + printf("\nError: Apparently you want to specify a user-defined protein substitution model\n"); + printf("or ascertainment bias correction model that shall be read from file\n"); + printf("It must be enclosed in opening and closing bracktes like this: [prot=fileName] or [asc=fileName]\n\n"); printf("you specified: %s\n\n", model); exit(-1); } } - if(useProteinSubstitutionFile) + if(useExternalFile) { - char - protFileName[2048] = ""; + char + designator[2048] = "", + fileName[2048] = ""; int - pos, - k, + pos, + index, lower = 0, - upper = i - 1; + upper = i - 1; + + boolean + isProteinFile = TRUE; - while(model[lower] == '[' || model[lower] == ' ') + while(model[lower] == '[') lower++; - while(model[upper] == ']' || model[upper] == ' ') + while(model[upper] == ']') upper--; assert(lower < upper); - for(k = lower, pos = 0; k <= upper; k++, pos++) - protFileName[pos] = model[k]; + index = lower; + pos = 0; + + while(model[index] != '~') + { + designator[pos] = model[index]; + pos++; + index++; + } + + designator[pos] = '\0'; + + if(strcmp(designator, "asc") == 0) + isProteinFile = FALSE; + else + { + if(strcmp(designator, "prot") == 0) + isProteinFile = TRUE; + else + { + printf("Error external partition file type %s does not exist\n", designator); + printf("Available file types: asc and prot\n"); + exit(-1); + } + } - protFileName[pos] = '\0'; + while(model[index] == '~') + index++; + pos = 0; + while(model[index] != ']') + { + fileName[pos] = model[index]; + index++; + pos++; + } + + fileName[pos] = '\0'; - if(!filexists(protFileName)) + if(!filexists(fileName)) { - printf("\n\ncustom protein substitution file [%s] you want to use does not exist!\n", protFileName); + printf("\n\ncustom protein substitution or ascertainment bias file [%s] you want to use does not exist!\n", fileName); printf("you need to specify the full path\n"); printf("the file name shall not contain blanks!\n\n"); - exit(0); + exit(-1); } - - strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, protFileName); - /*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/ - tr->initialPartitionData[modelNumber].protModels = PROT_FILE; - tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE; - tr->initialPartitionData[modelNumber].dataType = AA_DATA; + if(isProteinFile) + { + strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, fileName); + /*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/ + + tr->initialPartitionData[modelNumber].protModels = PROT_FILE; + tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE; + tr->initialPartitionData[modelNumber].dataType = AA_DATA; + analyzeRest = FALSE; + } + else + { + int + newIndex = 0; + + strcpy(tr->initialPartitionData[modelNumber].ascFileName, fileName); + + i = 0; + + while(ident[i] != ',') + { + if(ident[i] == '\0') + { + printf("Expecting two commas in string %s\n", ident); + exit(-1); + } + i++; + } + + i++; + + while(ident[i] != ',') + { + if(ident[i] == '\0') + { + printf("Expecting two commas in string %s\n", ident); + exit(-1); + } + model[newIndex] = ident[i]; + i++; + newIndex++; + } + + model[newIndex] = '\0'; + } } - else + + if(analyzeRest) { /* AA */ tr->initialPartitionData[modelNumber].ascBias = FALSE; @@ -1532,6 +1615,7 @@ void parseSecondaryStructure(tree *tr, analdef *adef, int sites) partBuffer[i].partitionName = (char*)rax_malloc((strlen(tr->extendedPartitionData[i].partitionName) + 1) * sizeof(char)); strcpy(partBuffer[i].partitionName, tr->extendedPartitionData[i].partitionName); strcpy(partBuffer[i].proteinSubstitutionFileName, tr->extendedPartitionData[i].proteinSubstitutionFileName); + strcpy(partBuffer[i].ascFileName, tr->extendedPartitionData[i].ascFileName); partBuffer[i].dataType = tr->extendedPartitionData[i].dataType; partBuffer[i].protModels = tr->extendedPartitionData[i].protModels; partBuffer[i].usePredefinedProtFreqs = tr->extendedPartitionData[i].usePredefinedProtFreqs; @@ -1549,6 +1633,7 @@ void parseSecondaryStructure(tree *tr, analdef *adef, int sites) tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(partBuffer[i].partitionName) + 1) * sizeof(char)); strcpy(tr->extendedPartitionData[i].partitionName, partBuffer[i].partitionName); strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, partBuffer[i].proteinSubstitutionFileName); + strcpy(tr->extendedPartitionData[i].ascFileName, partBuffer[i].ascFileName); tr->extendedPartitionData[i].dataType = partBuffer[i].dataType; tr->extendedPartitionData[i].protModels= partBuffer[i].protModels; tr->extendedPartitionData[i].usePredefinedProtFreqs= partBuffer[i].usePredefinedProtFreqs;