diff --git a/README b/README index f2f991a..7fc29ae 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -Standard RAxML version 8.1.6 +Standard RAxML version 8.1.7 Compiling under Linux: diff --git a/Release-Notes b/Release-Notes index 80a1bd2..0b42430 100644 --- a/Release-Notes +++ b/Release-Notes @@ -1,4 +1,4 @@ -RAxML v8.1.6 +RAxML v8.1.7 We view RAxML code development as permanent work in progress. diff --git a/axml.c b/axml.c index a9e74a0..db9ea76 100644 --- a/axml.c +++ b/axml.c @@ -1878,6 +1878,7 @@ static void getinput(analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr) tr->initialPartitionData[0].usePredefinedProtFreqs = TRUE; tr->initialPartitionData[0].optimizeBaseFrequencies = FALSE; strcpy(tr->initialPartitionData[0].proteinSubstitutionFileName, proteinModelFileName); + } for(i = 0; i <= rdta->sites; i++) @@ -1898,6 +1899,7 @@ static void getinput(analdef *adef, rawdata *rdta, cruncheddata *cdta, tree *tr) tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(tr->initialPartitionData[i].partitionName) + 1) * sizeof(char)); strcpy(tr->extendedPartitionData[i].partitionName, tr->initialPartitionData[i].partitionName); strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, tr->initialPartitionData[i].proteinSubstitutionFileName); + strcpy(tr->extendedPartitionData[i].ascFileName, tr->initialPartitionData[i].ascFileName); tr->extendedPartitionData[i].dataType = tr->initialPartitionData[i].dataType; tr->extendedPartitionData[i].protModels = tr->initialPartitionData[i].protModels; tr->extendedPartitionData[i].usePredefinedProtFreqs = tr->initialPartitionData[i].usePredefinedProtFreqs; @@ -3182,6 +3184,9 @@ static void allocPartitions(tree *tr) tr->partitionData[i].freqExponents = (double*)rax_malloc(pl->frequenciesLength * sizeof(double)); tr->partitionData[i].tipVector = (double *)rax_malloc(pl->tipVectorLength * sizeof(double)); + tr->partitionData[i].invariableFrequencies = (double *)rax_malloc(pl->states * sizeof(double)); + + #ifdef _HET tr->partitionData[i].EIGN_TIP = (double*)rax_malloc(pl->eignLength * sizeof(double)); @@ -4692,6 +4697,7 @@ static void printREADME(void) #else printf(" [--mesquite][--silent][--no-seq-check][--no-bfgs]\n"); #endif + printf(" [--asc-corr=stamatakis|felsenstein|lewis]\n"); printf("\n"); printf(" -a Specify a column weight file name to assign individual weights to each column of \n"); printf(" the alignment. Those weights must be integers separated by any type and number \n"); @@ -5051,7 +5057,7 @@ static void printREADME(void) printf(" Bootstopping will only work in combination with \"-x\" or \"-b\"\n"); printf("\n"); printf(" DEFAULT: 1 single analysis\n"); - printf("\n"); + printf("\n"); #ifndef _WAYNE_MPI printf(" --mesquite Print output files that can be parsed by Mesquite.\n"); printf("\n"); @@ -5071,6 +5077,16 @@ static void printREADME(void) printf(" --no-bfgs Disables automatic usage of BFGS method to optimize GTR rates on unpartitioned DNA datasets\n"); printf("\n"); printf(" DEFAULT: BFGS on\n"); + printf("\n"); + printf(" --asc-corr Allows to specify the type of ascertainment bias correction you wish to use. There are three\n"); + printf(" types available:\n"); + printf(" --asc-corr=lewis: the standard correction by Paul Lewis\n"); + printf(" --asc-corr=felsenstein: a correction introduced by Joe Felsenstein that allows to explicitely specify\n"); + printf(" the number of invariable sites (if known) one wants to correct for.\n"); + printf(" --asc-corr=stamatakis: a correction introduced by myself that allows to explicitely specify\n"); + printf(" the number of invariable sites for each character (if known) one wants to correct for.\n"); + printf(" For further details please refer to the manual!\n"); + printf("\n"); printf("\n\n\n\n"); @@ -5177,6 +5193,7 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) tr->noRateHet = FALSE; tr->perPartitionEPA = FALSE; tr->useBrLenScaler = FALSE; + tr->ascertainmentCorrectionType = NOT_DEFINED; #ifdef _BASTIEN tr->doBastienStuff = FALSE; @@ -5189,13 +5206,14 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) while(1) { static struct - option long_options[5] = + option long_options[6] = { /* These options set a flag. */ {"mesquite", no_argument, &flag, 1}, {"silent", no_argument, &flag, 1}, {"no-seq-check", no_argument, &flag, 1}, {"no-bfgs", no_argument, &flag, 1}, + {"asc-corr", required_argument, &flag, 1}, /* These options don't set a flag. We distinguish them by their indices. */ //{"add", no_argument, 0, 'a'}, @@ -5240,6 +5258,45 @@ static void get_args(int argc, char *argv[], analdef *adef, tree *tr) case 3: adef->useBFGS = FALSE; break; + case 4: + { + char + *ascModels[3] = {"lewis", "felsenstein", "stamatakis"}; + + int + k; + + for(k = 0; k < 3; k++) + if(strcmp(optarg, ascModels[k]) == 0) + break; + + if(k == 3) + { + printf("\nError, unknown ascertainment correction type, you can specify one of the following corrections:\n\n"); + for(k = 0; k < 3; k++) + printf("--asc-corr=%s\n", ascModels[k]); + printf("\n"); + errorExit(-1); + } + else + { + switch(k) + { + case 0: + tr->ascertainmentCorrectionType = LEWIS_CORRECTION; + break; + case 1: + tr->ascertainmentCorrectionType = FELSENSTEIN_CORRECTION; + break; + case 2: + tr->ascertainmentCorrectionType = STAMATAKIS_CORRECTION; + break; + default: + assert(0); + } + } + } + break; default: assert(0); } @@ -8143,6 +8200,7 @@ static void initPartition(tree *tr, tree *localTree, int tid) localTree->cdta->patratStored = tr->cdta->patratStored; localTree->discreteRateCategories = tr->discreteRateCategories; + localTree->ascertainmentCorrectionType = tr->ascertainmentCorrectionType; for(model = 0; model < localTree->NumberOfModels; model++) { @@ -8603,8 +8661,11 @@ static void execFunction(tree *tr, tree *localTree, int tid, int n) localTree->partitionData[model].propInvariant = tr->partitionData[model].propInvariant; localTree->partitionData[model].lower = tr->partitionData[model].lower; localTree->partitionData[model].upper = tr->partitionData[model].upper; - + localTree->partitionData[model].numberOfCategories = tr->partitionData[model].numberOfCategories; + + localTree->partitionData[model].invariableWeight = tr->partitionData[model].invariableWeight; + memcpy(localTree->partitionData[model].invariableFrequencies, tr->partitionData[model].invariableFrequencies, pl->states * sizeof(double)); } } @@ -11988,6 +12049,91 @@ static void checkAscBias(tree *tr) } } + +static void readAscFiles(tree *tr) + { + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION || tr->ascertainmentCorrectionType == FELSENSTEIN_CORRECTION) + { + int + model; + + for(model = 0; model < tr->NumberOfModels; model++) + { + if(tr->partitionData[model].ascBias) + { + if(filexists(tr->partitionData[model].ascFileName)) + { + FILE + *f = myfopen(tr->partitionData[model].ascFileName, "rb"); + + switch(tr->ascertainmentCorrectionType) + { + case STAMATAKIS_CORRECTION: + { + boolean + errorDetected = FALSE; + + unsigned int + *weights = (unsigned int*)rax_malloc((size_t)tr->partitionData[model].states * sizeof(unsigned int)); + + int + i; + + for(i = 0; i < tr->partitionData[model].states; i++) + if(fscanf(f, "%u", &weights[i]) != 1) + { + errorDetected = TRUE; + break; + } + + if(errorDetected) + { + printf("\nProblem reading number of invariable sites in ascertainment correction file %s\n", tr->partitionData[model].ascFileName); + printf("for stamatakis ascertainment bias correction for partition %d with name %s.\n\n", model, tr->partitionData[model].partitionName); + errorExit(-1); + } + + for(i = 0; i < tr->partitionData[model].states; i++) + tr->partitionData[model].invariableFrequencies[i] = (double)weights[i]; + + rax_free(weights); + } + break; + case FELSENSTEIN_CORRECTION: + { + unsigned int + length; + + if(fscanf(f, "%u", &length) != 1) + { + printf("\nProblem reading number of invariable sites in ascertainment correction file %s\n", tr->partitionData[model].ascFileName); + printf("for felsenstein ascertainment bias correction for partition %d with name %s.\n\n", model, tr->partitionData[model].partitionName); + errorExit(-1); + } + + tr->partitionData[0].invariableWeight = (double)length; + } + break; + default: + assert(0); + } + + fclose(f); + } + else + { + printf("You specified that you want to use a stamatakis or felsenstein ascertainment bias correction for partition %d with name %s.\n", + model, tr->partitionData[model].partitionName); + printf("but did not specify a correction file for this partition in the partition file!\n\n"); + errorExit(-1); + } + } + } + } + } + + + /******* missing data prediction code -f k option **************************************************/ //function that makes the actual sequence guess estimate given a node of the tree p which is a tip @@ -13086,7 +13232,16 @@ int main (int argc, char *argv[]) setRateHetAndDataIncrement(tr, adef); if(countAscBias > 0 && !adef->readTaxaOnly) - checkAscBias(tr); + { + if(tr->ascertainmentCorrectionType == NOT_DEFINED) + { + printf("\nError, one or more of your partitions use an ascertainment bias correction,\n"); + printf("but you have not specified which correction type you want to use via \"--asc-corr\" \n\n"); + errorExit(-1); + } + + checkAscBias(tr); + } #ifdef _USE_PTHREADS startPthreads(tr); @@ -13098,6 +13253,10 @@ int main (int argc, char *argv[]) allocNodex(tr); #endif + + readAscFiles(tr); + + if(!adef->readTaxaOnly) { #ifdef _USE_PTHREADS diff --git a/axml.h b/axml.h index 467b4bd..6c5d88a 100644 --- a/axml.h +++ b/axml.h @@ -166,9 +166,9 @@ #define PointGamma(prob,alpha,beta) PointChi2(prob,2.0*(alpha))/(2.0*(beta)) #define programName "RAxML" -#define programVersion "8.1.6" -#define programVersionInt 816 -#define programDate "November 26 2014" +#define programVersion "8.1.7" +#define programVersionInt 817 +#define programDate "November 28 2014" #define TREE_EVALUATION 0 @@ -599,6 +599,7 @@ typedef struct { char *partitionName; char proteinSubstitutionFileName[2048]; + char ascFileName[2048]; double externalAAMatrix[420]; double *sumBuffer; @@ -611,6 +612,9 @@ typedef struct { double *left; double *right; + double *invariableFrequencies; + double invariableWeight; + #ifdef _HET /* heterotachy */ @@ -768,6 +772,14 @@ typedef struct +/***********************************************************/ + +#define NOT_DEFINED 0 +#define LEWIS_CORRECTION 1 +#define FELSENSTEIN_CORRECTION 2 +#define STAMATAKIS_CORRECTION 3 + + /**************************************************************/ @@ -879,6 +891,8 @@ typedef struct { double fracchange; double rawFracchange; + int ascertainmentCorrectionType; + #ifdef _BASTIEN double secondDerivative[NUM_BRANCHES]; boolean doBastienStuff; diff --git a/evaluateGenericSpecial.c b/evaluateGenericSpecial.c index 1c765bb..bae895e 100644 --- a/evaluateGenericSpecial.c +++ b/evaluateGenericSpecial.c @@ -254,7 +254,7 @@ static double evaluateCatFlex(int *ex1, int *ex2, int *cptr, int *wptr, static double evaluateCatAsc(int *ex1, int *ex2, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, int n, double *diagptable, const int numStates) + unsigned char *tipX1, int n, double *diagptable, const int numStates, double *accumulator, double *weightVector) { double exponent, @@ -300,6 +300,9 @@ static double evaluateCatAsc(int *ex1, int *ex2, } #endif + if(weightVector) + *accumulator += weightVector[i] * (LOG(FABS(term)) + (ex2[i] * LOG(minlikelihood))); + sum += unobserved; } } @@ -331,7 +334,10 @@ static double evaluateCatAsc(int *ex1, int *ex2, assert(exponent < 700.0); unobserved = FABS(term) * exp(exponent); - + + if(weightVector) + *accumulator += weightVector[i] * (LOG(FABS(term)) + ((ex1[i] + ex2[i]) * LOG(minlikelihood))); + sum += unobserved; } } @@ -343,7 +349,7 @@ static double evaluateCatAsc(int *ex1, int *ex2, static double evaluateGammaAsc(int *ex1, int *ex2, double *x1, double *x2, double *tipVector, - unsigned char *tipX1, int n, double *diagptable, const int numStates) + unsigned char *tipX1, int n, double *diagptable, const int numStates, double *accumulator, double *weightVector) { double exponent, @@ -394,7 +400,9 @@ static double evaluateGammaAsc(int *ex1, int *ex2, assert(0); } #endif - + if(weightVector) + *accumulator += weightVector[i] * (LOG(0.25 * FABS(term)) + (ex2[i] * LOG(minlikelihood))); + sum += unobserved; } } @@ -428,7 +436,10 @@ static double evaluateGammaAsc(int *ex1, int *ex2, assert(exponent < 700.0); unobserved = 0.25 * FABS(term) * exp(exponent); - + + if(weightVector) + *accumulator += weightVector[i] * (LOG(0.25 * FABS(term)) + ((ex1[i] + ex2[i]) * LOG(minlikelihood))); + sum += unobserved; } } @@ -3458,9 +3469,15 @@ double evaluateIterative(tree *tr, boolean writeVector) int w = 0; - double + double + *weightVector = (double*)NULL, + accumulator = 0.0, correction; + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION) + weightVector = tr->partitionData[model].invariableFrequencies; + // invariableWeight; + switch(tr->rateHetModel) { case CAT: @@ -3473,23 +3490,36 @@ double evaluateIterative(tree *tr, boolean writeVector) correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, tr->partitionData[model].tipVector, - tip, ascWidth, diagptable, ascWidth); + tip, ascWidth, diagptable, ascWidth, &accumulator, weightVector); } break; case GAMMA: correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, tr->partitionData[model].tipVector, - tip, ascWidth, diagptable, ascWidth); + tip, ascWidth, diagptable, ascWidth, &accumulator, weightVector); break; default: assert(0); } - - - for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) - w += tr->cdta->aliaswgt[i]; - - partitionLikelihood = partitionLikelihood - (double)w * LOG(1.0 - correction); + switch(tr->ascertainmentCorrectionType) + { + case LEWIS_CORRECTION: + for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) + w += tr->cdta->aliaswgt[i]; + partitionLikelihood = partitionLikelihood - (double)w * LOG(1.0 - correction); + break; + case STAMATAKIS_CORRECTION: + //printf("accumulator %f\n", accumulator); + //exit(1); + partitionLikelihood += accumulator; + break; + case FELSENSTEIN_CORRECTION: + partitionLikelihood += tr->partitionData[model].invariableWeight * LOG(correction); + // printf("correction %f %f\n", partitionLikelihood, tr->partitionData[model].invariableWeight * LOG(correction)); + break; + default: + assert(0); + } #ifdef _DEBUG_ASC printf("E w: %f %f ARG %f ragu %f\n", partitionLikelihood, (double)w, 1.0 - correction, (double)w * LOG(1.0 - correction)); diff --git a/makenewzGenericSpecial.c b/makenewzGenericSpecial.c index ae8f0a5..5f9f727 100644 --- a/makenewzGenericSpecial.c +++ b/makenewzGenericSpecial.c @@ -1158,13 +1158,18 @@ static void coreGammaFlex(double *gammaRates, double *EIGN, double *sumtable, in } static double coreCatAsc(double *EIGN, double *sumtable, int upper, - volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates) + volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates, + volatile double *standard_ext_dlnLdlz, volatile double *standard_ext_d2lnLdlz2, + volatile double *felsenstein_ext_dlnLdlz, volatile double *felsenstein_ext_d2lnLdlz2, + double *weightVector) { double diagptable[1024], lh = 0.0, dlnLdlz = 0.0, d2lnLdlz2 = 0.0, + standard_dlnLdlz = 0.0, + standard_d2lnLdlz2 = 0.0, ki, kisqr; @@ -1202,28 +1207,55 @@ static double coreCatAsc(double *EIGN, double *sumtable, int upper, d2lnLidlz2 += tmp * diagptable[l * 4 + 2]; } + if(weightVector) + { + double + standard_inv_Li = inv_Li, + standard_dlnLidlz = dlnLidlz, + standard_d2lnLidlz2 = d2lnLidlz2; + + standard_inv_Li = 1.0 / FABS(standard_inv_Li); + + standard_dlnLidlz *= standard_inv_Li; + standard_d2lnLidlz2 *= standard_inv_Li; + + standard_dlnLdlz += weightVector[i] * standard_dlnLidlz; + standard_d2lnLdlz2 += weightVector[i] * (standard_d2lnLidlz2 - standard_dlnLidlz * standard_dlnLidlz); + } + inv_Li = FABS(inv_Li); lh += inv_Li; dlnLdlz += dlnLidlz; - d2lnLdlz2 += d2lnLidlz2; + d2lnLdlz2 += d2lnLidlz2; } *ext_dlnLdlz = (dlnLdlz / (lh - 1.0)); *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0))); + *felsenstein_ext_dlnLdlz = dlnLdlz / lh; + *felsenstein_ext_d2lnLdlz2 = ((d2lnLdlz2 * lh) - (dlnLdlz * dlnLdlz)) / (lh * lh); + + *standard_ext_dlnLdlz = standard_dlnLdlz; + *standard_ext_d2lnLdlz2 = standard_d2lnLdlz2; + return lh; } static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, int upper, - volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates) + volatile double *ext_dlnLdlz, volatile double *ext_d2lnLdlz2, double lz, const int numStates, + volatile double *standard_ext_dlnLdlz, volatile double *standard_ext_d2lnLdlz2, + volatile double *felsenstein_ext_dlnLdlz, volatile double *felsenstein_ext_d2lnLdlz2, + double *weightVector) { double diagptable[1024], lh = 0.0, dlnLdlz = 0.0, d2lnLdlz2 = 0.0, + standard_dlnLdlz = 0.0, + standard_d2lnLdlz2 = 0.0, ki, kisqr; @@ -1269,6 +1301,22 @@ static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, i } } + if(weightVector) + { + double + standard_inv_Li = inv_Li, + standard_dlnLidlz = dlnLidlz, + standard_d2lnLidlz2 = d2lnLidlz2; + + standard_inv_Li = 1.0 / FABS(standard_inv_Li); + + standard_dlnLidlz *= standard_inv_Li; + standard_d2lnLidlz2 *= standard_inv_Li; + + standard_dlnLdlz += weightVector[i] * standard_dlnLidlz; + standard_d2lnLdlz2 += weightVector[i] * (standard_d2lnLidlz2 - standard_dlnLidlz * standard_dlnLidlz); + } + inv_Li = 0.25 * FABS(inv_Li); dlnLidlz *= 0.25; d2lnLidlz2 *= 0.25; @@ -1280,6 +1328,12 @@ static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, i *ext_dlnLdlz = (dlnLdlz / (lh - 1.0)); *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0))); + + *felsenstein_ext_dlnLdlz = dlnLdlz / lh; + *felsenstein_ext_d2lnLdlz2 = ((d2lnLdlz2 * lh) - (dlnLdlz * dlnLdlz)) / (lh * lh); + + *standard_ext_dlnLdlz = standard_dlnLdlz; + *standard_ext_d2lnLdlz2 = standard_d2lnLdlz2; return lh; } @@ -4223,36 +4277,62 @@ void execCore(tree *tr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2) i; double - correction; + *weightVector = (double*)NULL; int w = 0; volatile double d1 = 0.0, - d2 = 0.0; + d2 = 0.0, + standard_d1 = 0.0, + standard_d2 = 0.0, + felsenstein_d1 = 0.0, + felsenstein_d2 = 0.0; - for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) - w += tr->cdta->aliaswgt[i]; + if(tr->ascertainmentCorrectionType == STAMATAKIS_CORRECTION) + weightVector = tr->partitionData[model].invariableFrequencies; switch(tr->rateHetModel) { case CAT: - correction = coreCatAsc(tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, - &d1, &d2, lz, states); + coreCatAsc(tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, + &d1, &d2, lz, states, &standard_d1, &standard_d2, + &felsenstein_d1, &felsenstein_d2, + weightVector); break; case GAMMA: - correction = coreGammaAsc(tr->partitionData[model].gammaRates, tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, - &d1, &d2, lz, states); + coreGammaAsc(tr->partitionData[model].gammaRates, tr->partitionData[model].EIGN, tr->partitionData[model].ascSumBuffer, states, + &d1, &d2, lz, states, &standard_d1, &standard_d2, + &felsenstein_d1, &felsenstein_d2, + weightVector); break; default: assert(0); } - correction = 1.0 - correction; + - _dlnLdlz[branchIndex] = _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1; - _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2- (double)w * d2; + switch(tr->ascertainmentCorrectionType) + { + case LEWIS_CORRECTION: + for(i = tr->partitionData[model].lower; i < tr->partitionData[model].upper; i++) + w += tr->cdta->aliaswgt[i]; + + _dlnLdlz[branchIndex] = _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1; + _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2- (double)w * d2; + break; + case STAMATAKIS_CORRECTION: + _dlnLdlz[branchIndex] += dlnLdlz + standard_d1; + _d2lnLdlz2[branchIndex] += d2lnLdlz2 + standard_d2; + break; + case FELSENSTEIN_CORRECTION: + _dlnLdlz[branchIndex] += dlnLdlz + tr->partitionData[model].invariableWeight * felsenstein_d1; + _d2lnLdlz2[branchIndex] += d2lnLdlz2 + tr->partitionData[model].invariableWeight * felsenstein_d2; + break; + default: + assert(0); + } } else diff --git a/parsePartitions.c b/parsePartitions.c index e277bed..4e5df81 100644 --- a/parsePartitions.c +++ b/parsePartitions.c @@ -95,7 +95,7 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) model[2048] = "", thisModel[2048] = ""; - int + int i = 0, n, j, @@ -111,22 +111,27 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) *ch = *ch + 1; } + ident[i] = '\0'; + n = i; i = 0; + + for(i = 0; i < n; i++) if(ident[i] == ',') containsComma = 1; if(!containsComma) { - printf("Error, model file must have format: DNA or AA model, then a comma, and then the partition name\n"); + printf("Error, model file must have format: Substitution model, then a comma, and then the partition name\n"); exit(-1); } else { boolean - useProteinSubstitutionFile = FALSE, + analyzeRest = TRUE, + useExternalFile = FALSE, found = FALSE; int @@ -156,59 +161,137 @@ static void analyzeIdentifier(char **ch, int modelNumber, tree *tr) if(closeBracket > 0 || openBracket > 0) { if((closeBracket == 1) && (openBracket == 1) && (openPos < closePos)) - useProteinSubstitutionFile = TRUE; + useExternalFile = TRUE; else { - printf("\nError: Apparently you want to specify a user-defined protein substitution model that shall be read from file\n"); - printf("It must be enclosed in opening and closing bracktes like this: [fileName]\n\n"); + printf("\nError: Apparently you want to specify a user-defined protein substitution model\n"); + printf("or ascertainment bias correction model that shall be read from file\n"); + printf("It must be enclosed in opening and closing bracktes like this: [prot=fileName] or [asc=fileName]\n\n"); printf("you specified: %s\n\n", model); exit(-1); } } - if(useProteinSubstitutionFile) + if(useExternalFile) { - char - protFileName[2048] = ""; + char + designator[2048] = "", + fileName[2048] = ""; int - pos, - k, + pos, + index, lower = 0, - upper = i - 1; + upper = i - 1; + + boolean + isProteinFile = TRUE; - while(model[lower] == '[' || model[lower] == ' ') + while(model[lower] == '[') lower++; - while(model[upper] == ']' || model[upper] == ' ') + while(model[upper] == ']') upper--; assert(lower < upper); - for(k = lower, pos = 0; k <= upper; k++, pos++) - protFileName[pos] = model[k]; + index = lower; + pos = 0; + + while(model[index] != '~') + { + designator[pos] = model[index]; + pos++; + index++; + } + + designator[pos] = '\0'; + + if(strcmp(designator, "asc") == 0) + isProteinFile = FALSE; + else + { + if(strcmp(designator, "prot") == 0) + isProteinFile = TRUE; + else + { + printf("Error external partition file type %s does not exist\n", designator); + printf("Available file types: asc and prot\n"); + exit(-1); + } + } - protFileName[pos] = '\0'; + while(model[index] == '~') + index++; + pos = 0; + while(model[index] != ']') + { + fileName[pos] = model[index]; + index++; + pos++; + } + + fileName[pos] = '\0'; - if(!filexists(protFileName)) + if(!filexists(fileName)) { - printf("\n\ncustom protein substitution file [%s] you want to use does not exist!\n", protFileName); + printf("\n\ncustom protein substitution or ascertainment bias file [%s] you want to use does not exist!\n", fileName); printf("you need to specify the full path\n"); printf("the file name shall not contain blanks!\n\n"); - exit(0); + exit(-1); } - - strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, protFileName); - /*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/ - tr->initialPartitionData[modelNumber].protModels = PROT_FILE; - tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE; - tr->initialPartitionData[modelNumber].dataType = AA_DATA; + if(isProteinFile) + { + strcpy(tr->initialPartitionData[modelNumber].proteinSubstitutionFileName, fileName); + /*printf("%s \n", tr->initialPartitionData[modelNumber].proteinSubstitutionFileName);*/ + + tr->initialPartitionData[modelNumber].protModels = PROT_FILE; + tr->initialPartitionData[modelNumber].usePredefinedProtFreqs = TRUE; + tr->initialPartitionData[modelNumber].dataType = AA_DATA; + analyzeRest = FALSE; + } + else + { + int + newIndex = 0; + + strcpy(tr->initialPartitionData[modelNumber].ascFileName, fileName); + + i = 0; + + while(ident[i] != ',') + { + if(ident[i] == '\0') + { + printf("Expecting two commas in string %s\n", ident); + exit(-1); + } + i++; + } + + i++; + + while(ident[i] != ',') + { + if(ident[i] == '\0') + { + printf("Expecting two commas in string %s\n", ident); + exit(-1); + } + model[newIndex] = ident[i]; + i++; + newIndex++; + } + + model[newIndex] = '\0'; + } } - else + + if(analyzeRest) { /* AA */ tr->initialPartitionData[modelNumber].ascBias = FALSE; @@ -1532,6 +1615,7 @@ void parseSecondaryStructure(tree *tr, analdef *adef, int sites) partBuffer[i].partitionName = (char*)rax_malloc((strlen(tr->extendedPartitionData[i].partitionName) + 1) * sizeof(char)); strcpy(partBuffer[i].partitionName, tr->extendedPartitionData[i].partitionName); strcpy(partBuffer[i].proteinSubstitutionFileName, tr->extendedPartitionData[i].proteinSubstitutionFileName); + strcpy(partBuffer[i].ascFileName, tr->extendedPartitionData[i].ascFileName); partBuffer[i].dataType = tr->extendedPartitionData[i].dataType; partBuffer[i].protModels = tr->extendedPartitionData[i].protModels; partBuffer[i].usePredefinedProtFreqs = tr->extendedPartitionData[i].usePredefinedProtFreqs; @@ -1549,6 +1633,7 @@ void parseSecondaryStructure(tree *tr, analdef *adef, int sites) tr->extendedPartitionData[i].partitionName = (char*)rax_malloc((strlen(partBuffer[i].partitionName) + 1) * sizeof(char)); strcpy(tr->extendedPartitionData[i].partitionName, partBuffer[i].partitionName); strcpy(tr->extendedPartitionData[i].proteinSubstitutionFileName, partBuffer[i].proteinSubstitutionFileName); + strcpy(tr->extendedPartitionData[i].ascFileName, partBuffer[i].ascFileName); tr->extendedPartitionData[i].dataType = partBuffer[i].dataType; tr->extendedPartitionData[i].protModels= partBuffer[i].protModels; tr->extendedPartitionData[i].usePredefinedProtFreqs= partBuffer[i].usePredefinedProtFreqs;