Skip to content

Commit

Permalink
Add mpileup -U option for old MWU scale.
Browse files Browse the repository at this point in the history
For backwards compatibility, the MWU_ZSCORE #ifdef has been replaced
by the -U option.  The default is the new Z score, but we can now
output the same INFO fields as before.
  • Loading branch information
jkbonfield committed Apr 23, 2021
1 parent e4e1610 commit b842297
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 64 deletions.
94 changes: 52 additions & 42 deletions bam2bcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -882,43 +882,44 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
// calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
// calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);

#ifdef MWU_ZSCORE
// U z-normalised as +/- number of standard deviations from mean.
if (call->ori_ref < 0) {
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
bca->npos, 0, 1);
call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
bca->nqual,1,1);
if ( bca->fmt_flag & B2B_INFO_SCB )
call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
100, 0,1);
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
// U z-normalised as +/- number of standard deviations from mean.
if (call->ori_ref < 0) {
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
bca->npos, 0, 1);
call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
bca->nqual,1,1);
if ( bca->fmt_flag & B2B_INFO_SCB )
call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
100, 0,1);
} else {
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
bca->npos, 0, 1);
call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
bca->nqual,1,1);
call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
bca->nqual,0,1);
call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
bca->nqual,0,1);
if ( bca->fmt_flag & B2B_INFO_SCB )
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
100, 0,1);
}
} else {
if (bca->fmt_flag & B2B_INFO_RPB)
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
bca->npos, 0, 1);
bca->npos, 0, 0);
call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
bca->nqual,1,1);
bca->nqual, 1, 0);
call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
bca->nqual,0,1);
bca->nqual, 0, 0);
call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
bca->nqual,0,1);
if ( bca->fmt_flag & B2B_INFO_SCB )
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
100, 0,1);
bca->nqual, 0, 0);
}


#else
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, bca->npos,
0, 0);
call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,0);
call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,0);
call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,0);
#endif

#if CDF_MWU_TESTS
// CDF version of MWU tests is not calculated by default
if ( bca->fmt_flag & B2B_INFO_RPB )
Expand Down Expand Up @@ -1016,18 +1017,27 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);

#ifdef MWU_ZSCORE
if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
if ( bc->mwu_sc != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
#else
if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
#endif
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
if ( bc->mwu_pos != HUGE_VAL )
bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
if ( bc->mwu_mq != HUGE_VAL )
bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL )
bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL )
bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
if ( bc->mwu_sc != HUGE_VAL )
bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
} else {
if ( bc->mwu_pos != HUGE_VAL )
bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
if ( bc->mwu_mq != HUGE_VAL )
bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
if ( bc->mwu_mqs != HUGE_VAL )
bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
if ( bc->mwu_bq != HUGE_VAL )
bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
}

if ( bc->strand_bias != HUGE_VAL )
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
Expand Down
5 changes: 1 addition & 4 deletions bam2bcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@ DEALINGS IN THE SOFTWARE. */
#define CDF_MWU_TESTS 0
#endif

// Use calc_mwu_biasZ as a sd-normalised score centred on 0 instead of the
// old method with values in the range 0 to 1.
#define MWU_ZSCORE

#define B2B_INDEL_NULL 10000

#define B2B_FMT_DP (1<<0)
Expand All @@ -65,6 +61,7 @@ DEALINGS IN THE SOFTWARE. */
#define B2B_INFO_RPB (1<<15)
#define B2B_FMT_QS (1<<16)
#define B2B_INFO_SCB (1<<17)
#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised

#define B2B_MAX_ALLELES 5

Expand Down
13 changes: 13 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1942,6 +1942,19 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for
*--threads* 'INT'::
see *<<common_options,Common Options>>*

*-U, --mwu-u*::
The the previous Mann-Whitney U test score from version 1.12 and
earlier. This is a probability score, but importantly it folds
probabilities above or below the desired score into the same P.
The new Mann-Whitney U test score is a "Z score", expressing the
score as the number of standard deviations away from the mean (with
zero being matching the mean). It keeps both positive and
negative values. This can be important for some tests where
errors are asymmetric.

This option changes the INFO field names produced back to the ones
used by the earlier Bcftools releases. For excample BQBZ becomes
BQB.
==== Options for SNP/INDEL genotype likelihood computation

*-X, --config* 'STR'::
Expand Down
42 changes: 24 additions & 18 deletions mpileup.c
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,8 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
conf->bc.tid = tid; conf->bc.pos = pos;
bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
bcf_clear1(conf->bcf_rec);
bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
conf->bca, 0);
flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);

// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
Expand Down Expand Up @@ -763,21 +764,23 @@ static int mpileup(mplp_conf_t *conf)
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
if ( conf->fmt_flag&B2B_INFO_VDB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
#ifdef MWU_ZSCORE
if ( conf->fmt_flag&B2B_INFO_RPB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
if ( conf->fmt_flag&B2B_INFO_SCB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
#else
if ( conf->fmt_flag&B2B_INFO_RPB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#endif

if (conf->fmt_flag & B2B_INFO_ZSCORE) {
if ( conf->fmt_flag&B2B_INFO_RPB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
if ( conf->fmt_flag&B2B_INFO_SCB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
} else {
if ( conf->fmt_flag&B2B_INFO_RPB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
}

bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
#if CDF_MWU_TESTS
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
Expand Down Expand Up @@ -1132,6 +1135,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -o, --output FILE write output to FILE [standard output]\n"
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
" -U, --mwu-u use older probability scale for Mann-Whitney U test\n"
" --threads INT use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
Expand Down Expand Up @@ -1197,7 +1201,7 @@ int main_mpileup(int argc, char *argv[])
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
// the default to be changed in future, see also parse_format_flag()
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB;
mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
mplp.max_read_len = 500;

static const struct option lopts[] =
Expand Down Expand Up @@ -1257,9 +1261,10 @@ int main_mpileup(int argc, char *argv[])
{"platforms", required_argument, NULL, 'P'},
{"max-read-len", required_argument, NULL, 'M'},
{"config", required_argument, NULL, 'X'},
{"mwu-u", no_argument, NULL, 'U'},
{NULL, 0, NULL, 0}
};
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:",lopts,NULL)) >= 0) {
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
Expand Down Expand Up @@ -1361,6 +1366,7 @@ int main_mpileup(int argc, char *argv[])
mplp.fmt_flag |= parse_format_flag(optarg);
break;
case 'M': mplp.max_read_len = atoi(optarg); break;
case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
case 'X':
if (strcasecmp(optarg, "pacbio-ccs") == 0) {
mplp.min_frac = 0.1;
Expand Down

0 comments on commit b842297

Please sign in to comment.