Skip to content

Commit

Permalink
Add explicit --group-samples-tag option. Resolves samtools#1370
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Jan 19, 2021
1 parent 8a744dd commit e4bc1b7
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 24 deletions.
26 changes: 9 additions & 17 deletions mcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -262,15 +262,8 @@ static void init_sample_groups(call_t *call)
return;
}

// Parse tag (optional) and file name
char *fname = call->sample_groups;
while ( *fname && *fname!=':' ) fname++;
if ( *fname )
if ( call->sample_groups_tag )
{
call->sample_groups_tag = call->sample_groups;
*fname = 0;
fname++;

// Is the tag defined in the header?
int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag);
if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag);
Expand All @@ -286,11 +279,10 @@ static void init_sample_groups(call_t *call)
if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD";
else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n");
}
fname = call->sample_groups;
}

// Read samples/groups
if ( !strcmp("-",fname) )
if ( !strcmp("-",call->sample_groups) )
{
// single-sample calling, each sample creates its own group
call->nsmpl_grp = nsmpl;
Expand All @@ -305,8 +297,8 @@ static void init_sample_groups(call_t *call)
else
{
int nlines;
char **lines = hts_readlist(fname, 1, &nlines);
if ( !lines ) error("Could not read the file: %s\n", fname);
char **lines = hts_readlist(call->sample_groups, 1, &nlines);
if ( !lines ) error("Could not read the file: %s\n", call->sample_groups);

uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t));
Expand All @@ -317,14 +309,14 @@ static void init_sample_groups(call_t *call)
{
char *ptr = lines[i];
while ( *ptr && !isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]);
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
char *tmp = ptr;
while ( *ptr && isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]);
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
*tmp = 0;
int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
if ( ismpl<0 ) continue;
if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],fname);
if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
if ( !khash_str2int_has_key(grp2idx,ptr+1) )
{
khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp);
Expand All @@ -337,12 +329,12 @@ static void init_sample_groups(call_t *call)
smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples
}
khash_str2int_destroy(grp2idx);
if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", fname);
if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups);

call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp));
for (i=0; i<nsmpl; i++)
{
if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],fname);
if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
int igrp = smpl2grp[i] - 1;
if ( !call->smpl_grp[igrp].nsmpl )
call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
Expand Down
8 changes: 4 additions & 4 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -278,10 +278,10 @@
test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped');
test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.2.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples');
test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1.out',args=>'-mv');
test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1b.out',args=>'-mv -G AD:-');
test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1b.out',args=>'-mv -G - --group-samples-tag AD');
test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.2.out',args=>'-mv');
test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G AD:-'); # 21,3,0 becomes 0/0 because of the prior -P
test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G AD:{PATH}/mpileup.hwe.samples');
test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G - --group-samples-tag AD'); # 21,3,0 becomes 0/0 because of the prior -P
test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G {PATH}/mpileup.hwe.samples --group-samples-tag AD');
test_vcf_call_cAls($opts,in=>'mpileup',out=>'mpileup.cAls.out',tab=>'mpileup');
test_vcf_call_cAls($opts,in=>'mpileup.2',out=>'mpileup.cAls.2.out',tab=>'mpileup.2');
test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.3.out',tab=>'mpileup.3',args=>'-i');
Expand All @@ -297,7 +297,7 @@
test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped');
test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.2.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples');
test_vcf_call($opts,in=>'call-G',out=>'call-G.1.out',args=>'-mv');
test_vcf_call($opts,in=>'call-G',out=>'call-G.2.out',args=>'-mv -G AD:-');
test_vcf_call($opts,in=>'call-G',out=>'call-G.2.out',args=>'-mv -G - --group-samples-tag AD');
test_vcf_call($opts,in=>'call-G.2',out=>'call-G.2.1.out',args=>'-mv -F AN_POP,AC_POP');
test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.1.out',args=>'-m');
test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.2.out',args=>'-m -G {PATH}/call.af-fixation.txt');
Expand Down
8 changes: 5 additions & 3 deletions vcfcall.c
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,6 @@ static void destroy_data(args_t *args)
free(args->samples_map);
free(args->sample2sex);
free(args->aux.ploidy);
free(args->aux.sample_groups);
free(args->str.s);
if ( args->gvcf ) gvcf_destroy(args->gvcf);
bcf_hdr_destroy(args->aux.hdr);
Expand Down Expand Up @@ -892,8 +891,9 @@ static void usage(args_t *args)
// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n");
// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n");
fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
fprintf(stderr, " -G, --group-samples [TAG:]FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n");
Expand Down Expand Up @@ -951,6 +951,7 @@ int main_vcfcall(int argc, char *argv[])
{"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"group-samples",required_argument,NULL,'G'},
{"group-samples-tag",required_argument,NULL,3},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
{"regions",required_argument,NULL,'r'},
Expand Down Expand Up @@ -989,7 +990,8 @@ int main_vcfcall(int argc, char *argv[])
case 1 : ploidy = optarg; break;
case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break;
case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break;
case 'G': args.aux.sample_groups = strdup(optarg); break;
case 'G': args.aux.sample_groups = optarg; break;
case 3 : args.aux.sample_groups_tag = optarg; break;
case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n");
case 'a':
if (optarg[0]=='?') { list_annotations(stderr); return 1; }
Expand Down

0 comments on commit e4bc1b7

Please sign in to comment.