diff --git a/.gitignore b/.gitignore index a3e4fed..bdb0770 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ bin/bam_stats c/*.o examples/cgp_gnos_pull.ini +/MYMETA.* diff --git a/README.md b/README.md index d44962c..6c55ad0 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,10 @@ Please see the [wiki](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/wiki) for ####Cutting the release 1. Update `lib/PCAP.pm` to the correct version. -2. Update `c/Makefile` to the correct version. -3. Ensure upgrade path for new version number is added to `lib/PCAP.pm`. -4. Update `CHANGES.md` to show major items. -5. Run `./prerelease.sh` -6. Check all tests and coverage reports are acceptable. -7. Commit the updated docs tree and updated module/version. -8. Push commits. -9. Use the GitHub tools to draft a release. +2. Ensure upgrade path for new version number is added to `lib/PCAP.pm`. +3. Update `CHANGES.md` to show major items. +4. Run `./prerelease.sh` +5. Check all tests and coverage reports are acceptable. +6. Commit the updated docs tree and updated module/version. +7. Push commits. +8. Use the GitHub tools to draft a release. diff --git a/c/Makefile b/c/Makefile index 382291e..c0d9c94 100644 --- a/c/Makefile +++ b/c/Makefile @@ -1,4 +1,5 @@ -VERSION=1.10.0 +#VERSION=1.10.0 +VERSION=$(shell perl -I../lib -MPCAP -e 'print PCAP->VERSION;') #Compiler CC = gcc -O3 -DVERSION='"$(VERSION)"' -g @@ -99,9 +100,9 @@ valgrind: .c.o: $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ -clean: +clean: @echo clean - $(RM) ./*.o *~ $(BAM_STATS_TARGET) ./tests/tests_log $(TESTS) ./*.gcda ./*.gcov ./*.gcno *.gcda *.gcov *.gcno ./tests/*.gcda ./tests/*.gcov ./tests/*.gcno + $(RM) ./*.o *~ $(BAM_STATS_TARGET) ./tests/tests_log $(TESTS) ./*.gcda ./*.gcov ./*.gcno *.gcda *.gcov *.gcno ./tests/*.gcda ./tests/*.gcov ./tests/*.gcno -rm -rf $(HTSTMP) diff --git a/c/bam_access.c b/c/bam_access.c index e29c706..c306dc9 100644 --- a/c/bam_access.c +++ b/c/bam_access.c @@ -30,50 +30,26 @@ int get_rg_index_from_rg_store(rg_info_t **grps, char *rg, int grps_size){ return -1; } -void parse_rg_line(char *tmp_line, const int idx, rg_info_t **groups, - char *id, char *sm, char *pl, char *pu, char *lib){ +void parse_rg_line(char *tmp_line, rg_info_t *group) { //Now tokenise tmp_line on \t and read in char *tag = strtok(tmp_line,"\t"); + assert(strcmp(tag,"@RG")==0); + group->id = strdup("\0"); + group->sample = strdup("\0"); + group->platform = strdup("\0"); + group->platform_unit = strdup("\0"); + group->lib = strdup("\0"); + tag = strtok(NULL,"\t"); while(tag != NULL){ - int chk = sscanf(tag,"ID:%[^\t\n]",id); - if(chk == 1){ - groups[idx]->id = (char *) malloc(sizeof(char) * 1000); - strcpy(groups[idx]->id,id); - tag = strtok(NULL,"\t"); - continue; - } - chk=0; - chk = sscanf(tag,"SM:%[^\t\n]",sm); - if(chk == 1){ - groups[idx]->sample = (char *) malloc(sizeof(char) * 1000); - strcpy(groups[idx]->sample,sm); - tag = strtok(NULL,"\t"); - continue; - } - chk = 0; - chk = sscanf(tag,"PL:%[^\t\n]",pl); - if(chk == 1){ - groups[idx]->platform = (char *) malloc(sizeof(char) * 1000); - strcpy(groups[idx]->platform,pl); - tag = strtok(NULL,"\t"); - continue; - } - chk = 0; - chk = sscanf(tag,"PU:%[^\t\n]",pu); - if(chk == 1){ - groups[idx]->platform_unit = (char *) malloc(sizeof(char) * 1000); - strcpy(groups[idx]->platform_unit,pu); - tag = strtok(NULL,"\t"); - continue; - } - chk = 0; - chk = sscanf(tag,"LB:%[^\t\n]",lib); - if(chk == 1){ - groups[idx]->lib = (char *) malloc(sizeof(char) * 1000); - strcpy(groups[idx]->lib,lib); - tag = strtok(NULL,"\t"); - continue; - } + assert(tag[2]==':'); + tag[2]=0; + char *val = tag+3; + if (strcmp("ID",tag)==0) group->id = strdup(val); + if (strlen(val)==0) val = "."; + if (strcmp("SM",tag)==0) group->sample = strdup(val); + if (strcmp("PL",tag)==0) group->platform = strdup(val); + if (strcmp("PU",tag)==0) group->platform_unit = strdup(val); + if (strcmp("LB",tag)==0) group->lib = strdup(val); tag = strtok(NULL,"\t"); }//End of iterating through tags in this RG tmp_line return; @@ -85,8 +61,7 @@ rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_sta rg_info_t **groups; int size = 0; char *head_txt = head->text; - char *head_bac = malloc(sizeof(char) * (strlen(head_txt)+1)); - strncpy(head_bac,head_txt,strlen(head_txt)+1); + char *head_bac = strdup(head_txt); //First pass counts read groups line = strtok(head_txt,"\n"); while(line != NULL){ @@ -100,73 +75,41 @@ rg_info_t **parse_header(bam_hdr_t *head, int *grps_size, stats_rd_t ****grp_sta //We now have the number of read groups, assign the RG id to each. groups = (rg_info_t**) malloc(sizeof(rg_info_t *) * size); check_mem(groups); - char ** ptr = malloc(sizeof(char **)); - check_mem(ptr); - line = strtok_r(head_bac,"\n",ptr); + char *ptr = NULL; + line = strtok_r(head_bac,"\n",&ptr); int idx = 0; while(line != NULL){ //Check for a read group line if(strncmp(line,"@RG",3)==0){ groups[idx] = (rg_info_t *) malloc(sizeof(rg_info_t)); check_mem(groups[idx]); - char *id = malloc(sizeof(char) * 1000); - check_mem(id); - id[0] = '\0'; - char *sm = malloc(sizeof(char) * 1000); - check_mem(sm); - sm[0] = '\0'; - char *pl = malloc(sizeof(char) * 1000); - check_mem(pl); - pl[0] = '\0'; - char *pu = malloc(sizeof(char) * 1000); - check_mem(pu); - pu[0] = '\0'; - char *lib = malloc(sizeof(char) * 1000); - check_mem(lib); - lib[0] = '\0'; - char * tmp = malloc(sizeof(char) * (strlen(line)+1)); - strcpy(tmp,line); - parse_rg_line(tmp,idx,groups,id,sm,pl,pu,lib); - check((id != NULL),"Error recognising ID from RG line. NULL found."); - check((strcmp(id,"")!=0),"Error recognising ID from RG line. Empty string."); - check((id[0]!='\0'),"Error recognising ID from RG line. Empty string."); - check((sm != NULL),"Error recognising SM from RG line."); - if((sm[0]=='\0')){ - groups[idx]->sample = "."; - } - check(pl != NULL,"Error recognising PL from RG line."); - if((pl[0]=='\0')){ - groups[idx]->platform = "."; - } - check(lib != NULL,"Error recognising LB from RG line."); - if((lib[0]=='\0')){ - groups[idx]->lib = "."; - } - check(pu != NULL,"Error recognising PU from RG line."); - if((pu[0]=='\0')){ - groups[idx]->platform_unit = "."; - } + char * tmp = strdup(line); + parse_rg_line(tmp,groups[idx]); free (tmp); + + check((groups[idx]->id != NULL),"Error recognising ID from RG line. NULL found."); + check((groups[idx]->id[0]!='\0'),"Error recognising ID from RG line. Empty string."); + check((groups[idx]->sample != NULL),"Error recognising SM from RG line."); + if(groups[idx]->sample[0] == '\0') groups[idx]->sample = strdup("."); + check(groups[idx]->platform != NULL,"Error recognising PL from RG line."); + if(groups[idx]->platform[0] == '\0') groups[idx]->platform = strdup("."); + check(groups[idx]->lib != NULL,"Error recognising LB from RG line."); + if(groups[idx]->lib[0] == '\0') groups[idx]->lib = strdup("."); + check(groups[idx]->platform_unit != NULL,"Error recognising PU from RG line."); + if(groups[idx]->platform_unit[0] == '\0') groups[idx]->platform_unit = strdup("."); idx++; }//End of iteration through header lines. - line = strtok_r(NULL,"\n",ptr); + line = strtok_r(NULL,"\n",&ptr); } - free(ptr); if(line) free(line); }else{ //Deal with a possible lack of @RG lines. groups = malloc(sizeof(rg_info_t*) * 1); check_mem(groups); - groups[0]->id = malloc(sizeof(char*) * 100); - groups[0]->sample = malloc(sizeof(char) * 100); - groups[0]->platform = malloc(sizeof(char) * 100); - groups[0]->platform_unit = malloc(sizeof(char) * 100); - groups[0]->lib = malloc(sizeof(char) * 100); - - groups[0]->id = "."; - groups[0]->sample = "."; - groups[0]->platform = "."; - groups[0]->platform_unit = "."; - groups[0]->lib = "."; + groups[0]->id = strdup("."); + groups[0]->sample = strdup("."); + groups[0]->platform = strdup("."); + groups[0]->platform_unit = strdup("."); + groups[0]->lib = strdup("."); size = 1; } *grp_stats = (stats_rd_t***) malloc(sizeof(stats_rd_t**) * (size)); diff --git a/c/bam_stats.c b/c/bam_stats.c index 6e96ff9..25202dd 100644 --- a/c/bam_stats.c +++ b/c/bam_stats.c @@ -32,13 +32,13 @@ #include "khash.h" static char *input_file = NULL; -static char *output_file; -static char *ref_file; +static char *output_file = NULL; +static char *ref_file = NULL; static int rna = 0; int grps_size = 0; stats_rd_t*** grp_stats; static char *bas_header = "bam_filename\tsample\tplatform\tplatform_unit\tlibrary\treadgroup\tread_length_r1\tread_length_r2\t#_mapped_bases\t#_mapped_bases_r1\t#_mapped_bases_r2\t#_divergent_bases\t#_divergent_bases_r1\t#_divergent_bases_r2\t#_total_reads\t#_total_reads_r1\t#_total_reads_r2\t#_mapped_reads\t#_mapped_reads_r1\t#_mapped_reads_r2\t#_mapped_reads_properly_paired\t#_gc_bases_r1\t#_gc_bases_r2\tmean_insert_size\tinsert_size_sd\tmedian_insert_size\t#_duplicate_reads\n"; -static char *rg_line_pattern = "%s\t%s\t%s\t%s\t%s\t%s\t%ld\t%ld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%.3f\t%.3f\t%.3f\t%lld\n"; +static char *rg_line_pattern = "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%d\n"; int check_exist(char *fname){ @@ -52,7 +52,7 @@ int check_exist(char *fname){ void print_version (int exit_code){ printf ("%s\n",VERSION); - return; + exit(exit_code); } void print_usage (int exit_code){ @@ -64,7 +64,7 @@ void print_usage (int exit_code){ printf ("-r --ref-file File path to reference index (.fai) file.\n"); printf (" NB. If cram format is supplied via -b and the reference listed in the cram header can't be found bam_stats may fail to work correctly.\n"); printf ("-a --rna Uses the RNA method of calculating insert size (ignores anything outside ± ('sd'*standard_dev) of the mean in calculating a new mean)\n"); - + printf ("Other:\n"); printf ("-h --help Display this usage information.\n"); printf ("-v --version Prints the version number.\n\n"); @@ -104,11 +104,11 @@ void options(int argc, char *argv[]){ case 'r': ref_file = optarg; break; - + case 'a': rna = 1; break; - + case 'h': print_usage(0); break; @@ -197,7 +197,7 @@ int calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, dou tt_sd += val; }); - + if(tt_sd){ double variance = fabs((double)((double)pp_sd / (double)tt_sd)); @@ -205,7 +205,7 @@ int calculate_mean_sd_median_insert_size(khash_t(ins) *inserts,double *mean, dou }else{ *sd = 0; } - + kh_destroy(ins, inserts); } //End of if we have data to calculate from. diff --git a/docs.tar.gz b/docs.tar.gz index d4c9938..d8abd4c 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/PCAP.pm b/lib/PCAP.pm index 3948381..1d3e65d 100644 --- a/lib/PCAP.pm +++ b/lib/PCAP.pm @@ -24,7 +24,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '1.11.1'; +our $VERSION = '1.12.0'; our @EXPORT = qw($VERSION); const my $LICENSE => @@ -74,6 +74,7 @@ const my %UPGRADE_PATH => ( '0.1.0' => 'biobambam,bwa,samtools', '1.10.0' => 'biobambam', '1.11.0' => 'biobambam', '1.11.1' => 'biobambam', + '1.12.0' => 'biobambam', ); sub license { diff --git a/lib/PCAP/Bam.pm b/lib/PCAP/Bam.pm index 457c0e1..d2666f0 100644 --- a/lib/PCAP/Bam.pm +++ b/lib/PCAP/Bam.pm @@ -159,7 +159,7 @@ sub _which { } sub sample_name { - my $bam = shift; + my ($bam, $die_no_sample) = @_; my $sam = sam_ob($bam); my $header = $sam->header->text; my $sample; @@ -168,6 +168,12 @@ sub sample_name { die "BAM file appears to contain data for multiple samples, not supported: \n\n$header\n" if(defined $sample && $sample ne $new_sample); $sample = $new_sample; } + unless(defined $sample) { + if(defined $die_no_sample && $die_no_sample != 0) { + die "ERROR: Failed to find samplename in RG headers of $bam"; + } + warn "WARN: Failed to find samplename in RG headers of $bam\n"; + } return ($sample, $sam); # also return the SAM object } diff --git a/t/pcapBam.t b/t/pcapBam.t index 2a02798..b499d55 100644 --- a/t/pcapBam.t +++ b/t/pcapBam.t @@ -1,6 +1,7 @@ use strict; use Test::More; use Test::Fatal; +use Test::Warn; use File::Spec; use Try::Tiny qw(try catch finally); use Const::Fast qw(const); @@ -123,6 +124,11 @@ subtest 'header_info checks' => sub { , qr/ERROR: This BAM has no readgroups:/m , 'Fail when no read groups found in header'); + warning_like {($sample, $bio_db_sam) = PCAP::Bam::sample_name($no_rg_bam);} qr/WARN: Failed to find samplename in RG headers of /, "Warn when no sample found"; + + like(exception{ PCAP::Bam::sample_name($no_rg_bam, 1) } + , qr/ERROR: Failed to find samplename in RG headers of /m + , 'Die when no sample found (and die flag set)'); }; done_testing();