Merge pull request #75 from CenterForMedicalGeneticsGhent/feature/cra…

…m_support sam/cram input support
CenterForMedicalGeneticsGhent · Mar 24, 2021 · b2d38a8 · b2d38a8
2 parents 3436705 + b878a0b
commit b2d38a8
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ conda install -f -c conda-forge -c bioconda wisecondorx
 ### Running WisecondorX
 
 There are three main stages (converting, reference creating and predicting) when using WisecondorX:  
-- Convert .bam to .npz files (for both reference and test samples)  
+- Convert aligned reads to .npz files (for both reference and test samples)
 - Create a reference (using reference .npz files)  
     - **Important notes**
         - Automated gender prediction, required to consistently analyze sex chromosomes, is based on a Gaussian mixture 
@@ -56,11 +56,11 @@ There are three main stages (converting, reference creating and predicting) when
         observe additional improvement concerning normalization.  
 - Predict copy number alterations (using the reference file and test .npz cases of interest)  
 
-### Stage (1) Convert .bam to .npz
+### Stage (1) Convert aligned reads (sam/bam/cram) to .npz
 
 ```bash
 
-WisecondorX convert input.bam output.npz [--optional arguments]
+WisecondorX convert input.sam/bam/cram output.npz [--optional arguments]
 ```
 
 <br>Optional argument <br><br> | Function  

diff --git a/wisecondorX/convert_tools.py b/wisecondorX/convert_tools.py
@@ -4,21 +4,31 @@
 
 import numpy as np
 import pysam
+import sys
 
 '''
-Converts bam file to numpy array by transforming
+Converts aligned reads file to numpy array by transforming
 individual reads to counts per bin.
 '''
 
 
-def convert_bam(args):
+def convert_reads(args):
     bins_per_chr = dict()
     for chr in range(1, 25):
         bins_per_chr[str(chr)] = None
 
     logging.info('Importing data ...')
 
-    bam_file = pysam.AlignmentFile(args.infile, 'rb')
+    if args.infile.endswith(".sam"):
+        reads_file = pysam.AlignmentFile(args.infile, 'r')
+    elif args.infile.endswith(".bam"):
+        reads_file = pysam.AlignmentFile(args.infile, 'rb')
+    elif args.infile.endswith(".cram"):
+        reads_file = pysam.AlignmentFile(args.infile, 'rc')
+    else:
+        logging.error(
+            "Unsupported input file type. Make sure your input filename has a correct extension (sam/bam/cram)")
+        sys.exit(1)
 
     reads_seen = 0
     reads_kept = 0
@@ -28,9 +38,9 @@ def convert_bam(args):
     larp = -1
     larp2 = -1
 
-    logging.info('Converting bam ... This might take a while ...')
+    logging.info('Converting aligned reads ... This might take a while ...')
 
-    for index, chr in enumerate(bam_file.references):
+    for index, chr in enumerate(reads_file.references):
 
         chr_name = chr
         if chr_name[:3].lower() == 'chr':
@@ -39,9 +49,9 @@ def convert_bam(args):
             continue
 
         logging.info('Working at {}; processing {} bins'
-                     .format(chr, int(bam_file.lengths[index] / float(args.binsize) + 1)))
-        counts = np.zeros(int(bam_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32)
-        bam_chr = bam_file.fetch(chr)
+                     .format(chr, int(reads_file.lengths[index] / float(args.binsize) + 1)))
+        counts = np.zeros(int(reads_file.lengths[index] / float(args.binsize) + 1), dtype=np.int32)
+        bam_chr = reads_file.fetch(chr)
 
         if chr_name == 'X':
             chr_name = '23'
@@ -81,9 +91,9 @@ def convert_bam(args):
         bins_per_chr[chr_name] = counts
         reads_kept += sum(counts)
 
-    qual_info = {'mapped': bam_file.mapped,
-                 'unmapped': bam_file.unmapped,
-                 'no_coordinate': bam_file.nocoordinate,
+    qual_info = {'mapped': reads_file.mapped,
+                 'unmapped': reads_file.unmapped,
+                 'no_coordinate': reads_file.nocoordinate,
                  'filter_rmdup': reads_rmdup,
                  'filter_mapq': reads_mapq,
                  'pre_retro': reads_seen,

diff --git a/wisecondorX/main.py b/wisecondorX/main.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from wisecondorX.convert_tools import convert_bam
+from wisecondorX.convert_tools import convert_reads
 from wisecondorX.newref_control import tool_newref_prep, tool_newref_main, tool_newref_merge
 from wisecondorX.newref_tools import train_gender_model, get_mask
 from wisecondorX.overall_tools import gender_correct, scale_sample
@@ -20,7 +20,7 @@
 def tool_convert(args):
     logging.info('Starting conversion')
 
-    sample, qual_info = convert_bam(args)
+    sample, qual_info = convert_reads(args)
     np.savez_compressed(args.outfile,
                         binsize=args.binsize,
                         sample=sample,