uclahs-cds · yashpatel6 · Jun 4, 2024 · May 29, 2024 · Jun 1, 2024 · Jun 1, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ---
 
 ## [Unreleased]
+### Added
++ Support for inputs to be given through a CSV
 
 ---
 

diff --git a/README.md b/README.md
@@ -121,6 +121,23 @@ All samples from the same patient are processed as a single job.
 
 ## Inputs
 
+For CSV inputs, identify the fields needed for each input type below and include the respective columns. For mixed inputs, leave the cells that do not apply to a row empty - see the template CSVs for examples.
+
+For YAML inputs, see template YAMLs. In each template YAML, any key or value in `<>` needs to be filled in and the `<>` removed, e.g. `<patient1>` should be filled in with the actual patient ID `PRAD0001`. Other keys not in `<>` must be kept as they are.
+```YAML
+---
+input:
+    <patient1>:
+...
+```
+should be filled in to become:
+```YAML
+---
+input:
+    PRAD0001:
+...
+```
+
 ### Input BAM
 
 > **BETA** - See warning above

diff --git a/config/input_handler.config b/config/input_handler.config
@@ -4,6 +4,121 @@ includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/csv/csv_pa
 *   Namespace for handling input parsing and parameter settings based on inputs
 */
 input_handler {
+    /**
+    *   If input CSV is given, parse and convert into format produced by direct YAML input
+    *
+    *   Does nothing when `params.input_csv` is absent or when `params.input` is already set.
+    *   Otherwise builds a nested map keyed by patient, then sample, holding the sample `state`
+    *   plus one entry per detected input type, and stores it in `params.input`.
+    *
+    *   An input type is parsed only when ALL of its columns are present in the CSV header.
+    *   Rows with any empty field for a type are skipped for that type.
+    */
+    convert_csv_inputs = {
+        if (!params.containsKey('input_csv') || params.containsKey('input')) {
+            return;
+        }
+
+        // Columns every CSV must contain regardless of input type
+        def common_cols = [
+            'patient',
+            'sample',
+            'state'
+        ];
+
+        // Per-type CSV columns; an optional `key_map` renames a CSV column to the key used in the output map
+        def input_types = [
+            'BAM': [
+                'cols': ['path']
+            ],
+            'FASTQ': [
+                'cols': [
+                    'read_group_identifier',
+                    'sequencing_center',
+                    'library_identifier',
+                    'platform_technology',
+                    'platform_unit',
+                    'bam_header_sm',
+                    'lane',
+                    'read1_fastq',
+                    'read2_fastq'
+                ]
+            ],
+            'SRC': [
+                'cols': [
+                    'src_input_type',
+                    'src_input_algorithm',
+                    'src_path'
+                ],
+                'key_map': [
+                    'src_input_type': 'src_input_type',
+                    'src_input_algorithm': 'algorithm',
+                    'src_path': 'path'
+                ]
+            ]
+        ];
+
+        // Read only the header row; withReader closes the file handle once the closure returns
+        def header_line = new File(params.input_csv.toString()).withReader { csv_reader -> csv_reader.readLine() };
+        // An empty file yields a null header line; fall back to '' so the column assertion below reports the problem
+        def header_cols = (header_line ?: '').split(',') as List;
+
+        def cols_to_parse = [];
+        def types_to_parse = [];
+
+        // Make sure common columns are given
+        assert header_cols.containsAll(common_cols) : "The given input CSV does not contain the expected common columns: `${common_cols}`";
+        cols_to_parse += common_cols;
+
+        // Select every input type whose full column set appears in the header
+        input_types.each { type, type_info ->
+            if (header_cols.containsAll(type_info.cols)) {
+                types_to_parse += type;
+                cols_to_parse += type_info.cols;
+            }
+        }
+
+        def given_csv_inputs = csv_parser.parse_csv(params.input_csv, cols_to_parse, true);
+
+        def given_inputs = [:];
+        given_csv_inputs.each { csv_row ->
+            if (!given_inputs.containsKey(csv_row.patient)) {
+                given_inputs[csv_row.patient] = [:];
+            }
+
+            // First row for a sample records its state; later rows must agree with it
+            if (!given_inputs[csv_row.patient].containsKey(csv_row.sample)) {
+                given_inputs[csv_row.patient][csv_row.sample] = ['state': csv_row.state];
+            } else {
+                assert csv_row.state == given_inputs[csv_row.patient][csv_row.sample]['state'] : "Multiple states were given for the sample `${csv_row.sample}`! Each sample should have only one state."
+            }
+
+            def col_key = null;
+            def parsed_map = [:];
+            types_to_parse.each { parse_type ->
+                parsed_map = [:];
+                input_types[parse_type]['cols'].each { col_to_parse ->
+                    col_key = (input_types[parse_type].containsKey('key_map')) ? input_types[parse_type]['key_map'][col_to_parse] : col_to_parse;
+                    parsed_map[col_key] = csv_row[col_to_parse];
+                }
+
+                if (parsed_map.any{ parsed_val -> !parsed_val.value }) {
+                    System.out.println("INFO - Found empty fields for `${parse_type}` on row `${csv_row}` - skipping.");
+                    // `return` inside `each` only ends this iteration, i.e. moves on to the next input type
+                    return;
+                }
+
+                if (parse_type == 'BAM') {
+                    // BAM is stored as a single map per sample; a differing second entry is an error
+                    if (given_inputs[csv_row.patient][csv_row.sample].containsKey('BAM')) {
+                        if (given_inputs[csv_row.patient][csv_row.sample]['BAM'] != parsed_map) {
+                            throw new IllegalArgumentException("Sample `${csv_row.sample}` for patient `${csv_row.patient}` was given multiple BAMs! Only a single BAM per sample should be given.");
+                        }
+                    }
+
+                    given_inputs[csv_row.patient][csv_row.sample]['BAM'] = parsed_map;
+                } else {
+                    // All other types are stored as a list of maps per sample, with exact duplicates dropped
+                    if (!given_inputs[csv_row.patient][csv_row.sample].containsKey(parse_type)) {
+                        given_inputs[csv_row.patient][csv_row.sample][parse_type] = [];
+                    }
+
+                    if (given_inputs[csv_row.patient][csv_row.sample][parse_type].contains(parsed_map)) {
+                        System.out.println("INFO - Found duplicate entry in CSV: `${parsed_map}` for type `${parse_type}` - the duplicate will be skipped.");
+                    } else {
+                        given_inputs[csv_row.patient][csv_row.sample][parse_type] << parsed_map;
+                    }
+                }
+            }
+        }
+
+        params.input = given_inputs;
+    }
+
     /**
     *   Verify inputs and determine input type
     */
@@ -73,6 +188,17 @@ input_handler {
         params.src_input_given = src_input_given;
         params.src_input_types = src_input_given ? given_src_types[0] : [];
 
+        // Check that the bam_header_sm is the same as sample for FASTQ input
+        if (input_type == 'FASTQ') {
+            params.input.each { patient, patient_data ->
+                patient_data.each { sample, sample_data ->
+                    sample_data.FASTQ.each { fastq_data ->
+                        assert fastq_data.bam_header_sm == sample : "Input FASTQs must have matching sample and bam_header_sm! Received `${fastq_data.bam_header_sm}` for sample: `${sample}`";
+                    }
+                }
+            }
+        }
+
         // Check that the same inputs have been given for all samples
         def given_inputs_per_state = ['normal': [] as Set, 'tumor': [] as Set];
         params.input.each { patient, patient_data ->

diff --git a/config/methods.config b/config/methods.config
@@ -341,6 +341,7 @@ methods {
     }
 
     set_up = {
+        input_handler.convert_csv_inputs()
         schema.load_custom_types("${projectDir}/config/custom_schema_types.config")
         schema.validate_specific("${projectDir}/config/schema.yaml", params, ['pipeline_params'])
         input_handler.handle_inputs()

diff --git a/config/schema.yaml b/config/schema.yaml
@@ -1,4 +1,9 @@
 ---
+input_csv:
+  type: 'Path'
+  mode: 'r'
+  required: false
+  help: 'Absolute path to input CSV if inputs are being provided through CSV'
 output_dir:
   type: 'Path'
   mode: 'w'

diff --git a/config/template.config b/config/template.config
@@ -3,6 +3,7 @@ includeConfig "${projectDir}/config/methods.config"
 includeConfig "${projectDir}/nextflow.config"
 
 params {
+    input_csv = '' // Path to input CSV if inputs are being given through CSV; if using YAML inputs, remove this parameter from the config
     output_dir = ''
     leading_work_dir = '' //Should be a /hot/path | Can't use /scratch
     pipeline_work_dir = '/scratch' //For individual pipeline outputs before copying to output_dir

diff --git a/external/pipeline-Nextflow-config b/external/pipeline-Nextflow-config
diff --git a/input/template-input-BAM.csv b/input/template-input-BAM.csv
@@ -0,0 +1,3 @@
+patient,sample,state,path
+patient1,sample1,normal,/path/to/bam
+patient1,sample2,tumor,/path/to/bam
diff --git a/input/template-input-BAM.yaml b/input/template-input-BAM.yaml
@@ -1,15 +1,15 @@
 ---
 input:
-  patient1:
-    sample1:
-      state: "state - normal or tumor"
+  <patient1>:  # Replace key with actual patient ID
+    <sample1>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       BAM:
-        path: /path/to/bam
-    sample2:
-      state: "state - normal or tumor"
+        path: </path/to/bam>  # Absolute path to BAM
+    <sample2>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       BAM:
-        path: /path/to/bam
-    sample3:
-      state: "state - normal or tumor"
+        path: </path/to/bam>  # Absolute path to BAM
+    <sample3>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       BAM:
-        path: /path/to/bam
+        path: </path/to/bam>  # Absolute path to BAM
diff --git a/input/template-input-CRAM.csv b/input/template-input-CRAM.csv
@@ -0,0 +1,3 @@
+patient,sample,state,path
+patient1,sample1,normal,/path/to/cram
+patient1,sample2,tumor,/path/to/cram
diff --git a/input/template-input-CRAM.yaml b/input/template-input-CRAM.yaml
@@ -1,15 +1,15 @@
 ---
 input:
-  patient1:
-    sample1:
-      state: "state - normal or tumor"
+  <patient1>:  # Replace key with actual patient ID
+    <sample1>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       CRAM:
-        path: /path/to/cram
-    sample2:
-      state: "state - normal or tumor"
+        path: </path/to/CRAM>  # Absolute path to CRAM
+    <sample2>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       CRAM:
-        path: /path/to/cram
-    sample3:
-      state: "state - normal or tumor"
+        path: </path/to/CRAM>  # Absolute path to CRAM
+    <sample3>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       CRAM:
-        path: /path/to/cram
+        path: </path/to/CRAM>  # Absolute path to CRAM
diff --git a/input/template-input-FASTQ.csv b/input/template-input-FASTQ.csv
@@ -0,0 +1,7 @@
+patient,sample,state,read_group_identifier,sequencing_center,library_identifier,platform_technology,platform_unit,bam_header_sm,lane,read1_fastq,read2_fastq
+patient1,sample1,normal,rg1,sc1,lb1,pl1,pu1,sample1,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
+patient1,sample1,normal,rg2,sc2,lb2,pl2,pu2,sample1,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
+patient1,sample2,tumor,rg3,sc3,lb3,pl3,pu3,sample2,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
+patient1,sample2,tumor,rg4,sc4,lb4,pl4,pu4,sample2,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
+patient1,sample3,tumor,rg5,sc5,lb5,pl5,pu5,sample3,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
+patient1,sample3,tumor,rg6,sc6,lb6,pl6,pu6,sample3,L001,/path/to/r1.fq.gz,/path/to/r2.fq.gz
diff --git a/input/template-input-FASTQ.yaml b/input/template-input-FASTQ.yaml
@@ -1,66 +1,66 @@
 ---
 input:
-  patient1:
-    sample1:
-      state: "state - normal or tumor"
+  <patient1>:  # Replace key with actual patient ID
+    <sample1>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       FASTQ:
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
-    sample2:
-      state: "state - normal or tumor"
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ
+    <sample2>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       FASTQ:
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
-    sample3:
-      state: "state - normal or tumor"
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ
+    <sample3>:  # Replace key with actual sample ID
+      state: <state>  # normal or tumor
       FASTQ:
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
-        - read_group_identifier: "Read group identifier for alignment"
-          sequencing_center: "Center where sequencing was performed"
-          library_identifier: "Library used for sample"
-          platform_technology: "Technology used for sequencing"
-          platform_unit: "Name of specific platform unit"
-          bam_header_sm: "Sample name tag for BAM"
-          lane: <lane>
-          read1_fastq: "/path/to/r1.fq.gz"
-          read2_fastq: "/path/to/r2.fq.gz"
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ
+        - read_group_identifier: <rg ID>  # Read group identifier for alignment
+          sequencing_center: <center>  # Center where sequencing was performed
+          library_identifier: <library>  # Library used for sample
+          platform_technology: <platform technology>  # Technology used for sequencing
+          platform_unit: <platform unit>  # Name of specific platform unit
+          bam_header_sm: <sample>  # Sample name tag for BAM
+          lane: <lane>  # Lane on platform
+          read1_fastq: </path/to/r1.fastq.gz>  # Absolute path to R1 FASTQ
+          read2_fastq: </path/to/r2.fastq.gz>  # Absolute path to R2 FASTQ