Relaxing metadata requirements #1021

Closed
Changes from 1 commit
Commits
50 commits
All commits authored by josenavas:

ee28b13  Mar 28, 2015  Adding patch
5c60e61  Mar 28, 2015  Fixing populate test db
39d2429  Mar 28, 2015  Fixing base objects
5cc71a0  Mar 28, 2015  New metadata template code layout
1b8dbdb  Mar 28, 2015  Improving base class and tests
2bc0117  Mar 29, 2015  Solving SampleTemplate creation issues
4212b04  Mar 29, 2015  Adding few files that I forgot on the previous commit
9b1b9d7  Mar 29, 2015  Fixing the sample template class
ad1948f  Mar 29, 2015  Moving read only tests to their own class
09eca70  Mar 29, 2015  Moving some code to the base class
cc2be6b  Mar 29, 2015  Fixing tests for PrepSample
1a2c955  Mar 29, 2015  Fixing all prep template tests
2ce7f65  Mar 29, 2015  Fixing pep8
6c728c8  Mar 30, 2015  Adding utilities
03be340  Mar 30, 2015  Moving db read only tests to its own class
af5f5de  Mar 30, 2015  Creating constants module for holding metadata constants
a67a71c  Mar 30, 2015  Adding squirrelo's code
7254b68  Mar 30, 2015  Removing unneeded files
fd7608a  Mar 30, 2015  Fixing bug on delete and reducing code duplication
4260c39  Mar 31, 2015  Adding column checker
8c3d32e  Mar 31, 2015  linkerprimersequnce->primer & barcodesequence->barcode
d3fedea  Mar 31, 2015  Add datatype specific column checking
47aac33  Mar 31, 2015  reducing code duplication
d3fe0ac  Mar 31, 2015  Reducing code duplicatio on object creation
6402e10  Mar 31, 2015  Adding comments
91e67ef  Mar 31, 2015  constants.py -> column_restriction.py
336f470  Mar 31, 2015  Enforcing lowercasing when loading templates
58422b8  Mar 31, 2015  Fixing search tests
ab0c7ae  Mar 31, 2015  Exposing needed functions
879d53f  Mar 31, 2015  Fixing setup tests
71b08f5  Mar 31, 2015  Fixing util tests
b091863  Mar 31, 2015  Removing functions that are no longer needed
a4cc4ab  Mar 31, 2015  Simplifying to_file function
8711478  Apr 1, 2015   Adding missing prep template to the repo and to the database + fix al…
29ef682  Apr 1, 2015   Forgot the tests for the job object
242293d  Apr 1, 2015   Fixing analysis object...
9d61f9a  Apr 1, 2015   Adding metadata_template to setup.py
e443d30  Apr 1, 2015   Adding qiime_map_fp property to prep template
b4462ca  Apr 1, 2015   Adding regenerate files function to prep template
bb2193e  Apr 1, 2015   Fixing bug in prep template creation
8e8de2a  Apr 1, 2015   Fixing qiita_ware for requirement changes
c9eb332  Apr 1, 2015   Fixing one remaining test
4a0d797  Apr 1, 2015   Adding check_restrictions func to base metadata template
9af44f6  Apr 1, 2015   exposing all the needed things
944818a  Apr 1, 2015   Disabling ebi submission if missing columns
2084f39  Apr 1, 2015   Disabling preprocessing and processed data approval based on missing …
3d0687e  Apr 1, 2015   Updating dbs and html files
3a1aef4  Apr 1, 2015   Merge branch 'master' of github.com:qiime/QiiTa into relaxing-metadat…
303a747  Apr 1, 2015   Addressing @squirrelo's comment
7d81c2c  Apr 1, 2015   Adding missing comma that was breaking flake8 but not the code?
Fixing qiita_ware for requirement changes
josenavas committed Apr 1, 2015
commit 8e8de2aae70564616b33d3493e9750b622999640
43 changes: 24 additions & 19 deletions qiita_ware/processing_pipeline.py
@@ -43,30 +43,35 @@ def _get_qiime_minimal_mapping(prep_template, out_dir):
     from os.path import join
     import pandas as pd

-    # Get the data in a pandas DataFrame, so it is easier to manage
-    pt = prep_template.to_dataframe()
-
-    # We now need to rename some columns to be QIIME compliant.
-    # Hopefully, this conversion won't be needed if QIIME relaxes its
-    # constraints
-    pt.rename(columns={'barcodesequence': 'BarcodeSequence',
-                       'linkerprimersequence': 'LinkerPrimerSequence'},
-              inplace=True)
-    pt['Description'] = pd.Series(['Qiita MMF'] * len(pt.index),
-                                  index=pt.index)
+    # The prep templates has a QIIME mapping file, get it
+    qiime_map = pd.read_csv(prep_template.qiime_map_fp, sep='\t',
+                            keep_default_na=False, na_values=['unknown'],
+                            index_col=False)
+    qiime_map.set_index('#SampleID', inplace=True, drop=True)
+
+    # We use our own description to avoid potential processing problems
+    qiime_map['Description'] = pd.Series(['Qiita MMF'] * len(qiime_map.index),
+                                         index=qiime_map.index)

     # We ensure the order of the columns as QIIME is expecting
     cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']

-    # If the study has more than 1 lane, we should generate a qiita MMF for
-    # each of the lanes. We know how to split the prep template based on
-    # the run_prefix column
-    output_fps = []
     path_builder = partial(join, out_dir)
-    for prefix, df in pt.groupby('run_prefix'):
-        df = df[cols]
-        out_fp = path_builder("%s_MMF.txt" % prefix)
-        output_fps.append(out_fp)
+    if 'run_prefix' in qiime_map:
+        # The study potentially has more than 1 lane, so we should generate a
+        # qiita MMF for each of the lanes. We know how to split the prep
+        # template based on the run_prefix column
+        output_fps = []
+        for prefix, df in qiime_map.groupby('run_prefix'):
+            df = df[cols]
+            out_fp = path_builder("%s_MMF.txt" % prefix)
+            output_fps.append(out_fp)
+            df.to_csv(out_fp, index_label="#SampleID", sep='\t')
+    else:
+        # The study only has one lane, just write the MMF
+        df = qiime_map[cols]
+        out_fp = path_builder("prep_%d_MMF.txt" % prep_template.id)
+        output_fps = [out_fp]
         df.to_csv(out_fp, index_label="#SampleID", sep='\t')

     return output_fps
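Stripped of the Qiita objects, the new behavior reduces to a groupby-with-fallback over `run_prefix`. The following is a minimal standalone sketch in plain pandas — the function name and signature are illustrative, not Qiita's API:

```python
import pandas as pd
from functools import partial
from os.path import join


def split_mapping_file(qiime_map, out_dir, template_id):
    """Write one minimal mapping file (MMF) per run_prefix group.

    Sketch of the groupby/fallback pattern in the diff above; this is
    not the Qiita implementation itself.
    """
    cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']
    path_builder = partial(join, out_dir)
    output_fps = []
    if 'run_prefix' in qiime_map:
        # Potentially multiple lanes: one file per run prefix
        for prefix, df in qiime_map.groupby('run_prefix'):
            out_fp = path_builder("%s_MMF.txt" % prefix)
            output_fps.append(out_fp)
            df[cols].to_csv(out_fp, index_label="#SampleID", sep='\t')
    else:
        # Single lane: one file named after the template id
        out_fp = path_builder("prep_%d_MMF.txt" % template_id)
        output_fps.append(out_fp)
        qiime_map[cols].to_csv(out_fp, index_label="#SampleID", sep='\t')
    return output_fps
```

The design point of the commit is visible here: instead of failing when `run_prefix` is missing, the template now degrades gracefully to a single mapping file.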
84 changes: 43 additions & 41 deletions qiita_ware/test/test_processing_pipeline.py
@@ -207,16 +207,19 @@ def test_get_preprocess_fasta_cmd_sff_no_run_prefix(self):
         self.assertEqual(obs_cmds[3], exp_cmd_4)

     def test_get_preprocess_fasta_cmd_sff_run_prefix(self):
-        raw_data = RawData(3)
-        params = Preprocessed454Params(1)
-        prep_template = PrepTemplate(1)
-
         # Need to alter the run_prefix of one sample so we can test the
         # multiple values
         conn_handler = SQLConnectionHandler()
         sql = ("UPDATE qiita.prep_1 SET run_prefix='test1' WHERE "
                "sample_id = '1.SKM9.640192'")
         conn_handler.execute(sql)

+        raw_data = RawData(3)
+        params = Preprocessed454Params(1)
+        prep_template = PrepTemplate(1)
+        # Since we change the prep template, we need to re-generated the
+        # prep and qiime mapping files
+        prep_template.regenerate_files()
+
         obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
             raw_data, prep_template, params)
@@ -242,23 +245,25 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix(self):
     def test_get_preprocess_fasta_cmd_sff_run_prefix_match(self):
         # Test that the run prefixes in the prep_template and the file names
         # actually match and raise an error if not
+        file_count = get_count('qiita.filepath')
         conn_handler = SQLConnectionHandler()
-        sql = ("""
-            INSERT INTO qiita.filepath (filepath_id, filepath,
-                filepath_type_id, checksum, checksum_algorithm_id,
-                data_directory_id) VALUES (19, '1_new.sff', 17, 852952723, 1,
-                5);
-            INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
-                (3, 19);
-            UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
-            UPDATE qiita.prep_1 SET run_prefix='new' WHERE
-                sample_id = '1.SKB8.640193';
-            """)
-        conn_handler.execute(sql)
+        sql = """INSERT INTO qiita.filepath (filepath, filepath_type_id,
+                    checksum, checksum_algorithm_id, data_directory_id)
+                 VALUES ('1_new.sff', 17, 852952723, 1, 5);
+                 INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
+                    VALUES (3, %s);
+                 UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
+                 UPDATE qiita.prep_1 SET run_prefix='new'
+                    WHERE sample_id = '1.SKB8.640193';"""
+        fp_id = file_count + 1
+        conn_handler.execute(sql, (fp_id,))

         raw_data = RawData(3)
         params = Preprocessed454Params(1)
         prep_template = PrepTemplate(1)
+        # Since we change the prep template, we need to re-generated the
+        # prep and qiime mapping files
+        prep_template.regenerate_files()

         obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
             raw_data, prep_template, params)
@@ -287,48 +292,45 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix_match(self):
     def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_1(self):
         # Test that the run prefixes in the prep_template and the file names
         # actually match and raise an error if not
+        file_count = get_count('qiita.filepath')
         conn_handler = SQLConnectionHandler()
-        sql = ("""
-            INSERT INTO qiita.filepath (filepath_id, filepath,
-                filepath_type_id, checksum, checksum_algorithm_id,
-                data_directory_id) VALUES (19, '1_new.sff', 17, 852952723, 1,
-                5);
-            INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
-                (3, 19);
-            INSERT INTO qiita.filepath (filepath_id, filepath,
-                filepath_type_id, checksum, checksum_algorithm_id,
-                data_directory_id) VALUES (20, '1_error.sff', 17, 852952723,
-                1, 5);
-            INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
-                (3, 20);
-            UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
-            UPDATE qiita.prep_1 SET run_prefix='new' WHERE
-                sample_id = '1.SKB8.640193';
-            """)
-        conn_handler.execute(sql)
+        sql = """INSERT INTO qiita.filepath (filepath, filepath_type_id,
+                    checksum, checksum_algorithm_id, data_directory_id)
+                 VALUES ('1_new.sff', 17, 852952723, 1, 5);
+                 INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
+                    VALUES (3, %s);
+                 INSERT INTO qiita.filepath (filepath, filepath_type_id,
+                    checksum, checksum_algorithm_id, data_directory_id)
+                 VALUES ('1_error.sff', 17, 852952723, 1, 5);
+                 INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
+                    VALUES (3, %s);
+                 UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
+                 UPDATE qiita.prep_1 SET run_prefix='new'
+                    WHERE sample_id = '1.SKB8.640193';"""
+        conn_handler.execute(sql, (file_count + 1, file_count + 2))

         raw_data = RawData(3)
         params = Preprocessed454Params(1)
         prep_template = PrepTemplate(1)
+        prep_template.regenerate_files()

         with self.assertRaises(ValueError):
             _get_preprocess_fasta_cmd(raw_data, prep_template, params)

     def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_2(self):
         # Should raise error
         conn_handler = SQLConnectionHandler()
-        sql = ("""
-            UPDATE qiita.prep_1 SET run_prefix='test1';
-            UPDATE qiita.prep_1 SET run_prefix='test2' WHERE
-                sample_id = '1.SKB2.640194';
-            UPDATE qiita.prep_1 SET run_prefix='error' WHERE
-                sample_id = '1.SKB8.640193';
-            """)
+        sql = """UPDATE qiita.prep_1 SET run_prefix='test1';
+                 UPDATE qiita.prep_1 SET run_prefix='test2'
+                    WHERE sample_id = '1.SKB2.640194';
+                 UPDATE qiita.prep_1 SET run_prefix='error'
+                    WHERE sample_id = '1.SKB8.640193';"""
         conn_handler.execute(sql)

         raw_data = RawData(3)
         params = Preprocessed454Params(1)
         prep_template = PrepTemplate(1)
+        prep_template.regenerate_files()

         with self.assertRaises(ValueError):
             _get_preprocess_fasta_cmd(raw_data, prep_template, params)
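The test changes above replace hardcoded `filepath_id` literals (19, 20) with ids derived from the current row count, bound as query parameters. The sketch below reproduces that pattern with sqlite3 (which uses `?` placeholders rather than the `%s` placeholders of Qiita's Postgres handler); the `get_count` stand-in and table layout are assumptions for illustration, not Qiita code:

```python
import sqlite3


def get_count(conn, table):
    # Stand-in for the get_count helper the tests assume: current row count
    return conn.execute('SELECT COUNT(*) FROM %s' % table).fetchone()[0]


def insert_filepath(conn, name):
    """Insert a row whose id is derived from the row count and bound as a
    parameter, instead of hardcoding a literal id such as 19."""
    fp_id = get_count(conn, 'filepath') + 1
    conn.execute('INSERT INTO filepath (filepath_id, filepath) VALUES (?, ?)',
                 (fp_id, name))
    return fp_id


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE filepath (filepath_id INTEGER PRIMARY KEY, '
             'filepath TEXT)')
# Rows already present, as a freshly populated test database would have
conn.executemany('INSERT INTO filepath (filepath_id, filepath) VALUES (?, ?)',
                 [(1, 'a.sff'), (2, 'b.sff')])
```

Count-derived ids only line up with the sequence when existing ids are dense and start at 1, which holds for a freshly populated test database; the benefit is that the tests no longer break whenever the setup data grows.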
13 changes: 7 additions & 6 deletions qiita_ware/test/test_util.py
@@ -98,11 +98,12 @@ def test_dataframe_from_template(self):
                              u'SKM7.640188', u'SKM8.640201', u'SKM9.640192'})

         self.assertTrue(set(obs.columns), {
-            u'tot_org_carb', u'common_name', u'has_extracted_data',
+            u'tot_org_carb', u'common_name', u'dna_extracted',
             u'required_sample_info_status', u'water_content_soil',
             u'env_feature', u'assigned_from_geo', u'altitude', u'env_biome',
-            u'texture', u'has_physical_specimen', u'description_duplicate',
-            u'physical_location', u'latitude', u'ph', u'host_taxid',
+            u'texture', u'physical_specimen_remaining',
+            u'description_duplicate', u'physical_specimen_location',
+            u'latitude', u'ph', u'host_taxid',
             u'elevation', u'description', u'collection_timestamp',
             u'taxon_id', u'samp_salinity', u'host_subject_id', u'sample_type',
             u'season_environment', u'temp', u'country', u'longitude',
@@ -288,8 +289,8 @@ def test_hdf5IO_open(self):
     'experiment_design_description': [('micro biome of soil and rhizosphere '
                                        'of cannabis plants from CA', 27)],
     'experiment_title': [('Cannabis Soil Microbiome', 27)],
-    'has_extracted_data': [('True', 27)],
-    'has_physical_specimen': [('True', 27)],
+    'dna_extracted': [('True', 27)],
+    'physical_specimen_remaining': [('True', 27)],
     'host_subject_id': [('1001:B1', 1),
                         ('1001:B2', 1),
                         ('1001:B3', 1),
@@ -397,7 +398,7 @@ def test_hdf5IO_open(self):
                    ('96.0693176066', 1)],
     'pcr_primers': [('FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 27)],
     'ph': [('6.8', 9), ('6.82', 10), ('6.94', 8)],
-    'physical_location': [('ANL', 27)],
+    'physical_specimen_location': [('ANL', 27)],
     'platform': [('Illumina', 27)],
     'required_sample_info_status': [('completed', 27)],
     'run_center': [('ANL', 27)],
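The fixture updates above track a rename of several required-metadata columns. For templates produced before the rename, a migration is a one-line pandas rename; the mapping below is taken from this PR's diffs, while the helper itself is hypothetical:

```python
import pandas as pd

# Old -> new column names, as seen in the diffs above
RENAMES = {
    'has_extracted_data': 'dna_extracted',
    'has_physical_specimen': 'physical_specimen_remaining',
    'physical_location': 'physical_specimen_location',
}


def migrate_template_columns(df):
    """Return a copy of a metadata template DataFrame with the pre-PR
    column names mapped to their new equivalents; columns not listed
    in RENAMES are left untouched."""
    return df.rename(columns=RENAMES)
```

Since `rename` ignores mapping keys that are absent from the frame, the helper is safe to run on templates that already use the new names.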