3 changes: 3 additions & 0 deletions .gitignore
@@ -92,3 +92,6 @@ ENV/
*.swp
*.swo
*~

# keys for https
support_files/*
192 changes: 153 additions & 39 deletions labman/db/process.py
@@ -3233,23 +3233,66 @@ def generate_prep_information(self):
('r', 'water_lot_id', 'water_lot'),
]
sql = """
SELECT study_id, sample_id, content, run_name, experiment,
fwd_cycles, rev_cycles, principal_investigator,
et.description as sequencer_description,
lpp.epmotion_robot_id as lepmotion_robot_id,
epmotion_tm300_8_tool_id, epmotion_tm50_8_tool_id,
master_mix_id, water_lot_id,
gep.epmotion_robot_id as gepmotion_robot_id,
epmotion_tool_id, kingfisher_robot_id,
extraction_kit_id,
p1.external_id as plate, w1.row_num as row_num,
w1.col_num as col_num,
p2.external_id as primer_composition,
psc.barcode_seq as primer_set_composition,
run_name as run_prefix, sp.sequencer_id as platform_id,
sp.experiment as center_project_name
SELECT study_id, sample_id,
-- BARCODE
psc.barcode_seq AS barcode,
-- primer_set_id links to marker_gene_primer_set_id,
-- where we can get the linker/primer
primer_set_id,
-- primer_plate
p2.external_id AS primer_plate,
-- the well_id is a combination of the
-- row/col_num + content
w1.row_num AS row_num, w1.col_num AS col_num, content,
-- where to get the plating name
process.run_personnel_id AS plating,
-- extractionkit_lot
extraction_kit_id,
-- extraction_robot, both the epmotion robot and the
-- kingfisher robot
lpp.epmotion_robot_id AS lepmotion_robot_id,
kingfisher_robot_id,
-- TM1000_8_tool
epmotion_tool_id,
-- mastermix_lot
master_mix_id,
-- water_lot
water_lot_id,
-- processing_robot
gep.epmotion_robot_id AS gepmotion_robot_id,
-- TM300_8_tool
epmotion_tm300_8_tool_id,
-- TM50_8_tool
epmotion_tm50_8_tool_id,
-- sample_plate
p1.external_id AS sample_plate,
-- project_name
study_alias AS project_name,
-- orig_name
sample_id AS orig_name,
-- experiment_design_description
study_description AS experiment_design_description,
-- run_center
'UCSDMI' AS run_center,
-- primer_date
run_date AS primer_date,
-- run_date
'' AS run_date,
-- RUN_PREFIX
'' AS run_prefix,
-- sequencing_meth
'Sequencing by synthesis' AS sequencing_meth,
-- center_name
'UCSDMI' AS center_name,
-- center_project_name
'' AS center_project_name,
-- instrument_model
et.description AS instrument_model,
-- runid
'' AS runid
-- Retrieve sequencing information
FROM labman.sequencing_process sp
LEFT JOIN labman.process process USING (process_id)
LEFT JOIN labman.equipment e ON (
sequencer_id = equipment_id)
LEFT JOIN labman.equipment_type et ON (
@@ -3300,8 +3343,8 @@ def generate_prep_information(self):
pc.primer_set_composition_id =
psc.primer_set_composition_id)
FULL JOIN qiita.study_sample USING (sample_id)
WHERE sequencing_process_id = %s
ORDER BY study_id, sample_id, row_num, col_num"""
LEFT JOIN qiita.study USING (study_id)
WHERE sequencing_process_id = %s"""
elif assay == self._metagenomics_assay_type:
extra_fields = [
('e', 'gepmotion_robot_id', 'gdata_robot'),
@@ -3380,32 +3423,32 @@ def generate_prep_information(self):
w1.plate_id = p1.plate_id)
FULL JOIN qiita.study_sample USING (sample_id)
WHERE sequencing_process_id = %s
ORDER BY study_id, sample_id, row_num, col_num, i5.barcode_seq
"""

with sql_connection.TRN as TRN:
# to simplify the main queries, let's get all the equipment info
# Let's cache some data to avoid querying the DB multiple times
# 1/3. equipment
TRN.add("""SELECT equipment_id, external_id, notes, description
FROM labman.equipment
LEFT JOIN labman.equipment_type
USING (equipment_type_id)""")
equipment = {}
for row in TRN.execute_fetchindex():
row = dict(row)
eid = row.pop('equipment_id')
equipment[eid] = row

# and the reagents
equipment = {dict(row)['equipment_id']: dict(row)
for row in TRN.execute_fetchindex()}
# 2/3. reagents
TRN.add("""SELECT reagent_composition_id, composition_id,
external_lot_id, description
FROM labman.reagent_composition
LEFT JOIN labman.reagent_composition_type
USING (reagent_composition_type_id)""")
reagent = {}
for row in TRN.execute_fetchindex():
row = dict(row)
rid = row.pop('reagent_composition_id')
reagent[rid] = row
reagent = {dict(row)['reagent_composition_id']: dict(row)
for row in TRN.execute_fetchindex()}
# 3/3. marker gene primer sets
TRN.add("""SELECT marker_gene_primer_set_id, primer_set_id,
target_gene, target_subfragment, linker_sequence,
fwd_primer_sequence, rev_primer_sequence, region
FROM labman.marker_gene_primer_set""")
marker_gene_primer_set = {dict(row)['primer_set_id']: dict(row)
for row in TRN.execute_fetchindex()}

TRN.add(sql, [self.id])
for result in TRN.execute_fetchindex():
@@ -3415,13 +3458,13 @@ def generate_prep_information(self):
content = result.pop('content')

# format well
col = result.pop('col_num')
row = result.pop('row_num')
col = result['col_num']
row = result['row_num']
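# convert the 1-based row number into spreadsheet-style letters
# (1 -> A, 26 -> Z, 27 -> AA) and append the column number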
well = []
while row:
row, rem = divmod(row-1, 26)
well[:0] = container_module.LETTERS[rem]
result['well'] = ''.join(well) + str(col)
result['well_id'] = ''.join(well) + str(col)

# format extra fields list
for t, k, nk in extra_fields:
Expand All @@ -3436,13 +3479,34 @@ def generate_prep_information(self):
result[nk] = val

# format some final fields
result['platform'] = equipment[
result.pop('platform_id')]['description']
result['platform'] = 'Illumina'
result['instrument_model'] = ''
if assay == self._amplicon_assay_type:
result['extraction_robot'] = '%s_%s' % (
result.pop('epmotion_robot'),
result.pop('kingfisher_robot'))
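# keep only the trailing token of the primer plate name
# (expected to be the plate number)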
result['primer_plate'] = result[
'primer_plate'].split(' ')[-1]
mgps = marker_gene_primer_set[result.pop('primer_set_id')]
result['PRIMER'] = '%s%s' % (
mgps['linker_sequence'], mgps['fwd_primer_sequence'])
result['pcr_primers'] = 'FWD:%s; REV:%s' % (
mgps['fwd_primer_sequence'],
mgps['rev_primer_sequence'])
result['linker'] = mgps['linker_sequence']
result['target_gene'] = mgps['target_gene']
result['target_subfragment'] = mgps['target_subfragment']
result['library_construction_protocol'] = (
'Illumina EMP protocol {0} amplification of {1}'
' {2}'.format(mgps['region'], mgps['target_gene'],
mgps['target_subfragment']))

if sid is not None and study_id is not None:
study = Study(study_id)
if study not in data:
data[study] = {}
# if we want the sample_name.well_id as the key, just use content
# instead of sid
data[study][content] = result

if assay == self._metagenomics_assay_type:
@@ -3452,16 +3516,66 @@ def generate_prep_information(self):
if assay == self._metagenomics_assay_type:
result['run_prefix'] = \
SequencingProcess._bcl_scrub_name(content)

blanks[content] = result

# converting from dict to pandas and then to tsv
for study, vals in data.items():
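# blanks are not tied to a study, so they are added to every
# study's prep information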
merged = {**vals, **blanks}
df = pd.DataFrame.from_dict(merged, orient='index')
df.sort_index(inplace=True)
cols = sorted(list(df.columns))
# the index/sample_name should be the original name if it's not
# duplicated or None (blanks/spikes)
dup_names = df[df.orig_name.duplicated()].orig_name.unique()
df.index = [v if v and v not in dup_names else k
for k, v in df.orig_name.iteritems()]
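# well_description is <sample_plate>_<sample_name>_<well_id>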
df['well_description'] = ['%s_%s_%s' % (
x.sample_plate, i, x.well_id) for i, x in df.iterrows()]

# the following lines apply for assay == self._amplicon_assay_type
# when we add shotgun (ToDo: #327), we'll need to modify
# 1/3. renaming columns so they match expected casing
mv = {
'barcode': 'BARCODE', 'master_mix': 'MasterMix_lot',
'platform': 'PLATFORM', 'sample_plate': 'Sample_Plate',
'run_prefix': 'RUN_PREFIX', 'primer_date': 'Primer_date',
'extraction_robot': 'Extraction_robot',
'runid': 'RUNID', 'epmotion_tm50_8_tool': 'TM50_8_tool',
'library_construction_protocol':
'LIBRARY_CONSTRUCTION_PROTOCOL',
'plating': 'Plating', 'linker': 'LINKER',
'project_name': 'Project_name', 'orig_name': 'Orig_name',
'well_id': 'Well_ID', 'water_lot': 'Water_Lot',
'well_description': 'Well_description',
'run_center': 'RUN_CENTER',
'epmotion_tool': 'TM1000_8_tool',
'extraction_kit': 'ExtractionKit_lot',
'primer_plate': 'Primer_Plate', 'run_date': 'RUN_DATE',
'gdata_robot': 'Processing_robot',
'epmotion_tm300_8_tool': 'TM300_8_tool',
'instrument_model': 'INSTRUMENT_MODEL',
'experiment_design_description':
'EXPERIMENT_DESIGN_DESCRIPTION'
}
df.rename(index=str, columns=mv, inplace=True)
# 2/3. sorting rows
rows_order = ['Sample_Plate', 'row_num', 'col_num']
df.sort_values(by=rows_order, inplace=True)
# 3/3. sorting and keeping only required columns
order = [
'BARCODE', 'PRIMER', 'Primer_Plate', 'Well_ID', 'Plating',
'ExtractionKit_lot', 'Extraction_robot', 'TM1000_8_tool',
'Primer_date', 'MasterMix_lot', 'Water_Lot',
'Processing_robot', 'TM300_8_tool', 'TM50_8_tool',
'Sample_Plate', 'Project_name', 'Orig_name',
'Well_description', 'EXPERIMENT_DESIGN_DESCRIPTION',
'LIBRARY_CONSTRUCTION_PROTOCOL', 'LINKER', 'PLATFORM',
'RUN_CENTER', 'RUN_DATE', 'RUN_PREFIX', 'pcr_primers',
'sequencing_meth', 'target_gene', 'target_subfragment',
'center_name', 'center_project_name', 'INSTRUMENT_MODEL',
'RUNID']
df = df[order]
sio = StringIO()
df[cols].to_csv(sio, sep='\t', index_label='sample_name')
df.to_csv(sio, sep='\t', index_label='sample_name')
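# replace the per-study dict with the rendered TSV text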
data[study] = sio.getvalue()

return data
13 changes: 6 additions & 7 deletions labman/db/tests/test_plate.py
@@ -7,7 +7,6 @@
# ----------------------------------------------------------------------------

from unittest import main
from datetime import datetime
from types import GeneratorType

from labman.db.testing import LabmanTestCase
@@ -376,12 +375,12 @@ def test_properties(self):
# orders multiple processes in order from oldest to newest
tester2 = Plate(26)
self.assertEqual(len(tester2.quantification_processes), 2)
self.assertEqual(tester2.quantification_processes[0].date,
datetime.strptime("2017-10-25 19:10:25",
'%Y-%m-%d %H:%M:%S'))
self.assertEqual(tester2.quantification_processes[1].date,
datetime.strptime("2017-10-26 03:10:25",
'%Y-%m-%d %H:%M:%S'))
# we are going to test the dates as strings because the database
# stores the full date (including seconds)
obs_date = str(tester2.quantification_processes[0].date)
self.assertEqual(obs_date, "2017-10-25 19:10:25")
obs_date = str(tester2.quantification_processes[1].date)
self.assertEqual(obs_date, "2017-10-26 03:10:25")

def test_get_well(self):
# Plate 21 - Defined in the test DB