@@ -3233,23 +3233,66 @@ def generate_prep_information(self):
32333233                ('r' , 'water_lot_id' , 'water_lot' ),
32343234            ]
32353235            sql  =  """ 
3236-                 SELECT study_id, sample_id, content, run_name, experiment, 
3237-                        fwd_cycles, rev_cycles, principal_investigator, 
3238-                        et.description as sequencer_description, 
3239-                        lpp.epmotion_robot_id as lepmotion_robot_id, 
3240-                        epmotion_tm300_8_tool_id, epmotion_tm50_8_tool_id, 
3241-                        master_mix_id, water_lot_id, 
3242-                        gep.epmotion_robot_id as gepmotion_robot_id, 
3243-                        epmotion_tool_id, kingfisher_robot_id, 
3244-                        extraction_kit_id, 
3245-                        p1.external_id as plate, w1.row_num as row_num, 
3246-                        w1.col_num as col_num, 
3247-                        p2.external_id as primer_composition, 
3248-                        psc.barcode_seq as primer_set_composition, 
3249-                        run_name as run_prefix, sp.sequencer_id as platform_id, 
3250-                        sp.experiment as center_project_name 
3236+                 SELECT study_id, sample_id, 
3237+                     -- BARCODE 
3238+                     psc.barcode_seq AS barcode, 
3239+                     -- primer_set_id links to marker_gene_primer_set_id, 
3240+                     -- where we can get the linker/primer 
3241+                     primer_set_id, 
3242+                     -- primer_plate 
3243+                     p2.external_id AS primer_plate, 
3244+                     -- the well_id is a combination of the 
3245+                     --- row/col_num + content 
3246+                     w1.row_num AS row_num, w1.col_num AS col_num, content, 
3247+                     -- where to get the platting name 
3248+                     process.run_personnel_id AS plating, 
3249+                     -- extractionkit_lot 
3250+                     extraction_kit_id, 
3251+                     -- extraction_robot, both the epmotion robot and the 
3252+                     -- kingfisher robot 
3253+                     lpp.epmotion_robot_id AS lepmotion_robot_id, 
3254+                     kingfisher_robot_id, 
3255+                     -- TM1000_8_tool 
3256+                     epmotion_tool_id, 
3257+                     -- mastermix_lot 
3258+                     master_mix_id, 
3259+                     -- water_lot 
3260+                     water_lot_id, 
3261+                     -- processing_robot 
3262+                     gep.epmotion_robot_id AS gepmotion_robot_id, 
3263+                     -- TM300_8_tool 
3264+                     epmotion_tm300_8_tool_id, 
3265+                     -- TM50_8_tool 
3266+                     epmotion_tm50_8_tool_id, 
3267+                     -- sample_plate 
3268+                     p1.external_id AS sample_plate, 
3269+                     -- project_name 
3270+                     study_alias AS project_name, 
3271+                     -- orig_name 
3272+                     sample_id AS orig_name, 
3273+                     -- experiment_design_description 
3274+                     study_description AS experiment_design_description, 
3275+                     -- run_center 
3276+                     'UCSDMI' AS run_center, 
3277+                     -- primer_date 
3278+                     run_date AS primer_date, 
3279+                     -- run_date 
3280+                     '' AS run_date, 
3281+                     -- RUN_PREFIX 
3282+                     '' AS run_prefix, 
3283+                     -- sequencing_meth 
3284+                     'Sequencing by synthesis' AS sequencing_meth, 
3285+                     -- center_name 
3286+                     'UCSDMI' AS center_name, 
3287+                     -- center_project_name 
3288+                     '' AS center_project_name, 
3289+                     -- instrument_model 
3290+                     et.description AS instrument_model, 
3291+                     -- runid 
3292+                     '' AS runid 
32513293                -- Retrieve sequencing information 
32523294                FROM labman.sequencing_process sp 
3295+                 LEFT JOIN labman.process process USING (process_id) 
32533296                LEFT JOIN labman.equipment e ON ( 
32543297                    sequencer_id = equipment_id) 
32553298                LEFT JOIN labman.equipment_type et ON ( 
@@ -3300,8 +3343,8 @@ def generate_prep_information(self):
33003343                    pc.primer_set_composition_id = 
33013344                    psc.primer_set_composition_id) 
33023345                FULL JOIN qiita.study_sample USING (sample_id) 
3303-                 WHERE sequencing_process_id = %s  
3304-                 ORDER BY study_id, sample_id, row_num, col_num """ 
3346+                 LEFT JOIN qiita.study USING (study_id)  
3347+                 WHERE sequencing_process_id = %s """ 
33053348        elif  assay  ==  self ._metagenomics_assay_type :
33063349            extra_fields  =  [
33073350                ('e' , 'gepmotion_robot_id' , 'gdata_robot' ),
@@ -3380,32 +3423,32 @@ def generate_prep_information(self):
33803423                    w1.plate_id = p1.plate_id) 
33813424                FULL JOIN qiita.study_sample USING (sample_id) 
33823425                WHERE sequencing_process_id = %s 
3383-                 ORDER BY study_id, sample_id, row_num, col_num, i5.barcode_seq 
33843426                """ 
33853427
33863428        with  sql_connection .TRN  as  TRN :
3387-             # to simplify the main queries, let's get all the equipment info 
3429+             # Let's cache some data to avoid querying the DB multiple times 
3430+             # 1/3. equipment 
33883431            TRN .add ("""SELECT equipment_id, external_id, notes, description 
33893432                       FROM labman.equipment 
33903433                       LEFT JOIN labman.equipment_type 
33913434                       USING (equipment_type_id)""" )
3392-             equipment  =  {}
3393-             for  row  in  TRN .execute_fetchindex ():
3394-                 row  =  dict (row )
3395-                 eid  =  row .pop ('equipment_id' )
3396-                 equipment [eid ] =  row 
3397- 
3398-             # and the reagents 
3435+             equipment  =  {dict (row )['equipment_id' ]: dict (row )
3436+                          for  row  in  TRN .execute_fetchindex ()}
3437+             # 2/3. reagents 
33993438            TRN .add ("""SELECT reagent_composition_id, composition_id, 
34003439                           external_lot_id, description 
34013440                       FROM labman.reagent_composition 
34023441                       LEFT JOIN labman.reagent_composition_type 
34033442                       USING (reagent_composition_type_id)""" )
3404-             reagent  =  {}
3405-             for  row  in  TRN .execute_fetchindex ():
3406-                 row  =  dict (row )
3407-                 rid  =  row .pop ('reagent_composition_id' )
3408-                 reagent [rid ] =  row 
3443+             reagent  =  {dict (row )['reagent_composition_id' ]: dict (row )
3444+                        for  row  in  TRN .execute_fetchindex ()}
3445+             # 3/3. marker gene primer sets 
3446+             TRN .add ("""SELECT marker_gene_primer_set_id, primer_set_id, 
3447+                            target_gene, target_subfragment, linker_sequence, 
3448+                            fwd_primer_sequence, rev_primer_sequence, region 
3449+                        FROM labman.marker_gene_primer_set""" )
3450+             marker_gene_primer_set  =  {dict (row )['primer_set_id' ]: dict (row )
3451+                                       for  row  in  TRN .execute_fetchindex ()}
34093452
34103453            TRN .add (sql , [self .id ])
34113454            for  result  in  TRN .execute_fetchindex ():
@@ -3415,13 +3458,13 @@ def generate_prep_information(self):
34153458                content  =  result .pop ('content' )
34163459
34173460                # format well 
3418-                 col  =  result . pop ( 'col_num' ) 
3419-                 row  =  result . pop ( 'row_num' ) 
3461+                 col  =  result [ 'col_num' ] 
3462+                 row  =  result [ 'row_num' ] 
34203463                well  =  []
34213464                while  row :
34223465                    row , rem  =  divmod (row - 1 , 26 )
34233466                    well [:0 ] =  container_module .LETTERS [rem ]
3424-                 result ['well ' ] =  '' .join (well ) +  str (col )
3467+                 result ['well_id ' ] =  '' .join (well ) +  str (col )
34253468
34263469                # format extra fields list 
34273470                for  t , k , nk  in  extra_fields :
@@ -3436,13 +3479,34 @@ def generate_prep_information(self):
34363479                    result [nk ] =  val 
34373480
34383481                # format some final fields 
3439-                 result ['platform' ] =  equipment [
3440-                     result .pop ('platform_id' )]['description' ]
3482+                 result ['platform' ] =  'Illumina' 
3483+                 result ['instrument_model' ] =  '' 
3484+                 if  assay  ==  self ._amplicon_assay_type :
3485+                     result ['extraction_robot' ] =  '%s_%s'  %  (
3486+                         result .pop ('epmotion_robot' ),
3487+                         result .pop ('kingfisher_robot' ))
3488+                     result ['primer_plate' ] =  result [
3489+                         'primer_plate' ].split (' ' )[- 1 ]
3490+                     mgps  =  marker_gene_primer_set [result .pop ('primer_set_id' )]
3491+                     result ['PRIMER' ] =  '%s%s'  %  (
3492+                         mgps ['linker_sequence' ], mgps ['fwd_primer_sequence' ])
3493+                     result ['pcr_primers' ] =  'FWD:%s; REV:%s'  %  (
3494+                         mgps ['fwd_primer_sequence' ],
3495+                         mgps ['rev_primer_sequence' ])
3496+                     result ['linker' ] =  mgps ['linker_sequence' ]
3497+                     result ['target_gene' ] =  mgps ['target_gene' ]
3498+                     result ['target_subfragment' ] =  mgps ['target_subfragment' ]
3499+                     result ['library_construction_protocol' ] =  (
3500+                         'Illumina EMP protocol {0} amplification of {1}' 
3501+                         ' {2}' .format (mgps ['region' ], mgps ['target_gene' ],
3502+                                       mgps ['target_subfragment' ]))
34413503
34423504                if  sid  is  not None  and  study_id  is  not None :
34433505                    study  =  Study (study_id )
34443506                    if  study  not  in data :
34453507                        data [study ] =  {}
3508+                     # if we want the sample_name.well_id, just replace sid 
3509+                     # for content 
34463510                    data [study ][content ] =  result 
34473511
34483512                    if  assay  ==  self ._metagenomics_assay_type :
@@ -3452,16 +3516,66 @@ def generate_prep_information(self):
34523516                    if  assay  ==  self ._metagenomics_assay_type :
34533517                        result ['run_prefix' ] =  \
34543518                            SequencingProcess ._bcl_scrub_name (content )
3519+ 
34553520                    blanks [content ] =  result 
34563521
34573522        # converting from dict to pandas and then to tsv 
34583523        for  study , vals  in  data .items ():
34593524            merged  =  {** vals , ** blanks }
34603525            df  =  pd .DataFrame .from_dict (merged , orient = 'index' )
3461-             df .sort_index (inplace = True )
3462-             cols  =  sorted (list (df .columns ))
3526+             # the index/sample_name should be the original name, unless it 
3527+             # is duplicated or None (blanks/spikes); then keep the current key 
3528+             dup_names  =  df [df .orig_name .duplicated ()].orig_name .unique ()
3529+             df .index  =  [v  if  v  and  v  not  in dup_names  else  k 
3530+                         for  k , v  in  df .orig_name .iteritems ()]
3531+             df ['well_description' ] =  ['%s_%s_%s'  %  (
3532+                 x .sample_plate , i , x .well_id ) for  i , x  in  df .iterrows ()]
3533+ 
3534+             # the following lines apply for assay == self._amplicon_assay_type; 
3535+             # when we add shotgun (ToDo: #327), we'll need to modify them 
3536+             # 1/3. renaming columns so they match expected casing 
3537+             mv  =  {
3538+                 'barcode' : 'BARCODE' , 'master_mix' : 'MasterMix_lot' ,
3539+                 'platform' : 'PLATFORM' , 'sample_plate' : 'Sample_Plate' ,
3540+                 'run_prefix' : 'RUN_PREFIX' , 'primer_date' : 'Primer_date' ,
3541+                 'extraction_robot' : 'Extraction_robot' ,
3542+                 'runid' : 'RUNID' , 'epmotion_tm50_8_tool' : 'TM50_8_tool' ,
3543+                 'library_construction_protocol' :
3544+                     'LIBRARY_CONSTRUCTION_PROTOCOL' ,
3545+                 'plating' : 'Plating' , 'linker' : 'LINKER' ,
3546+                 'project_name' : 'Project_name' , 'orig_name' : 'Orig_name' ,
3547+                 'well_id' : 'Well_ID' ,  'water_lot' : 'Water_Lot' ,
3548+                 'well_description' : 'Well_description' ,
3549+                 'run_center' : 'RUN_CENTER' ,
3550+                 'epmotion_tool' : 'TM1000_8_tool' ,
3551+                 'extraction_kit' : 'ExtractionKit_lot' ,
3552+                 'primer_plate' : 'Primer_Plate' , 'run_date' : 'RUN_DATE' ,
3553+                 'gdata_robot' : 'Processing_robot' ,
3554+                 'epmotion_tm300_8_tool' : 'TM300_8_tool' ,
3555+                 'instrument_model' : 'INSTRUMENT_MODEL' ,
3556+                 'experiment_design_description' :
3557+                     'EXPERIMENT_DESIGN_DESCRIPTION' 
3558+             }
3559+             df .rename (index = str , columns = mv , inplace = True )
3560+             # 2/3. sorting rows 
3561+             rows_order  =  ['Sample_Plate' , 'row_num' , 'col_num' ]
3562+             df .sort_values (by = rows_order , inplace = True )
3563+             # 3/3. sorting and keeping only required columns 
3564+             order  =  [
3565+                 'BARCODE' , 'PRIMER' , 'Primer_Plate' , 'Well_ID' , 'Plating' ,
3566+                 'ExtractionKit_lot' , 'Extraction_robot' , 'TM1000_8_tool' ,
3567+                 'Primer_date' , 'MasterMix_lot' , 'Water_Lot' ,
3568+                 'Processing_robot' , 'TM300_8_tool' , 'TM50_8_tool' ,
3569+                 'Sample_Plate' , 'Project_name' , 'Orig_name' ,
3570+                 'Well_description' , 'EXPERIMENT_DESIGN_DESCRIPTION' ,
3571+                 'LIBRARY_CONSTRUCTION_PROTOCOL' , 'LINKER' , 'PLATFORM' ,
3572+                 'RUN_CENTER' , 'RUN_DATE' , 'RUN_PREFIX' , 'pcr_primers' ,
3573+                 'sequencing_meth' , 'target_gene' , 'target_subfragment' ,
3574+                 'center_name' , 'center_project_name' , 'INSTRUMENT_MODEL' ,
3575+                 'RUNID' ]
3576+             df  =  df [order ]
34633577            sio  =  StringIO ()
3464-             df [ cols ] .to_csv (sio , sep = '\t ' , index_label = 'sample_name' )
3578+             df .to_csv (sio , sep = '\t ' , index_label = 'sample_name' )
34653579            data [study ] =  sio .getvalue ()
34663580
34673581        return  data 
0 commit comments