Skip to content

Commit bb3e574

Browse files
Merge pull request #328 from antgonza/fix-301
fix #301
2 parents 75cfd27 + 8621e64 commit bb3e574

File tree

6 files changed

+190
-74
lines changed

6 files changed

+190
-74
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,6 @@ ENV/
9292
*.swp
9393
*.swo
9494
*~
95+
96+
# keys for https
97+
support_files/*

labman/db/process.py

Lines changed: 153 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3233,23 +3233,66 @@ def generate_prep_information(self):
32333233
('r', 'water_lot_id', 'water_lot'),
32343234
]
32353235
sql = """
3236-
SELECT study_id, sample_id, content, run_name, experiment,
3237-
fwd_cycles, rev_cycles, principal_investigator,
3238-
et.description as sequencer_description,
3239-
lpp.epmotion_robot_id as lepmotion_robot_id,
3240-
epmotion_tm300_8_tool_id, epmotion_tm50_8_tool_id,
3241-
master_mix_id, water_lot_id,
3242-
gep.epmotion_robot_id as gepmotion_robot_id,
3243-
epmotion_tool_id, kingfisher_robot_id,
3244-
extraction_kit_id,
3245-
p1.external_id as plate, w1.row_num as row_num,
3246-
w1.col_num as col_num,
3247-
p2.external_id as primer_composition,
3248-
psc.barcode_seq as primer_set_composition,
3249-
run_name as run_prefix, sp.sequencer_id as platform_id,
3250-
sp.experiment as center_project_name
3236+
SELECT study_id, sample_id,
3237+
-- BARCODE
3238+
psc.barcode_seq AS barcode,
3239+
-- primer_set_id links to marker_gene_primer_set_id,
3240+
-- where we can get the linker/primer
3241+
primer_set_id,
3242+
-- primer_plate
3243+
p2.external_id AS primer_plate,
3244+
-- the well_id is a combination of the
3245+
--- row/col_num + content
3246+
w1.row_num AS row_num, w1.col_num AS col_num, content,
3247+
-- where to get the platting name
3248+
process.run_personnel_id AS plating,
3249+
-- extractionkit_lot
3250+
extraction_kit_id,
3251+
-- extraction_robot, both the epmotion robot and the
3252+
-- kingfisher robot
3253+
lpp.epmotion_robot_id AS lepmotion_robot_id,
3254+
kingfisher_robot_id,
3255+
-- TM1000_8_tool
3256+
epmotion_tool_id,
3257+
-- mastermix_lot
3258+
master_mix_id,
3259+
-- water_lot
3260+
water_lot_id,
3261+
-- processing_robot
3262+
gep.epmotion_robot_id AS gepmotion_robot_id,
3263+
-- TM300_8_tool
3264+
epmotion_tm300_8_tool_id,
3265+
-- TM50_8_tool
3266+
epmotion_tm50_8_tool_id,
3267+
-- sample_plate
3268+
p1.external_id AS sample_plate,
3269+
-- project_name
3270+
study_alias AS project_name,
3271+
-- orig_name
3272+
sample_id AS orig_name,
3273+
-- experiment_design_description
3274+
study_description AS experiment_design_description,
3275+
-- run_center
3276+
'UCSDMI' AS run_center,
3277+
-- primer_date
3278+
run_date AS primer_date,
3279+
-- run_date
3280+
'' AS run_date,
3281+
-- RUN_PREFIX
3282+
'' AS run_prefix,
3283+
-- sequencing_meth
3284+
'Sequencing by synthesis' AS sequencing_meth,
3285+
-- center_name
3286+
'UCSDMI' AS center_name,
3287+
-- center_project_name
3288+
'' AS center_project_name,
3289+
-- instrument_model
3290+
et.description AS instrument_model,
3291+
-- runid
3292+
'' AS runid
32513293
-- Retrieve sequencing information
32523294
FROM labman.sequencing_process sp
3295+
LEFT JOIN labman.process process USING (process_id)
32533296
LEFT JOIN labman.equipment e ON (
32543297
sequencer_id = equipment_id)
32553298
LEFT JOIN labman.equipment_type et ON (
@@ -3300,8 +3343,8 @@ def generate_prep_information(self):
33003343
pc.primer_set_composition_id =
33013344
psc.primer_set_composition_id)
33023345
FULL JOIN qiita.study_sample USING (sample_id)
3303-
WHERE sequencing_process_id = %s
3304-
ORDER BY study_id, sample_id, row_num, col_num"""
3346+
LEFT JOIN qiita.study USING (study_id)
3347+
WHERE sequencing_process_id = %s"""
33053348
elif assay == self._metagenomics_assay_type:
33063349
extra_fields = [
33073350
('e', 'gepmotion_robot_id', 'gdata_robot'),
@@ -3380,32 +3423,32 @@ def generate_prep_information(self):
33803423
w1.plate_id = p1.plate_id)
33813424
FULL JOIN qiita.study_sample USING (sample_id)
33823425
WHERE sequencing_process_id = %s
3383-
ORDER BY study_id, sample_id, row_num, col_num, i5.barcode_seq
33843426
"""
33853427

33863428
with sql_connection.TRN as TRN:
3387-
# to simplify the main queries, let's get all the equipment info
3429+
# Let's cache some data to avoid querying the DB multiple times
3430+
# 1/3. equipment
33883431
TRN.add("""SELECT equipment_id, external_id, notes, description
33893432
FROM labman.equipment
33903433
LEFT JOIN labman.equipment_type
33913434
USING (equipment_type_id)""")
3392-
equipment = {}
3393-
for row in TRN.execute_fetchindex():
3394-
row = dict(row)
3395-
eid = row.pop('equipment_id')
3396-
equipment[eid] = row
3397-
3398-
# and the reagents
3435+
equipment = {dict(row)['equipment_id']: dict(row)
3436+
for row in TRN.execute_fetchindex()}
3437+
# 2/3. reagents
33993438
TRN.add("""SELECT reagent_composition_id, composition_id,
34003439
external_lot_id, description
34013440
FROM labman.reagent_composition
34023441
LEFT JOIN labman.reagent_composition_type
34033442
USING (reagent_composition_type_id)""")
3404-
reagent = {}
3405-
for row in TRN.execute_fetchindex():
3406-
row = dict(row)
3407-
rid = row.pop('reagent_composition_id')
3408-
reagent[rid] = row
3443+
reagent = {dict(row)['reagent_composition_id']: dict(row)
3444+
for row in TRN.execute_fetchindex()}
3445+
# 3/3. marker gene primer sets
3446+
TRN.add("""SELECT marker_gene_primer_set_id, primer_set_id,
3447+
target_gene, target_subfragment, linker_sequence,
3448+
fwd_primer_sequence, rev_primer_sequence, region
3449+
FROM labman.marker_gene_primer_set""")
3450+
marker_gene_primer_set = {dict(row)['primer_set_id']: dict(row)
3451+
for row in TRN.execute_fetchindex()}
34093452

34103453
TRN.add(sql, [self.id])
34113454
for result in TRN.execute_fetchindex():
@@ -3415,13 +3458,13 @@ def generate_prep_information(self):
34153458
content = result.pop('content')
34163459

34173460
# format well
3418-
col = result.pop('col_num')
3419-
row = result.pop('row_num')
3461+
col = result['col_num']
3462+
row = result['row_num']
34203463
well = []
34213464
while row:
34223465
row, rem = divmod(row-1, 26)
34233466
well[:0] = container_module.LETTERS[rem]
3424-
result['well'] = ''.join(well) + str(col)
3467+
result['well_id'] = ''.join(well) + str(col)
34253468

34263469
# format extra fields list
34273470
for t, k, nk in extra_fields:
@@ -3436,13 +3479,34 @@ def generate_prep_information(self):
34363479
result[nk] = val
34373480

34383481
# format some final fields
3439-
result['platform'] = equipment[
3440-
result.pop('platform_id')]['description']
3482+
result['platform'] = 'Illumina'
3483+
result['instrument_model'] = ''
3484+
if assay == self._amplicon_assay_type:
3485+
result['extraction_robot'] = '%s_%s' % (
3486+
result.pop('epmotion_robot'),
3487+
result.pop('kingfisher_robot'))
3488+
result['primer_plate'] = result[
3489+
'primer_plate'].split(' ')[-1]
3490+
mgps = marker_gene_primer_set[result.pop('primer_set_id')]
3491+
result['PRIMER'] = '%s%s' % (
3492+
mgps['linker_sequence'], mgps['fwd_primer_sequence'])
3493+
result['pcr_primers'] = 'FWD:%s; REV:%s' % (
3494+
mgps['fwd_primer_sequence'],
3495+
mgps['rev_primer_sequence'])
3496+
result['linker'] = mgps['linker_sequence']
3497+
result['target_gene'] = mgps['target_gene']
3498+
result['target_subfragment'] = mgps['target_subfragment']
3499+
result['library_construction_protocol'] = (
3500+
'Illumina EMP protocol {0} amplification of {1}'
3501+
' {2}'.format(mgps['region'], mgps['target_gene'],
3502+
mgps['target_subfragment']))
34413503

34423504
if sid is not None and study_id is not None:
34433505
study = Study(study_id)
34443506
if study not in data:
34453507
data[study] = {}
3508+
# if we want the sample_name.well_id, just replace sid
3509+
# for content
34463510
data[study][content] = result
34473511

34483512
if assay == self._metagenomics_assay_type:
@@ -3452,16 +3516,66 @@ def generate_prep_information(self):
34523516
if assay == self._metagenomics_assay_type:
34533517
result['run_prefix'] = \
34543518
SequencingProcess._bcl_scrub_name(content)
3519+
34553520
blanks[content] = result
34563521

34573522
# converting from dict to pandas and then to tsv
34583523
for study, vals in data.items():
34593524
merged = {**vals, **blanks}
34603525
df = pd.DataFrame.from_dict(merged, orient='index')
3461-
df.sort_index(inplace=True)
3462-
cols = sorted(list(df.columns))
3526+
# the index/sample_name should be the original name if
3527+
# it's not duplicated or None (blanks/spikes)
3528+
dup_names = df[df.orig_name.duplicated()].orig_name.unique()
3529+
df.index = [v if v and v not in dup_names else k
3530+
for k, v in df.orig_name.iteritems()]
3531+
df['well_description'] = ['%s_%s_%s' % (
3532+
x.sample_plate, i, x.well_id) for i, x in df.iterrows()]
3533+
3534+
# the following lines apply for assay == self._amplicon_assay_type
3535+
# when we add shotgun (ToDo: #327), we'll need to modify
3536+
# 1/3. renaming columns so they match expected casing
3537+
mv = {
3538+
'barcode': 'BARCODE', 'master_mix': 'MasterMix_lot',
3539+
'platform': 'PLATFORM', 'sample_plate': 'Sample_Plate',
3540+
'run_prefix': 'RUN_PREFIX', 'primer_date': 'Primer_date',
3541+
'extraction_robot': 'Extraction_robot',
3542+
'runid': 'RUNID', 'epmotion_tm50_8_tool': 'TM50_8_tool',
3543+
'library_construction_protocol':
3544+
'LIBRARY_CONSTRUCTION_PROTOCOL',
3545+
'plating': 'Plating', 'linker': 'LINKER',
3546+
'project_name': 'Project_name', 'orig_name': 'Orig_name',
3547+
'well_id': 'Well_ID', 'water_lot': 'Water_Lot',
3548+
'well_description': 'Well_description',
3549+
'run_center': 'RUN_CENTER',
3550+
'epmotion_tool': 'TM1000_8_tool',
3551+
'extraction_kit': 'ExtractionKit_lot',
3552+
'primer_plate': 'Primer_Plate', 'run_date': 'RUN_DATE',
3553+
'gdata_robot': 'Processing_robot',
3554+
'epmotion_tm300_8_tool': 'TM300_8_tool',
3555+
'instrument_model': 'INSTRUMENT_MODEL',
3556+
'experiment_design_description':
3557+
'EXPERIMENT_DESIGN_DESCRIPTION'
3558+
}
3559+
df.rename(index=str, columns=mv, inplace=True)
3560+
# 2/3. sorting rows
3561+
rows_order = ['Sample_Plate', 'row_num', 'col_num']
3562+
df.sort_values(by=rows_order, inplace=True)
3563+
# 3/3. sorting and keeping only required columns
3564+
order = [
3565+
'BARCODE', 'PRIMER', 'Primer_Plate', 'Well_ID', 'Plating',
3566+
'ExtractionKit_lot', 'Extraction_robot', 'TM1000_8_tool',
3567+
'Primer_date', 'MasterMix_lot', 'Water_Lot',
3568+
'Processing_robot', 'TM300_8_tool', 'TM50_8_tool',
3569+
'Sample_Plate', 'Project_name', 'Orig_name',
3570+
'Well_description', 'EXPERIMENT_DESIGN_DESCRIPTION',
3571+
'LIBRARY_CONSTRUCTION_PROTOCOL', 'LINKER', 'PLATFORM',
3572+
'RUN_CENTER', 'RUN_DATE', 'RUN_PREFIX', 'pcr_primers',
3573+
'sequencing_meth', 'target_gene', 'target_subfragment',
3574+
'center_name', 'center_project_name', 'INSTRUMENT_MODEL',
3575+
'RUNID']
3576+
df = df[order]
34633577
sio = StringIO()
3464-
df[cols].to_csv(sio, sep='\t', index_label='sample_name')
3578+
df.to_csv(sio, sep='\t', index_label='sample_name')
34653579
data[study] = sio.getvalue()
34663580

34673581
return data

labman/db/tests/test_plate.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
# ----------------------------------------------------------------------------
88

99
from unittest import main
10-
from datetime import datetime
1110
from types import GeneratorType
1211

1312
from labman.db.testing import LabmanTestCase
@@ -376,12 +375,12 @@ def test_properties(self):
376375
# orders multiple processes in order from oldest to newest
377376
tester2 = Plate(26)
378377
self.assertEqual(len(tester2.quantification_processes), 2)
379-
self.assertEqual(tester2.quantification_processes[0].date,
380-
datetime.strptime("2017-10-25 19:10:25",
381-
'%Y-%m-%d %H:%M:%S'))
382-
self.assertEqual(tester2.quantification_processes[1].date,
383-
datetime.strptime("2017-10-26 03:10:25",
384-
'%Y-%m-%d %H:%M:%S'))
378+
# we are going to test the dates as strings because in the database we
379+
# have the full date (including seconds)
380+
obs_date = str(tester2.quantification_processes[0].date)
381+
self.assertEqual(obs_date, "2017-10-25 19:10:25")
382+
obs_date = str(tester2.quantification_processes[1].date)
383+
self.assertEqual(obs_date, "2017-10-26 03:10:25")
385384

386385
def test_get_well(self):
387386
# Plate 21 - Defined in the test DB

0 commit comments

Comments
 (0)