3 changes: 3 additions & 0 deletions .gitignore
@@ -92,3 +92,6 @@ ENV/
*.swp
*.swo
*~

# keys for https
support_files/*
192 changes: 153 additions & 39 deletions labman/db/process.py
@@ -3233,23 +3233,66 @@ def generate_prep_information(self):
('r', 'water_lot_id', 'water_lot'),
]
sql = """
SELECT study_id, sample_id, content, run_name, experiment,
fwd_cycles, rev_cycles, principal_investigator,
et.description as sequencer_description,
lpp.epmotion_robot_id as lepmotion_robot_id,
epmotion_tm300_8_tool_id, epmotion_tm50_8_tool_id,
master_mix_id, water_lot_id,
gep.epmotion_robot_id as gepmotion_robot_id,
epmotion_tool_id, kingfisher_robot_id,
extraction_kit_id,
p1.external_id as plate, w1.row_num as row_num,
w1.col_num as col_num,
p2.external_id as primer_composition,
psc.barcode_seq as primer_set_composition,
run_name as run_prefix, sp.sequencer_id as platform_id,
sp.experiment as center_project_name
SELECT study_id, sample_id,
-- BARCODE
psc.barcode_seq AS barcode,
-- primer_set_id links to marker_gene_primer_set_id,
-- where we can get the linker/primer
primer_set_id,
-- primer_plate
p2.external_id AS primer_plate,
-- the well_id is a combination of the
-- row/col_num + content
w1.row_num AS row_num, w1.col_num AS col_num, content,
-- where to get the plating name
process.run_personnel_id AS plating,
-- extractionkit_lot
extraction_kit_id,
-- extraction_robot, both the epmotion robot and the
-- kingfisher robot
lpp.epmotion_robot_id AS lepmotion_robot_id,
kingfisher_robot_id,
-- TM1000_8_tool
epmotion_tool_id,
-- mastermix_lot
master_mix_id,
-- water_lot
water_lot_id,
-- processing_robot
gep.epmotion_robot_id AS gepmotion_robot_id,
-- TM300_8_tool
epmotion_tm300_8_tool_id,
-- TM50_8_tool
epmotion_tm50_8_tool_id,
-- sample_plate
p1.external_id AS sample_plate,
-- project_name
study_alias AS project_name,
-- orig_name
sample_id AS orig_name,
-- experiment_design_description
study_description AS experiment_design_description,
-- run_center
'UCSDMI' AS run_center,
-- primer_date
run_date AS primer_date,
-- run_date
'' AS run_date,
-- RUN_PREFIX
'' AS run_prefix,
-- sequencing_meth
'Sequencing by synthesis' AS sequencing_meth,
-- center_name
'UCSDMI' AS center_name,
-- center_project_name
'' AS center_project_name,
-- instrument_model
et.description AS instrument_model,
-- runid
'' AS runid
-- Retrieve sequencing information
FROM labman.sequencing_process sp
LEFT JOIN labman.process process USING (process_id)
LEFT JOIN labman.equipment e ON (
sequencer_id = equipment_id)
LEFT JOIN labman.equipment_type et ON (
@@ -3300,8 +3343,8 @@ def generate_prep_information(self):
pc.primer_set_composition_id =
psc.primer_set_composition_id)
FULL JOIN qiita.study_sample USING (sample_id)
WHERE sequencing_process_id = %s
ORDER BY study_id, sample_id, row_num, col_num"""
LEFT JOIN qiita.study USING (study_id)
WHERE sequencing_process_id = %s"""
elif assay == self._metagenomics_assay_type:
extra_fields = [
('e', 'gepmotion_robot_id', 'gdata_robot'),
@@ -3380,32 +3423,32 @@ def generate_prep_information(self):
w1.plate_id = p1.plate_id)
FULL JOIN qiita.study_sample USING (sample_id)
WHERE sequencing_process_id = %s
ORDER BY study_id, sample_id, row_num, col_num, i5.barcode_seq
"""

with sql_connection.TRN as TRN:
# to simplify the main queries, let's get all the equipment info
# Let's cache some data to avoid querying the DB multiple times
# 1/3. equipment
TRN.add("""SELECT equipment_id, external_id, notes, description
FROM labman.equipment
LEFT JOIN labman.equipment_type
USING (equipment_type_id)""")
equipment = {}
for row in TRN.execute_fetchindex():
row = dict(row)
eid = row.pop('equipment_id')
equipment[eid] = row

# and the reagents
equipment = {dict(row)['equipment_id']: dict(row)
for row in TRN.execute_fetchindex()}
# 2/3. reagents
TRN.add("""SELECT reagent_composition_id, composition_id,
external_lot_id, description
FROM labman.reagent_composition
LEFT JOIN labman.reagent_composition_type
USING (reagent_composition_type_id)""")
reagent = {}
for row in TRN.execute_fetchindex():
row = dict(row)
rid = row.pop('reagent_composition_id')
reagent[rid] = row
reagent = {dict(row)['reagent_composition_id']: dict(row)
for row in TRN.execute_fetchindex()}
# 3/3. marker gene primer sets
TRN.add("""SELECT marker_gene_primer_set_id, primer_set_id,
target_gene, target_subfragment, linker_sequence,
fwd_primer_sequence, rev_primer_sequence, region
FROM labman.marker_gene_primer_set""")
marker_gene_primer_set = {dict(row)['primer_set_id']: dict(row)
for row in TRN.execute_fetchindex()}

TRN.add(sql, [self.id])
for result in TRN.execute_fetchindex():
@@ -3415,13 +3458,13 @@ def generate_prep_information(self):
content = result.pop('content')

# format well
col = result.pop('col_num')
row = result.pop('row_num')
col = result['col_num']
row = result['row_num']
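# convert the 1-based row number into spreadsheet-style letters
# (1 -> A, 26 -> Z, 27 -> AA) and append the column number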
well = []
while row:
row, rem = divmod(row-1, 26)
well[:0] = container_module.LETTERS[rem]
result['well'] = ''.join(well) + str(col)
result['well_id'] = ''.join(well) + str(col)

# format extra fields list
for t, k, nk in extra_fields:
Expand All @@ -3436,13 +3479,34 @@ def generate_prep_information(self):
result[nk] = val

# format some final fields
result['platform'] = equipment[
result.pop('platform_id')]['description']
result['platform'] = 'Illumina'
result['instrument_model'] = ''
if assay == self._amplicon_assay_type:
result['extraction_robot'] = '%s_%s' % (
result.pop('epmotion_robot'),
result.pop('kingfisher_robot'))
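# keep only the trailing token of the primer plate name
# (expected to be the plate number)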
result['primer_plate'] = result[
'primer_plate'].split(' ')[-1]
mgps = marker_gene_primer_set[result.pop('primer_set_id')]
result['PRIMER'] = '%s%s' % (
mgps['linker_sequence'], mgps['fwd_primer_sequence'])
result['pcr_primers'] = 'FWD:%s; REV:%s' % (
mgps['fwd_primer_sequence'],
mgps['rev_primer_sequence'])
result['linker'] = mgps['linker_sequence']
result['target_gene'] = mgps['target_gene']
result['target_subfragment'] = mgps['target_subfragment']
result['library_construction_protocol'] = (
'Illumina EMP protocol {0} amplification of {1}'
' {2}'.format(mgps['region'], mgps['target_gene'],
mgps['target_subfragment']))

if sid is not None and study_id is not None:
study = Study(study_id)
if study not in data:
data[study] = {}
# if we want the sample_name.well_id as the key, just use content
# instead of sid
data[study][content] = result

if assay == self._metagenomics_assay_type:
@@ -3452,16 +3516,66 @@ def generate_prep_information(self):
if assay == self._metagenomics_assay_type:
result['run_prefix'] = \
SequencingProcess._bcl_scrub_name(content)

blanks[content] = result

# converting from dict to pandas and then to tsv
for study, vals in data.items():
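# blanks are not tied to a study, so they are added to every
# study's prep information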
merged = {**vals, **blanks}
df = pd.DataFrame.from_dict(merged, orient='index')
df.sort_index(inplace=True)
cols = sorted(list(df.columns))
# the index/sample_name should be the original name if it's not
# duplicated or None (blanks/spikes)
dup_names = df[df.orig_name.duplicated()].orig_name.unique()
df.index = [v if v and v not in dup_names else k
for k, v in df.orig_name.iteritems()]
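# well_description is <sample_plate>_<sample_name>_<well_id>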
df['well_description'] = ['%s_%s_%s' % (
x.sample_plate, i, x.well_id) for i, x in df.iterrows()]

# the following lines apply for assay == self._amplicon_assay_type
# when we add shotgun (ToDo: #327), we'll need to modify
# 1/3. renaming columns so they match expected casing
mv = {
'barcode': 'BARCODE', 'master_mix': 'MasterMix_lot',
'platform': 'PLATFORM', 'sample_plate': 'Sample_Plate',
'run_prefix': 'RUN_PREFIX', 'primer_date': 'Primer_date',
'extraction_robot': 'Extraction_robot',
'runid': 'RUNID', 'epmotion_tm50_8_tool': 'TM50_8_tool',
'library_construction_protocol':
'LIBRARY_CONSTRUCTION_PROTOCOL',
'plating': 'Plating', 'linker': 'LINKER',
'project_name': 'Project_name', 'orig_name': 'Orig_name',
'well_id': 'Well_ID', 'water_lot': 'Water_Lot',
'well_description': 'Well_description',
'run_center': 'RUN_CENTER',
'epmotion_tool': 'TM1000_8_tool',
'extraction_kit': 'ExtractionKit_lot',
'primer_plate': 'Primer_Plate', 'run_date': 'RUN_DATE',
'gdata_robot': 'Processing_robot',
'epmotion_tm300_8_tool': 'TM300_8_tool',
'instrument_model': 'INSTRUMENT_MODEL',
'experiment_design_description':
'EXPERIMENT_DESIGN_DESCRIPTION'
}
df.rename(index=str, columns=mv, inplace=True)
# 2/3. sorting rows
rows_order = ['Sample_Plate', 'row_num', 'col_num']
df.sort_values(by=rows_order, inplace=True)
# 3/3. sorting and keeping only required columns
order = [
'BARCODE', 'PRIMER', 'Primer_Plate', 'Well_ID', 'Plating',
'ExtractionKit_lot', 'Extraction_robot', 'TM1000_8_tool',
'Primer_date', 'MasterMix_lot', 'Water_Lot',
'Processing_robot', 'TM300_8_tool', 'TM50_8_tool',
'Sample_Plate', 'Project_name', 'Orig_name',
'Well_description', 'EXPERIMENT_DESIGN_DESCRIPTION',
'LIBRARY_CONSTRUCTION_PROTOCOL', 'LINKER', 'PLATFORM',
'RUN_CENTER', 'RUN_DATE', 'RUN_PREFIX', 'pcr_primers',
'sequencing_meth', 'target_gene', 'target_subfragment',
'center_name', 'center_project_name', 'INSTRUMENT_MODEL',
'RUNID']
df = df[order]
sio = StringIO()
df[cols].to_csv(sio, sep='\t', index_label='sample_name')
df.to_csv(sio, sep='\t', index_label='sample_name')
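# replace the per-study dict with the rendered TSV text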
data[study] = sio.getvalue()

return data
13 changes: 6 additions & 7 deletions labman/db/tests/test_plate.py
@@ -7,7 +7,6 @@
# ----------------------------------------------------------------------------

from unittest import main
from datetime import datetime
from types import GeneratorType

from labman.db.testing import LabmanTestCase
@@ -376,12 +375,12 @@ def test_properties(self):
# orders multiple processes in order from oldest to newest
tester2 = Plate(26)
self.assertEqual(len(tester2.quantification_processes), 2)
self.assertEqual(tester2.quantification_processes[0].date,
datetime.strptime("2017-10-25 19:10:25",
'%Y-%m-%d %H:%M:%S'))
self.assertEqual(tester2.quantification_processes[1].date,
datetime.strptime("2017-10-26 03:10:25",
'%Y-%m-%d %H:%M:%S'))
# we are going to test the dates as strings because the database
# stores the full date (including seconds)
obs_date = str(tester2.quantification_processes[0].date)
self.assertEqual(obs_date, "2017-10-25 19:10:25")
obs_date = str(tester2.quantification_processes[1].date)
self.assertEqual(obs_date, "2017-10-26 03:10:25")

def test_get_well(self):
# Plate 21 - Defined in the test DB