@@ -8,43 +8,31 @@

 from __future__ import division
 from future.builtins import zip
-from future.utils import viewitems, PY3
 from copy import deepcopy
 from os.path import join
 from time import strftime
-from functools import partial
-from os.path import basename
-from future.utils.six import StringIO

-import pandas as pd
-import numpy as np
-import warnings
 from skbio.util import find_duplicates
-from skbio.io.util import open_file

 from qiita_core.exceptions import IncompetentQiitaDeveloperError
-from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBColumnError,
-                                 QiitaDBUnknownIDError, QiitaDBNotImplementedError,
-                                 QiitaDBDuplicateHeaderError, QiitaDBError,
-                                 QiitaDBWarning, QiitaDBExecutionError)
-from qiita_db.base import QiitaObject
+from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
+                                 QiitaDBDuplicateHeaderError, QiitaDBError,
+                                 QiitaDBExecutionError)
 from qiita_db.sql_connection import SQLConnectionHandler
 from qiita_db.ontology import Ontology
-from qiita_db.util import (exists_table, get_table_cols,
-                           convert_to_id,
-                           convert_from_id, get_mountpoint, insert_filepaths,
-                           scrub_data, infer_status)
-from qiita_db.study import Study
-from qiita_db.data import RawData
-from qiita_db.logger import LogEntry
+from qiita_db.util import (convert_to_id, convert_from_id, get_mountpoint,
+                           infer_status)
 from .base_metadata_template import BaseSample, MetadataTemplate
 from .util import (as_python_types, get_invalid_sample_names, get_datatypes,
-                   prefix_sample_names_with_id)
+                   prefix_sample_names_with_id, load_template_to_dataframe)

-if PY3:
-    from string import ascii_letters as letters, digits
-else:
-    from string import letters, digits
+
+TARGET_GENE_DATA_TYPES = ['16S', '18S', 'ITS']
+REQUIRED_TARGET_GENE_COLS = {'barcodesequence', 'linkerprimersequence',
+                             'run_prefix', 'library_construction_protocol',
+                             'experiment_design_description', 'platform'}
+RENAME_COLS_DICT = {'barcode': 'barcodesequence',
+                    'primer': 'linkerprimersequence'}


 class PrepSample(BaseSample):
@@ -163,98 +151,73 @@ def create(cls, md_template, raw_data, study, data_type,

         # We need to check for some special columns, that are not present on
         # the database, but depending on the data type are required.
-        missing = cls._check_special_columns(md_template, data_type_str)
+        missing = cls._check_template_special_columns(md_template,
+                                                      data_type_str)
+        if missing:
+            raise QiitaDBColumnError("Missing columns: %s"
+                                     % ', '.join(missing))

         # Get some useful information from the metadata template
         sample_ids = md_template.index.tolist()
-        num_samples = len(sample_ids)
-
-        # Get the required columns from the DB
-        db_cols = get_table_cols(cls._table, conn_handler)
-
-        # Remove the sample_id and study_id columns
-        db_cols.remove('sample_id')
-        db_cols.remove(cls._id_column)
-
-        # Retrieve the headers of the metadata template
         headers = list(md_template.keys())

-        # Check that md_template has the required columns
-        remaining = set(db_cols).difference(headers)
-        missing = missing.union(remaining)
-        missing = missing.difference(cls.translate_cols_dict)
-        if missing:
-            raise QiitaDBColumnError("Missing columns: %s"
-                                     % ', '.join(missing))
-
         # Insert the metadata template
         # We need the prep_id for multiple calls below, which currently is not
         # supported by the queue system. Thus, executing this outside the queue
+        sql = """INSERT INTO qiita.prep_template
+                    (data_type_id, raw_data_id, investigation_type)
+                 VALUES (%s, %s, %s)
+                 RETURNING prep_template_id"""
         prep_id = conn_handler.execute_fetchone(
-            "INSERT INTO qiita.prep_template (data_type_id, raw_data_id, "
-            "investigation_type) VALUES (%s, %s, %s) RETURNING "
-            "prep_template_id", (data_type_id, raw_data.id,
-                                 investigation_type))[0]
+            sql, (data_type_id, raw_data.id, investigation_type))[0]

         # Insert values on required columns
-        values = _as_python_types(md_template, db_cols)
-        values.insert(0, sample_ids)
-        values.insert(0, [prep_id] * num_samples)
-        values = [v for v in zip(*values)]
-        conn_handler.add_to_queue(
-            queue_name,
-            "INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
-            "VALUES (%s, %s, {3})".format(
-                cls._table, cls._id_column, ', '.join(db_cols),
-                ', '.join(['%s'] * len(db_cols))),
-            values, many=True)
+        values = [(prep_id, s_id) for s_id in sample_ids]
+        sql = "INSERT INTO qiita.{0} ({1}, sample_id) VALUES (%s, %s)".format(
+            cls._table, cls._id_column)
+        conn_handler.add_to_queue(queue_name, sql, values, many=True)

         # Insert rows on *_columns table
-        headers = list(set(headers).difference(db_cols))
-        datatypes = _get_datatypes(md_template.ix[:, headers])
+        datatypes = get_datatypes(md_template.ix[:, headers])
         # psycopg2 requires a list of tuples, in which each tuple is a set
         # of values to use in the string formatting of the query. We have all
         # the values in different lists (but in the same order) so use zip
         # to create the list of tuples that psycopg2 requires.
-        values = [
-            v for v in zip([prep_id] * len(headers), headers, datatypes)]
-        conn_handler.add_to_queue(
-            queue_name,
-            "INSERT INTO qiita.{0} ({1}, column_name, column_type) "
-            "VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
-            values, many=True)
+        values = [(prep_id, h, d) for h, d in zip(headers, datatypes)]
+        sql = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
+                 VALUES (%s, %s, %s)""".format(cls._column_table,
+                                               cls._id_column)
+        conn_handler.add_to_queue(queue_name, sql, values, many=True)

         # Create table with custom columns
         table_name = cls._table_name(prep_id)
         column_datatype = ["%s %s" % (col, dtype)
                            for col, dtype in zip(headers, datatypes)]
         conn_handler.add_to_queue(
             queue_name,
-            "CREATE TABLE qiita.{0} (sample_id varchar, "
-            "{1})".format(table_name, ', '.join(column_datatype)))
+            "CREATE TABLE qiita.{0} (sample_id varchar, {1})".format(
+                table_name, ', '.join(column_datatype)))

         # Insert values on custom table
-        values = _as_python_types(md_template, headers)
+        values = as_python_types(md_template, headers)
         values.insert(0, sample_ids)
         values = [v for v in zip(*values)]
-        conn_handler.add_to_queue(
-            queue_name,
-            "INSERT INTO qiita.{0} (sample_id, {1}) "
-            "VALUES (%s, {2})".format(table_name, ", ".join(headers),
-                                      ', '.join(["%s"] * len(headers))),
-            values, many=True)
+        sql = "INSERT INTO qiita.{0} (sample_id, {1}) VALUES (%s, {2})".format(
+            table_name, ", ".join(headers), ', '.join(["%s"] * len(headers)))
+        conn_handler.add_to_queue(queue_name, sql, values, many=True)

         try:
             conn_handler.execute_queue(queue_name)
         except Exception:
             # Clean up row from qiita.prep_template
             conn_handler.execute(
-                "DELETE FROM qiita.prep_template where "
-                "{0} = %s".format(cls._id_column), (prep_id,))
+                "DELETE FROM qiita.prep_template WHERE {0} = %s".format(
+                    cls._id_column),
+                (prep_id,))

             # Check if sample IDs present here but not in sample template
-            sql = ("SELECT sample_id from qiita.required_sample_info WHERE "
-                   "study_id = %s")
+            sql = """SELECT sample_id FROM qiita.required_sample_info
+                     WHERE study_id = %s"""
             # Get list of study sample IDs, prep template study IDs,
             # and their intersection
             prep_samples = set(md_template.index.values)
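
(Aside, not part of the commit: the "list of tuples" shape that the psycopg2 comment in this hunk describes can be sketched in isolation. The column names and prep_id below are hypothetical.)

# Hedged sketch: zip parallel, same-order lists into one tuple per row, the
# parameter-set shape an executemany-style call (add_to_queue with many=True)
# expects. Names and values here are illustrative only.
headers = ['barcodesequence', 'run_prefix']
datatypes = ['varchar', 'varchar']
prep_id = 1

values = [(prep_id, h, d) for h, d in zip(headers, datatypes)]
# values == [(1, 'barcodesequence', 'varchar'), (1, 'run_prefix', 'varchar')]
# Each tuple fills the three %s placeholders of one INSERT execution.
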
@@ -426,11 +389,11 @@ def raw_data(self):
     @property
     def preprocessed_data(self):
         conn_handler = SQLConnectionHandler()
-        prep_datas = conn_handler.execute_fetchall(
+        prep_data = conn_handler.execute_fetchall(
             "SELECT preprocessed_data_id FROM "
             "qiita.prep_template_preprocessed_data WHERE prep_template_id=%s",
             (self.id,))
-        return [x[0] for x in prep_datas]
+        return [x[0] for x in prep_data]

     @property
     def preprocessing_status(self):
@@ -549,9 +512,19 @@ def create_qiime_mapping_file(self, prep_template_fp):
             'description': 'Description',
         }

+        sql = """SELECT filepath_id, filepath
+                 FROM qiita.filepath
+                 JOIN qiita.sample_template_filepath
+                    USING (filepath_id)
+                 WHERE study_id=%s
+                 ORDER BY filepath_id DESC"""
+
         # getting the latest sample template
-        _, sample_template_fp = SampleTemplate(
-            self.study_id).get_filepaths()[0]
+        conn_handler = SQLConnectionHandler()
+        sample_template_fname = conn_handler.execute_fetchall(
+            sql, (self.study_id,))[0][1]
+        _, fp = get_mountpoint('templates')[0]
+        sample_template_fp = join(fp, sample_template_fname)

         # reading files via pandas
         st = load_template_to_dataframe(sample_template_fp)
@@ -562,8 +535,9 @@ def create_qiime_mapping_file(self, prep_template_fp):
         if not pt_sample_names.issubset(st_sample_names):
             raise ValueError(
                 "Prep template is not a sub set of the sample template, files:"
-                "%s %s - samples: %s" % (sample_template_fp, prep_template_fp,
-                                         str(pt_sample_names - st_sample_names)))
+                "%s %s - samples: %s"
+                % (sample_template_fp, prep_template_fp,
+                   str(pt_sample_names - st_sample_names)))

         mapping = pt.join(st, lsuffix="_prep")
         mapping.rename(columns=rename_cols, inplace=True)
@@ -580,7 +554,6 @@ def create_qiime_mapping_file(self, prep_template_fp):
         mapping = mapping[new_cols]

         # figuring out the filepath for the QIIME map file
-        _id, fp = get_mountpoint('templates')[0]
         filepath = join(fp, '%d_prep_%d_qiime_%s.txt' % (self.study_id,
                         self.id, strftime("%Y%m%d-%H%M%S")))

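
(Aside, not part of the commit: a minimal, self-contained sketch of the prep/sample-template merge that create_qiime_mapping_file performs above, assuming two pandas DataFrames indexed by sample ID; the column names and sample IDs are hypothetical.)

import pandas as pd

# Hypothetical sample template (st) and prep template (pt), indexed by sample ID.
st = pd.DataFrame({'physical_location': ['freezer A', 'freezer B']},
                  index=['1.S1', '1.S2'])
pt = pd.DataFrame({'barcodesequence': ['ACGT']}, index=['1.S1'])

# The prep template may only reference samples present in the sample template.
pt_sample_names = set(pt.index)
st_sample_names = set(st.index)
if not pt_sample_names.issubset(st_sample_names):
    raise ValueError("Prep template is not a subset of the sample template")

# Join on the shared index; prep columns that collide get a "_prep" suffix.
mapping = pt.join(st, lsuffix="_prep")
print(mapping)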