Skip to content

Commit ed57160

Browse files
committed
Merge pull request #125 from josenavas/data_issue
Data issue
2 parents fb65166 + be3eac8 commit ed57160

File tree

8 files changed

+2649
-2487
lines changed

8 files changed

+2649
-2487
lines changed

qiita_db/data.py

Lines changed: 66 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,6 @@
3535
>>> print rd.id # doctest: +SKIP
3636
2
3737
38-
Retrieve if the raw data files have been submitted to insdc
39-
40-
>>> rd.is_submitted_to_insdc() # doctest: +SKIP
41-
False
42-
4338
Retrieve the filepaths associated with the raw data
4439
4540
>>> rd.get_filepaths() # doctest: +SKIP
@@ -91,7 +86,7 @@
9186
from .base import QiitaObject
9287
from .sql_connection import SQLConnectionHandler
9388
from .util import (exists_dynamic_table, get_db_files_base_dir,
94-
compute_checksum, insert_filepaths)
89+
insert_filepaths)
9590

9691

9792
class BaseData(QiitaObject):
@@ -195,7 +190,6 @@ class RawData(BaseData):
195190
Methods
196191
-------
197192
create
198-
is_submitted_to_insdc
199193
200194
See Also
201195
--------
@@ -210,7 +204,7 @@ class RawData(BaseData):
210204
_study_raw_table = "study_raw_data"
211205

212206
@classmethod
213-
def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
207+
def create(cls, filetype, filepaths, studies):
214208
r"""Creates a new object with a new id on the storage system
215209
216210
Parameters
@@ -221,8 +215,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
221215
The list of paths to the raw files and its filepath type identifier
222216
studies : list of Study
223217
The list of Study objects to which the raw data belongs
224-
submitted_to_insdc : bool
225-
If true, the raw data files have been submitted to insdc
226218
227219
Returns
228220
-------
@@ -231,10 +223,9 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
231223
# Add the raw data to the database, and get the raw data id back
232224
conn_handler = SQLConnectionHandler()
233225
rd_id = conn_handler.execute_fetchone(
234-
"INSERT INTO qiita.{0} (filetype_id, submitted_to_insdc) VALUES "
235-
"(%(type_id)s, %(insdc)s) RETURNING "
236-
"raw_data_id".format(cls._table), {'type_id': filetype,
237-
'insdc': submitted_to_insdc})[0]
226+
"INSERT INTO qiita.{0} (filetype_id) VALUES (%s) RETURNING "
227+
"raw_data_id".format(cls._table),
228+
(filetype, ))[0]
238229
rd = cls(rd_id)
239230

240231
# Connect the raw data with its studies
@@ -247,19 +238,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
247238

248239
return rd
249240

250-
def is_submitted_to_insdc(self):
251-
r"""Tells if the raw data has been submitted to insdc
252-
253-
Returns
254-
-------
255-
bool
256-
True if the raw data have been submitted to insdc. False otherwise
257-
"""
258-
conn_handler = SQLConnectionHandler()
259-
return conn_handler.execute_fetchone(
260-
"SELECT submitted_to_insdc FROM qiita.{0} "
261-
"WHERE raw_data_id=%s".format(self._table), [self.id])[0]
262-
263241
@property
264242
def studies(self):
265243
r"""The list of study ids to which the raw data belongs
@@ -287,6 +265,7 @@ class PreprocessedData(BaseData):
287265
Methods
288266
-------
289267
create
268+
is_submitted_to_insdc
290269
291270
See Also
292271
--------
@@ -297,16 +276,15 @@ class PreprocessedData(BaseData):
297276
_data_filepath_table = "preprocessed_filepath"
298277
_data_filepath_column = "preprocessed_data_id"
299278
_study_preprocessed_table = "study_preprocessed_data"
279+
_raw_preprocessed_table = "raw_preprocessed_data"
300280

301281
@classmethod
302-
def create(cls, raw_data, study, preprocessed_params_table,
303-
preprocessed_params_id, filepaths):
282+
def create(cls, study, preprocessed_params_table, preprocessed_params_id,
283+
filepaths, raw_data=None, submitted_to_insdc=False):
304284
r"""Creates a new object with a new id on the storage system
305285
306286
Parameters
307287
----------
308-
raw_data : RawData
309-
The RawData object used as base to this preprocessed data
310288
study : Study
311289
The study to which this preprocessed data belongs
312290
preprocessed_params_table : str
@@ -317,6 +295,10 @@ def create(cls, raw_data, study, preprocessed_params_table,
317295
filepaths : iterable of tuples (str, int)
318296
The list of paths to the preprocessed files and its filepath type
319297
identifier
298+
submitted_to_insdc : bool, optional
299+
If true, the preprocessed data have been submitted to insdc
300+
raw_data : RawData, optional
301+
The RawData object used as base to this preprocessed data
320302
321303
Raises
322304
------
@@ -333,11 +315,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
333315
# Add the preprocessed data to the database,
334316
# and get the preprocessed data id back
335317
ppd_id = conn_handler.execute_fetchone(
336-
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_params_table, "
337-
"preprocessed_params_id) VALUES (%(raw_id)s, %(param_table)s, "
338-
"%(param_id)s) RETURNING preprocessed_data_id".format(cls._table),
339-
{'raw_id': raw_data.id, 'param_table': preprocessed_params_table,
340-
'param_id': preprocessed_params_id})[0]
318+
"INSERT INTO qiita.{0} (preprocessed_params_table, "
319+
"preprocessed_params_id, submitted_to_insdc) VALUES "
320+
"(%(param_table)s, %(param_id)s, %(insdc)s) "
321+
"RETURNING preprocessed_data_id".format(cls._table),
322+
{'param_table': preprocessed_params_table,
323+
'param_id': preprocessed_params_id,
324+
'insdc': submitted_to_insdc})[0]
341325
ppd = cls(ppd_id)
342326

343327
# Connect the preprocessed data with its study
@@ -346,6 +330,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
346330
"VALUES (%s, %s)".format(ppd._study_preprocessed_table),
347331
(study.id, ppd.id))
348332

333+
if raw_data is not None:
334+
# Connect the preprocessed data with the raw data
335+
conn_handler.execute(
336+
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_data_id) "
337+
"VALUES (%s, %s)".format(cls._raw_preprocessed_table),
338+
(raw_data.id, ppd_id))
339+
349340
ppd._add_filepaths(filepaths, conn_handler)
350341
return ppd
351342

@@ -355,7 +346,7 @@ def raw_data(self):
355346
conn_handler = SQLConnectionHandler()
356347
return conn_handler.execute_fetchone(
357348
"SELECT raw_data_id FROM qiita.{0} WHERE "
358-
"preprocessed_data_id=%s".format(self._table),
349+
"preprocessed_data_id=%s".format(self._raw_preprocessed_table),
359350
[self._id])[0]
360351

361352
@property
@@ -372,6 +363,19 @@ def study(self):
372363
"preprocessed_data_id=%s".format(self._study_preprocessed_table),
373364
[self._id])[0]
374365

366+
def is_submitted_to_insdc(self):
367+
r"""Tells if the preprocessed data has been submitted to insdc
368+
369+
Returns
370+
-------
371+
bool
372+
True if the preprocessed data were submitted to insdc. False otherwise
373+
"""
374+
conn_handler = SQLConnectionHandler()
375+
return conn_handler.execute_fetchone(
376+
"SELECT submitted_to_insdc FROM qiita.{0} "
377+
"WHERE preprocessed_data_id=%s".format(self._table), (self.id,))[0]
378+
375379

376380
class ProcessedData(BaseData):
377381
r"""Object for dealing with processed data
@@ -392,15 +396,14 @@ class ProcessedData(BaseData):
392396
_table = "processed_data"
393397
_data_filepath_table = "processed_filepath"
394398
_data_filepath_column = "processed_data_id"
399+
_preprocessed_processed_table = "preprocessed_processed_data"
395400

396401
@classmethod
397-
def create(cls, preprocessed_data, processed_params_table,
398-
processed_params_id, filepaths, processed_date=None):
402+
def create(cls, processed_params_table, processed_params_id, filepaths,
403+
preprocessed_data=None, processed_date=None):
399404
r"""
400405
Parameters
401406
----------
402-
preprocessed_data : PreprocessedData
403-
The PreprocessedData object used as base to this processed data
404407
processed_params_table : str
405408
Name of the table that holds the processing parameters used
406409
processed_params_id : int
@@ -409,6 +412,8 @@ def create(cls, preprocessed_data, processed_params_table,
409412
filepaths : iterable of tuples (str, int)
410413
The list of paths to the processed files and its filepath type
411414
identifier
415+
preprocessed_data : PreprocessedData, optional
416+
The PreprocessedData object used as base to this processed data
412417
processed_date : datetime, optional
413418
Date in which the data have been processed. Default: now
414419
@@ -432,16 +437,22 @@ def create(cls, preprocessed_data, processed_params_table,
432437
# Add the processed data to the database,
433438
# and get the processed data id back
434439
pd_id = conn_handler.execute_fetchone(
435-
"INSERT INTO qiita.{0} (preprocessed_data_id, "
436-
"processed_params_table, processed_params_id, processed_date) "
437-
"VALUES (%(prep_data_id)s, %(param_table)s, %(param_id)s, "
438-
"%(date)s) RETURNING processed_data_id".format(cls._table),
439-
{'prep_data_id': preprocessed_data.id,
440-
'param_table': processed_params_table,
440+
"INSERT INTO qiita.{0} (processed_params_table, "
441+
"processed_params_id, processed_date) VALUES (%(param_table)s, "
442+
"%(param_id)s, %(date)s) RETURNING "
443+
"processed_data_id".format(cls._table),
444+
{'param_table': processed_params_table,
441445
'param_id': processed_params_id,
442446
'date': processed_date})[0]
443447

444448
pd = cls(pd_id)
449+
450+
if preprocessed_data is not None:
451+
conn_handler.execute(
452+
"INSERT INTO qiita.{0} (preprocessed_data_id, "
453+
"processed_data_id) VALUES "
454+
"(%s, %s)".format(cls._preprocessed_processed_table),
455+
(preprocessed_data.id, pd_id))
445456
pd._add_filepaths(filepaths, conn_handler)
446457
return cls(pd_id)
447458

@@ -451,18 +462,18 @@ def preprocessed_data(self):
451462
conn_handler = SQLConnectionHandler()
452463
return conn_handler.execute_fetchone(
453464
"SELECT preprocessed_data_id FROM qiita.{0} WHERE "
454-
"processed_data_id=%s".format(self._table),
465+
"processed_data_id=%s".format(self._preprocessed_processed_table),
455466
[self._id])[0]
456467

457468
@property
458469
def data_type(self):
459470
r"""The data_type of the data used"""
460471
conn_handler = SQLConnectionHandler()
461-
sql = ("SELECT DISTINCT DT.data_type FROM qiita.processed_data PD "
462-
"JOIN qiita.preprocessed_data PPD on PD.preprocessed_data_id "
463-
"= PPD.preprocessed_data_id JOIN qiita.raw_data RD on "
464-
"PPD.raw_data_id = RD.raw_data_id "
465-
"JOIN qiita.common_prep_info CPI ON RD.raw_data_id = "
466-
"CPI.raw_data_id JOIN qiita.data_type DT ON CPI.data_type_id = "
467-
"DT.data_type_id WHERE PD.processed_data_id = %s")
472+
sql = ("SELECT DISTINCT DT.data_type FROM "
473+
"qiita.preprocessed_processed_data PPD JOIN "
474+
"qiita.raw_preprocessed_data RPD on PPD.preprocessed_data_id = "
475+
"RPD.preprocessed_data_id JOIN qiita.common_prep_info CPI ON "
476+
"RPD.raw_data_id = CPI.raw_data_id JOIN qiita.data_type DT ON "
477+
"CPI.data_type_id = DT.data_type_id WHERE "
478+
"PPD.processed_data_id = %s")
468479
return conn_handler.execute_fetchone(sql, [self._id])[0]

qiita_db/study.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,8 @@
102102

103103
from qiita_core.exceptions import IncompetentQiitaDeveloperError
104104
from .base import QiitaStatusObject, QiitaObject
105-
from .exceptions import (QiitaDBDuplicateError, QiitaDBStatusError,
106-
QiitaDBColumnError)
107-
from .util import check_required_columns, check_table_cols, convert_to_id
105+
from .exceptions import (QiitaDBStatusError, QiitaDBColumnError)
106+
from .util import check_required_columns, check_table_cols
108107
from .sql_connection import SQLConnectionHandler
109108

110109

@@ -475,7 +474,8 @@ def processed_data(self):
475474
list of ProcessedData ids
476475
"""
477476
conn_handler = SQLConnectionHandler()
478-
sql = ("SELECT processed_data_id FROM qiita.processed_data WHERE "
477+
sql = ("SELECT processed_data_id FROM "
478+
"qiita.preprocessed_processed_data WHERE "
479479
"preprocessed_data_id IN (SELECT preprocessed_data_id FROM "
480480
"qiita.study_preprocessed_data where study_id = %s)")
481481
return [x[0] for x in conn_handler.execute_fetchall(sql, (self._id,))]

qiita_db/support_files/populate_test_db.sql

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ INSERT INTO qiita.study_experimental_factor (study_id, efo_id) VALUES (1, 1);
5353
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id) VALUES ('1_s_G1_L001_sequences.fastq.gz', 1, '852952723', 1), ('1_s_G1_L001_sequences_barcodes.fastq.gz', 2, '852952723', 1), ('2_sequences.fastq.gz', 1, '852952723', 1), ('2_sequences_barcodes.fastq.gz', 2, '852952723', 1);
5454

5555
-- Insert the raw data information for study 1
56-
INSERT INTO qiita.raw_data (filetype_id, submitted_to_insdc) VALUES (2, FALSE), (2, TRUE);
56+
INSERT INTO qiita.raw_data (filetype_id) VALUES (2), (2);
5757

5858
-- Insert (link) the raw data with the raw filepaths
5959
INSERT INTO qiita.raw_filepath (raw_data_id, filepath_id) VALUES (1, 1), (1, 2), (2, 3), (2, 4);
@@ -284,7 +284,10 @@ INSERT INTO qiita.prep_1 (sample_id, BarcodeSequence, LIBRARY_CONSTRUCTION_PROTO
284284
('SKM9.640192', 'AGCAGGCACGAA', 'This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.', 'GTGCCAGCMGCCGCGGTAA', 'V4', '16S rRNA', 'ANL', 's_G1_L001_sequences', '8/1/12', 'ANL', 'micro biome of soil and rhizosphere of cannabis plants from CA', 'Cannabis Soil Microbiome', 'Illumina', '.25,g', 'Sequencing by synthesis', 'MiSeq', 'ANL', 'FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 'CCME');
285285

286286
-- Insert preprocessed information for raw data 1
287-
INSERT INTO qiita.preprocessed_data (raw_data_id, preprocessed_params_table, preprocessed_params_id) VALUES (1, 'preprocessed_sequence_illumina_params', 1), (1, 'preprocessed_sequence_illumina_params', 2);
287+
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE), ('preprocessed_sequence_illumina_params', 2, FALSE);
288+
289+
-- Link the new preprocessed data with the raw data
290+
INSERT INTO qiita.raw_preprocessed_data (raw_data_id, preprocessed_data_id) VALUES (1, 1), (1, 2);
288291

289292
-- Insert (link) preprocessed information to study 1
290293
INSERT INTO qiita.study_preprocessed_data (preprocessed_data_id, study_id) VALUES (1, 1), (2, 1);
@@ -299,7 +302,10 @@ INSERT INTO qiita.preprocessed_filepath (preprocessed_data_id, filepath_id) VALU
299302
INSERT INTO qiita.preprocessed_sequence_illumina_params (trim_length) VALUES (151), (100);
300303

301304
-- Insert processed information for study 1 and processed data 1
302-
INSERT INTO qiita.processed_data (preprocessed_data_id, processed_params_table, processed_params_id, processed_date) VALUES (1, 'processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
305+
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
306+
307+
-- Link the processed data with the preprocessed data
308+
INSERT INTO qiita.preprocessed_processed_data (preprocessed_data_id, processed_data_id) VALUES (1, 1);
303309

304310
-- Populate the reference table
305311
INSERT INTO qiita.reference (reference_name, reference_version, sequence_filepath, taxonomy_filepath, tree_filepath) VALUES ('GreenGenes', '4feb2011', 'gg_97_otus_4feb2011.fasta', 'greengenes_tax.txt', 'gg_97_otus_4feb2011.tre');

0 commit comments

Comments
 (0)