Data issue #125

Merged
merged 8 commits on Jun 20, 2014
121 changes: 66 additions & 55 deletions qiita_db/data.py
@@ -35,11 +35,6 @@
>>> print rd.id # doctest: +SKIP
2

Retrieve if the raw data files have been submitted to insdc

>>> rd.is_submitted_to_insdc() # doctest: +SKIP
False

Retrieve the filepaths associated with the raw data

>>> rd.get_filepaths() # doctest: +SKIP
@@ -91,7 +86,7 @@
from .base import QiitaObject
from .sql_connection import SQLConnectionHandler
from .util import (exists_dynamic_table, get_db_files_base_dir,
compute_checksum, insert_filepaths)
insert_filepaths)


class BaseData(QiitaObject):
@@ -195,7 +190,6 @@ class RawData(BaseData):
Methods
-------
create
is_submitted_to_insdc

See Also
--------
@@ -210,7 +204,7 @@
_study_raw_table = "study_raw_data"

@classmethod
def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
def create(cls, filetype, filepaths, studies):
r"""Creates a new object with a new id on the storage system

Parameters
@@ -221,8 +215,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
The list of paths to the raw files and its filepath type identifier
studies : list of Study
The list of Study objects to which the raw data belongs to
submitted_to_insdc : bool
If true, the raw data files have been submitted to insdc

Returns
-------
@@ -231,10 +223,9 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
# Add the raw data to the database, and get the raw data id back
conn_handler = SQLConnectionHandler()
rd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (filetype_id, submitted_to_insdc) VALUES "
"(%(type_id)s, %(insdc)s) RETURNING "
"raw_data_id".format(cls._table), {'type_id': filetype,
'insdc': submitted_to_insdc})[0]
"INSERT INTO qiita.{0} (filetype_id) VALUES (%s) RETURNING "
"raw_data_id".format(cls._table),
(filetype, ))[0]
rd = cls(rd_id)

# Connect the raw data with its studies
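
With this change RawData.create no longer takes a submitted_to_insdc flag; submission status now lives on PreprocessedData (see below). A minimal usage sketch, assuming a populated test database; the filetype id 2, filepath type ids and filenames mirror the test data and are illustrative, not part of this diff:

from qiita_db.data import RawData
from qiita_db.study import Study

# filepaths are (path, filepath_type_id) tuples, per the docstring above
filepaths = [('2_sequences.fastq.gz', 1),
             ('2_sequences_barcodes.fastq.gz', 2)]
rd = RawData.create(2, filepaths, [Study(1)])
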
@@ -247,19 +238,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):

return rd

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc

Returns
-------
bool
True if the raw data have been submitted to insdc. False otherwise
"""
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT submitted_to_insdc FROM qiita.{0} "
"WHERE raw_data_id=%s".format(self._table), [self.id])[0]

@property
def studies(self):
r"""The list of study ids to which the raw data belongs to
@@ -287,6 +265,7 @@ class PreprocessedData(BaseData):
Methods
-------
create
is_submitted_to_insdc

See Also
--------
@@ -297,16 +276,15 @@ class PreprocessedData(BaseData):
_data_filepath_table = "preprocessed_filepath"
_data_filepath_column = "preprocessed_data_id"
_study_preprocessed_table = "study_preprocessed_data"
_raw_preprocessed_table = "raw_preprocessed_data"

@classmethod
def create(cls, raw_data, study, preprocessed_params_table,
preprocessed_params_id, filepaths):
def create(cls, study, preprocessed_params_table, preprocessed_params_id,
filepaths, raw_data=None, submitted_to_insdc=False):
r"""Creates a new object with a new id on the storage system

Parameters
----------
raw_data : RawData
The RawData object used as base to this preprocessed data
study : Study
The study to which this preprocessed data belongs to
preprocessed_params_table : str
@@ -317,6 +295,10 @@ def create(cls, raw_data, study, preprocessed_params_table,
filepaths : iterable of tuples (str, int)
The list of paths to the preprocessed files and its filepath type
identifier
submitted_to_insdc : bool, optional
If true, the raw data files have been submitted to insdc
raw_data : RawData, optional
The RawData object used as base to this preprocessed data

Raises
------
@@ -333,11 +315,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
# Add the preprocessed data to the database,
# and get the preprocessed data id back
ppd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_params_table, "
"preprocessed_params_id) VALUES (%(raw_id)s, %(param_table)s, "
"%(param_id)s) RETURNING preprocessed_data_id".format(cls._table),
{'raw_id': raw_data.id, 'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id})[0]
"INSERT INTO qiita.{0} (preprocessed_params_table, "
"preprocessed_params_id, submitted_to_insdc) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s) "
"RETURNING preprocessed_data_id".format(cls._table),
{'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id,
'insdc': submitted_to_insdc})[0]
ppd = cls(ppd_id)

# Connect the preprocessed data with its study
@@ -346,6 +330,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
"VALUES (%s, %s)".format(ppd._study_preprocessed_table),
(study.id, ppd.id))

if raw_data is not None:
# Connect the preprocessed data with the raw data
conn_handler.execute(
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_data_id) "
"VALUES (%s, %s)".format(cls._raw_preprocessed_table),
(raw_data.id, ppd_id))

ppd._add_filepaths(filepaths, conn_handler)
return ppd
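
Here raw_data becomes an optional keyword and the submitted_to_insdc flag moves over from RawData; the raw/preprocessed link is written to raw_preprocessed_data only when raw_data is passed. A sketch of the new call against the test database; the preprocessed filenames and filepath type ids are illustrative, the params table name matches the test data:

from qiita_db.data import PreprocessedData, RawData
from qiita_db.study import Study

ppd = PreprocessedData.create(
    Study(1), 'preprocessed_sequence_illumina_params', 1,
    [('1_seqs.fna', 4), ('1_seqs.qual', 5)],   # (path, filepath_type_id)
    raw_data=RawData(1),          # optional; links via raw_preprocessed_data
    submitted_to_insdc=False)     # now stored on qiita.preprocessed_data
ppd.is_submitted_to_insdc()       # -> False
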

@@ -355,7 +346,7 @@ def raw_data(self):
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT raw_data_id FROM qiita.{0} WHERE "
"preprocessed_data_id=%s".format(self._table),
"preprocessed_data_id=%s".format(self._raw_preprocessed_table),
[self._id])[0]

@property
@@ -372,6 +363,19 @@ def study(self):
"preprocessed_data_id=%s".format(self._study_preprocessed_table),
[self._id])[0]

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc

Returns
-------
bool
True if the raw data have been submitted to insdc. False otherwise
"""
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT submitted_to_insdc FROM qiita.{0} "
"WHERE preprocessed_data_id=%s".format(self._table), (self.id,))[0]


class ProcessedData(BaseData):
r"""Object for dealing with processed data
@@ -392,15 +396,14 @@ class ProcessedData(BaseData):
_table = "processed_data"
_data_filepath_table = "processed_filepath"
_data_filepath_column = "processed_data_id"
_preprocessed_processed_table = "preprocessed_processed_data"

@classmethod
def create(cls, preprocessed_data, processed_params_table,
processed_params_id, filepaths, processed_date=None):
def create(cls, processed_params_table, processed_params_id, filepaths,
preprocessed_data=None, processed_date=None):
r"""
Parameters
----------
preprocessed_data : PreprocessedData
The PreprocessedData object used as base to this processed data
processed_params_table : str
Name of the table that holds the preprocessing parameters used
processed_params_id : int
@@ -409,6 +412,8 @@ def create(cls, preprocessed_data, processed_params_table,
filepaths : iterable of tuples (str, int)
The list of paths to the processed files and its filepath type
identifier
preprocessed_data : PreprocessedData, optional
The PreprocessedData object used as base to this processed data
processed_date : datetime, optional
Date in which the data have been processed. Default: now

@@ -432,16 +437,22 @@ def create(cls, preprocessed_data, processed_params_table,
# Add the processed data to the database,
# and get the processed data id back
pd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (preprocessed_data_id, "
"processed_params_table, processed_params_id, processed_date) "
"VALUES (%(prep_data_id)s, %(param_table)s, %(param_id)s, "
"%(date)s) RETURNING processed_data_id".format(cls._table),
{'prep_data_id': preprocessed_data.id,
'param_table': processed_params_table,
"INSERT INTO qiita.{0} (processed_params_table, "
"processed_params_id, processed_date) VALUES (%(param_table)s, "
"%(param_id)s, %(date)s) RETURNING "
"processed_data_id".format(cls._table),
{'param_table': processed_params_table,
'param_id': processed_params_id,
'date': processed_date})[0]

pd = cls(pd_id)

if preprocessed_data is not None:
conn_handler.execute(
"INSERT INTO qiita.{0} (preprocessed_data_id, "
"processed_data_id) VALUES "
"(%s, %s)".format(cls._preprocessed_processed_table),
(preprocessed_data.id, pd_id))
pd._add_filepaths(filepaths, conn_handler)
return cls(pd_id)
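
Similarly, preprocessed_data is now an optional keyword on ProcessedData.create, and the link is stored in preprocessed_processed_data rather than in a preprocessed_data_id column. A hedged sketch; the OTU-table filename and filepath type id are illustrative, the params table name matches the test data:

from datetime import datetime
from qiita_db.data import PreprocessedData, ProcessedData

pd = ProcessedData.create(
    'processed_params_uclust', 1,
    [('1_closed_reference_otu_table.biom', 6)],  # (path, filepath_type_id)
    preprocessed_data=PreprocessedData(1),  # optional; links via the new table
    processed_date=datetime(2012, 10, 1, 9, 30, 27))
pd.preprocessed_data   # -> 1, resolved through preprocessed_processed_data
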

@@ -451,18 +462,18 @@ def preprocessed_data(self):
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT preprocessed_data_id FROM qiita.{0} WHERE "
"processed_data_id=%s".format(self._table),
"processed_data_id=%s".format(self._preprocessed_processed_table),
[self._id])[0]

@property
def data_type(self):
r"""The data_type of the data used"""
conn_handler = SQLConnectionHandler()
sql = ("SELECT DISTINCT DT.data_type FROM qiita.processed_data PD "
"JOIN qiita.preprocessed_data PPD on PD.preprocessed_data_id "
"= PPD.preprocessed_data_id JOIN qiita.raw_data RD on "
"PPD.raw_data_id = RD.raw_data_id "
"JOIN qiita.common_prep_info CPI ON RD.raw_data_id = "
"CPI.raw_data_id JOIN qiita.data_type DT ON CPI.data_type_id = "
"DT.data_type_id WHERE PD.processed_data_id = %s")
sql = ("SELECT DISTINCT DT.data_type FROM "
"qiita.preprocessed_processed_data PPD JOIN "
"qiita.raw_preprocessed_data RPD on PPD.preprocessed_data_id = "
"RPD.preprocessed_data_id JOIN qiita.common_prep_info CPI ON "
"RPD.raw_data_id = CPI.raw_data_id JOIN qiita.data_type DT ON "
"CPI.data_type_id = DT.data_type_id WHERE "
"PPD.processed_data_id = %s")
return conn_handler.execute_fetchone(sql, [self._id])[0]
8 changes: 4 additions & 4 deletions qiita_db/study.py
@@ -102,9 +102,8 @@

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from .base import QiitaStatusObject, QiitaObject
from .exceptions import (QiitaDBDuplicateError, QiitaDBStatusError,
QiitaDBColumnError)
from .util import check_required_columns, check_table_cols, convert_to_id
from .exceptions import (QiitaDBStatusError, QiitaDBColumnError)
from .util import check_required_columns, check_table_cols
from .sql_connection import SQLConnectionHandler


@@ -475,7 +474,8 @@ def processed_data(self):
list of ProcessedData ids
"""
conn_handler = SQLConnectionHandler()
sql = ("SELECT processed_data_id FROM qiita.processed_data WHERE "
sql = ("SELECT processed_data_id FROM "
"qiita.preprocessed_processed_data WHERE "
"preprocessed_data_id IN (SELECT preprocessed_data_id FROM "
"qiita.study_preprocessed_data where study_id = %s)")
return [x[0] for x in conn_handler.execute_fetchall(sql, (self._id,))]
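
Study.processed_data now resolves ids through qiita.preprocessed_processed_data instead of a preprocessed_data_id column on processed_data; caller-facing behaviour is unchanged. A quick sketch against the test data:

from qiita_db.study import Study

# Processed data reachable from study 1 through its preprocessed data
Study(1).processed_data   # -> [1] with the test database below
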
12 changes: 9 additions & 3 deletions qiita_db/support_files/populate_test_db.sql
@@ -53,7 +53,7 @@ INSERT INTO qiita.study_experimental_factor (study_id, efo_id) VALUES (1, 1);
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id) VALUES ('1_s_G1_L001_sequences.fastq.gz', 1, '852952723', 1), ('1_s_G1_L001_sequences_barcodes.fastq.gz', 2, '852952723', 1), ('2_sequences.fastq.gz', 1, '852952723', 1), ('2_sequences_barcodes.fastq.gz', 2, '852952723', 1);

-- Insert the raw data information for study 1
INSERT INTO qiita.raw_data (filetype_id, submitted_to_insdc) VALUES (2, FALSE), (2, TRUE);
INSERT INTO qiita.raw_data (filetype_id) VALUES (2), (2);

-- Insert (link) the raw data with the raw filepaths
INSERT INTO qiita.raw_filepath (raw_data_id, filepath_id) VALUES (1, 1), (1, 2), (2, 3), (2, 4);
@@ -284,7 +284,10 @@ INSERT INTO qiita.prep_1 (sample_id, BarcodeSequence, LIBRARY_CONSTRUCTION_PROTO
('SKM9.640192', 'AGCAGGCACGAA', 'This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.', 'GTGCCAGCMGCCGCGGTAA', 'V4', '16S rRNA', 'ANL', 's_G1_L001_sequences', '8/1/12', 'ANL', 'micro biome of soil and rhizosphere of cannabis plants from CA', 'Cannabis Soil Microbiome', 'Illumina', '.25,g', 'Sequencing by synthesis', 'MiSeq', 'ANL', 'FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 'CCME');

-- Insert preprocessed information for raw data 1
INSERT INTO qiita.preprocessed_data (raw_data_id, preprocessed_params_table, preprocessed_params_id) VALUES (1, 'preprocessed_sequence_illumina_params', 1), (1, 'preprocessed_sequence_illumina_params', 2);
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE), ('preprocessed_sequence_illumina_params', 2, FALSE);

-- Link the new preprocessed data with the raw data
INSERT INTO qiita.raw_preprocessed_data (raw_data_id, preprocessed_data_id) VALUES (1, 1), (1, 2);

-- Insert (link) preprocessed information to study 1
INSERT INTO qiita.study_preprocessed_data (preprocessed_data_id, study_id) VALUES (1, 1), (2, 1);
@@ -299,7 +302,10 @@ INSERT INTO qiita.preprocessed_filepath (preprocessed_data_id, filepath_id) VALU
INSERT INTO qiita.preprocessed_sequence_illumina_params (trim_length) VALUES (151), (100);

-- Insert processed information for study 0 and processed data 1
INSERT INTO qiita.processed_data (preprocessed_data_id, processed_params_table, processed_params_id, processed_date) VALUES (1, 'processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');

-- Link the processed data with the preprocessed data
INSERT INTO qiita.preprocessed_processed_data (preprocessed_data_id, processed_data_id) VALUES (1, 1);

-- Populate the reference table
INSERT INTO qiita.reference (reference_name, reference_version, sequence_filepath, taxonomy_filepath, tree_filepath) VALUES ('GreenGenes', '4feb2011', 'gg_97_otus_4feb2011.fasta', 'greengenes_tax.txt', 'gg_97_otus_4feb2011.tre');
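
With the two link tables populated above, the raw -> preprocessed -> processed chain is walked with joins rather than foreign-key columns on the data tables. A sketch of such a query through SQLConnectionHandler, with illustrative aliases:

from qiita_db.sql_connection import SQLConnectionHandler

conn_handler = SQLConnectionHandler()
sql = ("SELECT rpd.raw_data_id, rpd.preprocessed_data_id, "
       "ppd.processed_data_id "
       "FROM qiita.raw_preprocessed_data rpd JOIN "
       "qiita.preprocessed_processed_data ppd ON "
       "rpd.preprocessed_data_id = ppd.preprocessed_data_id")
# With this test data: one row linking raw data 1, preprocessed data 1 and
# processed data 1 (preprocessed data 2 has no processed data yet)
conn_handler.execute_fetchall(sql)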