Data issue #125

Merged
merged 8 commits on Jun 20, 2014
121 changes: 66 additions & 55 deletions qiita_db/data.py
@@ -35,11 +35,6 @@
>>> print rd.id # doctest: +SKIP
2

Retrieve if the raw data files have been submitted to insdc

>>> rd.is_submitted_to_insdc() # doctest: +SKIP
False

Retrieve the filepaths associated with the raw data

>>> rd.get_filepaths() # doctest: +SKIP
@@ -91,7 +86,7 @@
from .base import QiitaObject
from .sql_connection import SQLConnectionHandler
from .util import (exists_dynamic_table, get_db_files_base_dir,
compute_checksum, insert_filepaths)
insert_filepaths)


class BaseData(QiitaObject):
@@ -195,7 +190,6 @@ class RawData(BaseData):
Methods
-------
create
is_submitted_to_insdc

See Also
--------
@@ -210,7 +204,7 @@
_study_raw_table = "study_raw_data"

@classmethod
def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
def create(cls, filetype, filepaths, studies):
r"""Creates a new object with a new id on the storage system

Parameters
@@ -221,8 +215,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
The list of paths to the raw files and its filepath type identifier
studies : list of Study
The list of Study objects to which the raw data belongs to
submitted_to_insdc : bool
If true, the raw data files have been submitted to insdc

Returns
-------
@@ -231,10 +223,9 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):
# Add the raw data to the database, and get the raw data id back
conn_handler = SQLConnectionHandler()
rd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (filetype_id, submitted_to_insdc) VALUES "
"(%(type_id)s, %(insdc)s) RETURNING "
"raw_data_id".format(cls._table), {'type_id': filetype,
'insdc': submitted_to_insdc})[0]
"INSERT INTO qiita.{0} (filetype_id) VALUES (%s) RETURNING "
"raw_data_id".format(cls._table),
(filetype, ))[0]
rd = cls(rd_id)

# Connect the raw data with its studies
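
With this change RawData.create no longer takes a submitted_to_insdc flag; submission status now lives on PreprocessedData (see below). A minimal usage sketch, assuming a populated test database; the filetype id 2, filepath type ids and filenames mirror the test data and are illustrative, not part of this diff:

from qiita_db.data import RawData
from qiita_db.study import Study

# filepaths are (path, filepath_type_id) tuples, per the docstring above
filepaths = [('2_sequences.fastq.gz', 1),
             ('2_sequences_barcodes.fastq.gz', 2)]
rd = RawData.create(2, filepaths, [Study(1)])
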
@@ -247,19 +238,6 @@ def create(cls, filetype, filepaths, studies, submitted_to_insdc=False):

return rd

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc

Returns
-------
bool
True if the raw data have been submitted to insdc. False otherwise
"""
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT submitted_to_insdc FROM qiita.{0} "
"WHERE raw_data_id=%s".format(self._table), [self.id])[0]

@property
def studies(self):
r"""The list of study ids to which the raw data belongs to
@@ -287,6 +265,7 @@ class PreprocessedData(BaseData):
Methods
-------
create
is_submitted_to_insdc

See Also
--------
@@ -297,16 +276,15 @@ class PreprocessedData(BaseData):
_data_filepath_table = "preprocessed_filepath"
_data_filepath_column = "preprocessed_data_id"
_study_preprocessed_table = "study_preprocessed_data"
_raw_preprocessed_table = "raw_preprocessed_data"

@classmethod
def create(cls, raw_data, study, preprocessed_params_table,
preprocessed_params_id, filepaths):
def create(cls, study, preprocessed_params_table, preprocessed_params_id,
filepaths, raw_data=None, submitted_to_insdc=False):
r"""Creates a new object with a new id on the storage system

Parameters
----------
raw_data : RawData
The RawData object used as base to this preprocessed data
study : Study
The study to which this preprocessed data belongs to
preprocessed_params_table : str
@@ -317,6 +295,10 @@ def create(cls, raw_data, study, preprocessed_params_table,
filepaths : iterable of tuples (str, int)
The list of paths to the preprocessed files and its filepath type
identifier
submitted_to_insdc : bool, optional
If true, the raw data files have been submitted to insdc
raw_data : RawData, optional
The RawData object used as base to this preprocessed data

Raises
------
@@ -333,11 +315,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
# Add the preprocessed data to the database,
# and get the preprocessed data id back
ppd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_params_table, "
"preprocessed_params_id) VALUES (%(raw_id)s, %(param_table)s, "
"%(param_id)s) RETURNING preprocessed_data_id".format(cls._table),
{'raw_id': raw_data.id, 'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id})[0]
"INSERT INTO qiita.{0} (preprocessed_params_table, "
"preprocessed_params_id, submitted_to_insdc) VALUES "
"(%(param_table)s, %(param_id)s, %(insdc)s) "
"RETURNING preprocessed_data_id".format(cls._table),
{'param_table': preprocessed_params_table,
'param_id': preprocessed_params_id,
'insdc': submitted_to_insdc})[0]
ppd = cls(ppd_id)

# Connect the preprocessed data with its study
@@ -346,6 +330,13 @@ def create(cls, raw_data, study, preprocessed_params_table,
"VALUES (%s, %s)".format(ppd._study_preprocessed_table),
(study.id, ppd.id))

if raw_data is not None:
# Connect the preprocessed data with the raw data
conn_handler.execute(
"INSERT INTO qiita.{0} (raw_data_id, preprocessed_data_id) "
"VALUES (%s, %s)".format(cls._raw_preprocessed_table),
(raw_data.id, ppd_id))

ppd._add_filepaths(filepaths, conn_handler)
return ppd
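
Here raw_data becomes an optional keyword and the submitted_to_insdc flag moves over from RawData; the raw/preprocessed link is written to raw_preprocessed_data only when raw_data is passed. A sketch of the new call against the test database; the preprocessed filenames and filepath type ids are illustrative, the params table name matches the test data:

from qiita_db.data import PreprocessedData, RawData
from qiita_db.study import Study

ppd = PreprocessedData.create(
    Study(1), 'preprocessed_sequence_illumina_params', 1,
    [('1_seqs.fna', 4), ('1_seqs.qual', 5)],   # (path, filepath_type_id)
    raw_data=RawData(1),          # optional; links via raw_preprocessed_data
    submitted_to_insdc=False)     # now stored on qiita.preprocessed_data
ppd.is_submitted_to_insdc()       # -> False
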

@@ -355,7 +346,7 @@ def raw_data(self):
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT raw_data_id FROM qiita.{0} WHERE "
"preprocessed_data_id=%s".format(self._table),
"preprocessed_data_id=%s".format(self._raw_preprocessed_table),
[self._id])[0]

@property
@@ -372,6 +363,19 @@ def study(self):
"preprocessed_data_id=%s".format(self._study_preprocessed_table),
[self._id])[0]

def is_submitted_to_insdc(self):
r"""Tells if the raw data has been submitted to insdc

Returns
-------
bool
True if the raw data have been submitted to insdc. False otherwise
"""
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT submitted_to_insdc FROM qiita.{0} "
"WHERE preprocessed_data_id=%s".format(self._table), (self.id,))[0]


class ProcessedData(BaseData):
r"""Object for dealing with processed data
@@ -392,15 +396,14 @@ class ProcessedData(BaseData):
_table = "processed_data"
_data_filepath_table = "processed_filepath"
_data_filepath_column = "processed_data_id"
_preprocessed_processed_table = "preprocessed_processed_data"

@classmethod
def create(cls, preprocessed_data, processed_params_table,
processed_params_id, filepaths, processed_date=None):
def create(cls, processed_params_table, processed_params_id, filepaths,
preprocessed_data=None, processed_date=None):
r"""
Parameters
----------
preprocessed_data : PreprocessedData
The PreprocessedData object used as base to this processed data
processed_params_table : str
Name of the table that holds the preprocessing parameters used
processed_params_id : int
@@ -409,6 +412,8 @@ def create(cls, preprocessed_data, processed_params_table,
filepaths : iterable of tuples (str, int)
The list of paths to the processed files and its filepath type
identifier
preprocessed_data : PreprocessedData, optional
The PreprocessedData object used as base to this processed data
processed_date : datetime, optional
Date in which the data have been processed. Default: now

@@ -432,16 +437,22 @@ def create(cls, preprocessed_data, processed_params_table,
# Add the processed data to the database,
# and get the processed data id back
pd_id = conn_handler.execute_fetchone(
"INSERT INTO qiita.{0} (preprocessed_data_id, "
"processed_params_table, processed_params_id, processed_date) "
"VALUES (%(prep_data_id)s, %(param_table)s, %(param_id)s, "
"%(date)s) RETURNING processed_data_id".format(cls._table),
{'prep_data_id': preprocessed_data.id,
'param_table': processed_params_table,
"INSERT INTO qiita.{0} (processed_params_table, "
"processed_params_id, processed_date) VALUES (%(param_table)s, "
"%(param_id)s, %(date)s) RETURNING "
"processed_data_id".format(cls._table),
{'param_table': processed_params_table,
'param_id': processed_params_id,
'date': processed_date})[0]

pd = cls(pd_id)

if preprocessed_data is not None:
conn_handler.execute(
"INSERT INTO qiita.{0} (preprocessed_data_id, "
"processed_data_id) VALUES "
"(%s, %s)".format(cls._preprocessed_processed_table),
(preprocessed_data.id, pd_id))
pd._add_filepaths(filepaths, conn_handler)
return cls(pd_id)
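
Similarly, preprocessed_data is now an optional keyword on ProcessedData.create, and the link is stored in preprocessed_processed_data rather than in a preprocessed_data_id column. A hedged sketch; the OTU-table filename and filepath type id are illustrative, the params table name matches the test data:

from datetime import datetime
from qiita_db.data import PreprocessedData, ProcessedData

pd = ProcessedData.create(
    'processed_params_uclust', 1,
    [('1_closed_reference_otu_table.biom', 6)],  # (path, filepath_type_id)
    preprocessed_data=PreprocessedData(1),  # optional; links via the new table
    processed_date=datetime(2012, 10, 1, 9, 30, 27))
pd.preprocessed_data   # -> 1, resolved through preprocessed_processed_data
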

@@ -451,18 +462,18 @@ def preprocessed_data(self):
conn_handler = SQLConnectionHandler()
return conn_handler.execute_fetchone(
"SELECT preprocessed_data_id FROM qiita.{0} WHERE "
"processed_data_id=%s".format(self._table),
"processed_data_id=%s".format(self._preprocessed_processed_table),
[self._id])[0]

@property
def data_type(self):
r"""The data_type of the data used"""
conn_handler = SQLConnectionHandler()
sql = ("SELECT DISTINCT DT.data_type FROM qiita.processed_data PD "
"JOIN qiita.preprocessed_data PPD on PD.preprocessed_data_id "
"= PPD.preprocessed_data_id JOIN qiita.raw_data RD on "
"PPD.raw_data_id = RD.raw_data_id "
"JOIN qiita.common_prep_info CPI ON RD.raw_data_id = "
"CPI.raw_data_id JOIN qiita.data_type DT ON CPI.data_type_id = "
"DT.data_type_id WHERE PD.processed_data_id = %s")
sql = ("SELECT DISTINCT DT.data_type FROM "
"qiita.preprocessed_processed_data PPD JOIN "
"qiita.raw_preprocessed_data RPD on PPD.preprocessed_data_id = "
"RPD.preprocessed_data_id JOIN qiita.common_prep_info CPI ON "
"RPD.raw_data_id = CPI.raw_data_id JOIN qiita.data_type DT ON "
"CPI.data_type_id = DT.data_type_id WHERE "
"PPD.processed_data_id = %s")
return conn_handler.execute_fetchone(sql, [self._id])[0]
8 changes: 4 additions & 4 deletions qiita_db/study.py
@@ -102,9 +102,8 @@

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from .base import QiitaStatusObject, QiitaObject
from .exceptions import (QiitaDBDuplicateError, QiitaDBStatusError,
QiitaDBColumnError)
from .util import check_required_columns, check_table_cols, convert_to_id
from .exceptions import (QiitaDBStatusError, QiitaDBColumnError)
from .util import check_required_columns, check_table_cols
from .sql_connection import SQLConnectionHandler


@@ -475,7 +474,8 @@ def processed_data(self):
list of ProcessedData ids
"""
conn_handler = SQLConnectionHandler()
sql = ("SELECT processed_data_id FROM qiita.processed_data WHERE "
sql = ("SELECT processed_data_id FROM "
"qiita.preprocessed_processed_data WHERE "
"preprocessed_data_id IN (SELECT preprocessed_data_id FROM "
"qiita.study_preprocessed_data where study_id = %s)")
return [x[0] for x in conn_handler.execute_fetchall(sql, (self._id,))]
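
Study.processed_data now resolves ids through qiita.preprocessed_processed_data instead of a preprocessed_data_id column on processed_data; caller-facing behaviour is unchanged. A quick sketch against the test data:

from qiita_db.study import Study

# Processed data reachable from study 1 through its preprocessed data
Study(1).processed_data   # -> [1] with the test database below
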
12 changes: 9 additions & 3 deletions qiita_db/support_files/populate_test_db.sql
@@ -53,7 +53,7 @@ INSERT INTO qiita.study_experimental_factor (study_id, efo_id) VALUES (1, 1);
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id) VALUES ('1_s_G1_L001_sequences.fastq.gz', 1, '852952723', 1), ('1_s_G1_L001_sequences_barcodes.fastq.gz', 2, '852952723', 1), ('2_sequences.fastq.gz', 1, '852952723', 1), ('2_sequences_barcodes.fastq.gz', 2, '852952723', 1);

-- Insert the raw data information for study 1
INSERT INTO qiita.raw_data (filetype_id, submitted_to_insdc) VALUES (2, FALSE), (2, TRUE);
INSERT INTO qiita.raw_data (filetype_id) VALUES (2), (2);

-- Insert (link) the raw data with the raw filepaths
INSERT INTO qiita.raw_filepath (raw_data_id, filepath_id) VALUES (1, 1), (1, 2), (2, 3), (2, 4);
@@ -284,7 +284,10 @@ INSERT INTO qiita.prep_1 (sample_id, BarcodeSequence, LIBRARY_CONSTRUCTION_PROTO
('SKM9.640192', 'AGCAGGCACGAA', 'This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions.', 'GTGCCAGCMGCCGCGGTAA', 'V4', '16S rRNA', 'ANL', 's_G1_L001_sequences', '8/1/12', 'ANL', 'micro biome of soil and rhizosphere of cannabis plants from CA', 'Cannabis Soil Microbiome', 'Illumina', '.25,g', 'Sequencing by synthesis', 'MiSeq', 'ANL', 'FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT', 'CCME');

-- Insert preprocessed information for raw data 1
INSERT INTO qiita.preprocessed_data (raw_data_id, preprocessed_params_table, preprocessed_params_id) VALUES (1, 'preprocessed_sequence_illumina_params', 1), (1, 'preprocessed_sequence_illumina_params', 2);
INSERT INTO qiita.preprocessed_data (preprocessed_params_table, preprocessed_params_id, submitted_to_insdc) VALUES ('preprocessed_sequence_illumina_params', 1, TRUE), ('preprocessed_sequence_illumina_params', 2, FALSE);

-- Link the new preprocessed data with the raw data
INSERT INTO qiita.raw_preprocessed_data (raw_data_id, preprocessed_data_id) VALUES (1, 1), (1, 2);

-- Insert (link) preprocessed information to study 1
INSERT INTO qiita.study_preprocessed_data (preprocessed_data_id, study_id) VALUES (1, 1), (2, 1);
@@ -299,7 +302,10 @@ INSERT INTO qiita.preprocessed_filepath (preprocessed_data_id, filepath_id) VALU
INSERT INTO qiita.preprocessed_sequence_illumina_params (trim_length) VALUES (151), (100);

-- Insert processed information for study 0 and processed data 1
INSERT INTO qiita.processed_data (preprocessed_data_id, processed_params_table, processed_params_id, processed_date) VALUES (1, 'processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');
INSERT INTO qiita.processed_data (processed_params_table, processed_params_id, processed_date) VALUES ('processed_params_uclust', 1, 'Mon Oct 1 09:30:27 2012');

-- Link the processed data with the preprocessed data
INSERT INTO qiita.preprocessed_processed_data (preprocessed_data_id, processed_data_id) VALUES (1, 1);

-- Populate the reference table
INSERT INTO qiita.reference (reference_name, reference_version, sequence_filepath, taxonomy_filepath, tree_filepath) VALUES ('GreenGenes', '4feb2011', 'gg_97_otus_4feb2011.fasta', 'greengenes_tax.txt', 'gg_97_otus_4feb2011.tre');
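
With the two link tables populated above, the raw -> preprocessed -> processed chain is walked with joins rather than foreign-key columns on the data tables. A sketch of such a query through SQLConnectionHandler, with illustrative aliases:

from qiita_db.sql_connection import SQLConnectionHandler

conn_handler = SQLConnectionHandler()
sql = ("SELECT rpd.raw_data_id, rpd.preprocessed_data_id, "
       "ppd.processed_data_id "
       "FROM qiita.raw_preprocessed_data rpd JOIN "
       "qiita.preprocessed_processed_data ppd ON "
       "rpd.preprocessed_data_id = ppd.preprocessed_data_id")
# With this test data: one row linking raw data 1, preprocessed data 1 and
# processed data 1 (preprocessed data 2 has no processed data yet)
conn_handler.execute_fetchall(sql)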