qiita-spots · antgonza · Apr 28, 2015 · Apr 17, 2015 · Apr 17, 2015 · Apr 17, 2015
diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py
diff --git a/qiita_db/metadata_template/constants.py b/qiita_db/metadata_template/constants.py
@@ -6,10 +6,74 @@
 # The full license is in the file LICENSE, distributed with this software.
 # -----------------------------------------------------------------------------
 
+from collections import namedtuple
+from future.utils import viewkeys, viewvalues
+
+Restriction = namedtuple('Restriction', ['columns', 'error_msg'])
+
+# A dict containing the restrictions that apply to the sample templates
+SAMPLE_TEMPLATE_COLUMNS = {
+    # The following columns are required by EBI for submission
+    'EBI': Restriction(columns={'collection_timestamp': 'timestamp',
+                                'physical_specimen_location': 'varchar'},
+                       error_msg="EBI submission disabled"),
+    # The following columns are required for the official main QIITA site
+    'qiita_main': Restriction(columns={'sample_type': 'varchar',
+                                       'description': 'varchar',
+                                       'physical_specimen_remaining': 'bool',
+                                       'dna_extracted': 'bool',
+                                       'latitude': 'float8',
+                                       'longitude': 'float8',
+                                       'host_subject_id': 'varchar'},
+                              error_msg="Processed data approval disabled")
+}
+
+# A dict containing the restrictions that apply to the prep templates
+PREP_TEMPLATE_COLUMNS = {
+    # The following columns are required by EBI for submission
+    'EBI': Restriction(
+        columns={'primer': 'varchar',
+                 'center_name': 'varchar',
+                 'platform': 'varchar',
+                 'library_construction_protocol': 'varchar',
+                 'experiment_design_description': 'varchar'},
+        error_msg="EBI submission disabled")
+}
+
+# Different prep templates have different requirements depending on the data
+# type. We create a dictionary for each of these special datatypes
 
 TARGET_GENE_DATA_TYPES = ['16S', '18S', 'ITS']
-REQUIRED_TARGET_GENE_COLS = {'barcodesequence', 'linkerprimersequence',
-                             'run_prefix', 'library_construction_protocol',
-                             'experiment_design_description', 'platform'}
-RENAME_COLS_DICT = {'barcode': 'barcodesequence',
-                    'primer': 'linkerprimersequence'}
+
+PREP_TEMPLATE_COLUMNS_TARGET_GENE = {
+    # The following columns are required by QIIME to execute split libraries
+    'demultiplex': Restriction(
+        columns={'barcode': 'varchar',
+                 'primer': 'varchar'},
+        error_msg="Demultiplexing disabled. You will not be able to "
+                  "preprocess your raw data"),
+    # The following columns are required by Qiita to know how to execute split
+    # libraries using QIIME over a study with multiple illumina lanes
+    'demultiplex_multiple': Restriction(
+        columns={'barcode': 'varchar',
+                 'primer': 'varchar',
+                 'run_prefix': 'varchar'},
+        error_msg="Demultiplexing with multiple input files disabled. If your "
+                  "raw data includes multiple raw input files, you will not "
+                  "be able to preprocess your raw data")
+}
+
+# This list is useful to have if we want to loop through all the restrictions
+# in a template-independent manner
+ALL_RESTRICTIONS = [SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS,
+                    PREP_TEMPLATE_COLUMNS_TARGET_GENE]
+
+
+# A set holding all the controlled columns, useful to avoid recalculating it
+def _col_iterator():
+    for r_set in ALL_RESTRICTIONS:
+        for restriction in viewvalues(r_set):
+            for cols in viewkeys(restriction.columns):
+                yield cols
+
+CONTROLLED_COLS = set(col for col in _col_iterator())
diff --git a/qiita_db/metadata_template/prep_template.py b/qiita_db/metadata_template/prep_template.py
@@ -7,20 +7,26 @@
 # -----------------------------------------------------------------------------
 
 from __future__ import division
+from future.utils import viewvalues
 from os.path import join
 from time import strftime
+from copy import deepcopy
+import warnings
+
+import pandas as pd
 
 from qiita_core.exceptions import IncompetentQiitaDeveloperError
 from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
-                                 QiitaDBError, QiitaDBExecutionError)
+                                 QiitaDBError, QiitaDBExecutionError,
+                                 QiitaDBWarning)
 from qiita_db.sql_connection import SQLConnectionHandler
 from qiita_db.ontology import Ontology
 from qiita_db.util import (convert_to_id,
                            convert_from_id, get_mountpoint, infer_status)
 from .base_metadata_template import BaseSample, MetadataTemplate
 from .util import load_template_to_dataframe
-from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
-                        REQUIRED_TARGET_GENE_COLS)
+from .constants import (TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS,
+                        PREP_TEMPLATE_COLUMNS_TARGET_GENE)
 
 
 class PrepSample(BaseSample):
@@ -66,8 +72,9 @@ class PrepTemplate(MetadataTemplate):
     _table_prefix = "prep_"
     _column_table = "prep_columns"
     _id_column = "prep_template_id"
-    translate_cols_dict = {'emp_status_id': 'emp_status'}
     _sample_cls = PrepSample
+    _fp_id = convert_to_id("prep_template", "filepath_type")
+    _filepath_table = 'prep_template_filepath'
 
     @classmethod
     def create(cls, md_template, raw_data, study, data_type,
@@ -116,8 +123,13 @@ def create(cls, md_template, raw_data, study, data_type,
             data_type_id = convert_to_id(data_type, "data_type", conn_handler)
             data_type_str = data_type
 
+        pt_cols = PREP_TEMPLATE_COLUMNS
+        if data_type_str in TARGET_GENE_DATA_TYPES:
+            pt_cols = deepcopy(PREP_TEMPLATE_COLUMNS)
+            pt_cols.update(PREP_TEMPLATE_COLUMNS_TARGET_GENE)
+
         md_template = cls._clean_validate_template(md_template, study.id,
-                                                   data_type_str, conn_handler)
+                                                   pt_cols)
 
         # Insert the metadata template
         # We need the prep_id for multiple calls below, which currently is not
@@ -140,7 +152,7 @@ def create(cls, md_template, raw_data, study, data_type,
                 "{0} = %s".format(cls._id_column), (prep_id,))
 
             # Check if sample IDs present here but not in sample template
-            sql = ("SELECT sample_id from qiita.required_sample_info WHERE "
+            sql = ("SELECT sample_id from qiita.study_sample WHERE "
                    "study_id = %s")
             # Get list of study sample IDs, prep template study IDs,
             # and their intersection
@@ -181,40 +193,6 @@ def validate_investigation_type(self, investigation_type):
                                      "Choose from: %s" % (investigation_type,
                                                           ', '.join(terms)))
 
-    @classmethod
-    def _check_template_special_columns(cls, md_template, data_type):
-        r"""Checks for special columns based on obj type
-
-        Parameters
-        ----------
-        md_template : DataFrame
-            The metadata template file contents indexed by sample ids
-        data_type : str
-            The data_type of the template.
-
-        Returns
-        -------
-        set
-            The set of missing columns
-
-        Notes
-        -----
-        Sometimes people use different names for the same columns. We just
-        rename them to use the naming that we expect, so this is normalized
-        across studies.
-        """
-        # We only have column requirements if the data type of the raw data
-        # is one of the target gene types
-        missing_cols = set()
-        if data_type in TARGET_GENE_DATA_TYPES:
-            md_template.rename(columns=RENAME_COLS_DICT, inplace=True)
-
-            # Check for all required columns for target genes studies
-            missing_cols = REQUIRED_TARGET_GENE_COLS.difference(
-                md_template.columns)
-
-        return missing_cols
-
     @classmethod
     def delete(cls, id_):
         r"""Deletes the table from the database
@@ -412,17 +390,11 @@ def generate_files(self):
         self.add_filepath(fp)
 
         # creating QIIME mapping file
-        self.create_qiime_mapping_file(fp)
+        self.create_qiime_mapping_file()
 
-    def create_qiime_mapping_file(self, prep_template_fp):
+    def create_qiime_mapping_file(self):
         """This creates the QIIME mapping file and links it in the db.
 
-        Parameters
-        ----------
-        prep_template_fp : str
-            The prep template filepath that should be concatenated to the
-            sample template go used to generate a new  QIIME mapping file
-
         Returns
         -------
         filepath : str
@@ -432,12 +404,20 @@ def create_qiime_mapping_file(self, prep_template_fp):
         ------
         ValueError
             If the prep template is not a subset of the sample template
+        QiitaDBWarning
+            If the QIIME-required columns are not present in the template
+
+        Notes
+        -----
+        We cannot ensure that the QIIME-required columns are present in the
+        metadata map. However, we have to generate a QIIME-compliant mapping
+        file. Since the user may need a QIIME mapping file, but not these
+        QIIME-required columns, we are going to create them and
+        populate them with the value XXQIITAXX.
         """
         rename_cols = {
             'barcode': 'BarcodeSequence',
-            'barcodesequence': 'BarcodeSequence',
             'primer': 'LinkerPrimerSequence',
-            'linkerprimersequence': 'LinkerPrimerSequence',
             'description': 'Description',
         }
 
@@ -456,19 +436,38 @@ def create_qiime_mapping_file(self, prep_template_fp):
 
         # reading files via pandas
         st = load_template_to_dataframe(sample_template_fp)
-        pt = load_template_to_dataframe(prep_template_fp)
+        pt = self.to_dataframe()
+
         st_sample_names = set(st.index)
         pt_sample_names = set(pt.index)
 
         if not pt_sample_names.issubset(st_sample_names):
             raise ValueError(
-                "Prep template is not a sub set of the sample template, files:"
-                "%s %s - samples: %s" % (sample_template_fp, prep_template_fp,
-                                         str(pt_sample_names-st_sample_names)))
+                "Prep template is not a sub set of the sample template, files"
+                "%s - samples: %s"
+                % (sample_template_fp,
+                   ', '.join(pt_sample_names-st_sample_names)))
 
         mapping = pt.join(st, lsuffix="_prep")
         mapping.rename(columns=rename_cols, inplace=True)
 
+        # Pre-populate the QIIME-required columns with the value XXQIITAXX
+        index = mapping.index
+        placeholder = ['XXQIITAXX'] * len(index)
+        missing = []
+        for val in viewvalues(rename_cols):
+            if val not in mapping:
+                missing.append(val)
+                mapping[val] = pd.Series(placeholder, index=index)
+
+        if missing:
+            warnings.warn(
+                "Some columns required to generate a QIIME-compliant mapping "
+                "file are not present in the template. A placeholder value "
+                "(XXQIITAXX) has been used to populate these columns. Missing "
+                "columns: %s" % ', '.join(missing),
+                QiitaDBWarning)
+
         # Gets the orginal mapping columns and readjust the order to comply
         # with QIIME requirements
         cols = mapping.columns.values.tolist()
@@ -486,11 +485,13 @@ def create_qiime_mapping_file(self, prep_template_fp):
                         self.id, strftime("%Y%m%d-%H%M%S")))
 
         # Save the mapping file
-        mapping.to_csv(filepath, index_label='#SampleID', na_rep='unknown',
+        mapping.to_csv(filepath, index_label='#SampleID', na_rep='',
                        sep='\t')
 
         # adding the fp to the object
-        self.add_filepath(filepath)
+        self.add_filepath(
+            filepath, conn_handler=conn_handler,
+            fp_id=convert_to_id("qiime_map", "filepath_type"))
 
         return filepath
 

diff --git a/qiita_db/metadata_template/sample_template.py b/qiita_db/metadata_template/sample_template.py
@@ -16,11 +16,12 @@
 from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
                                  QiitaDBUnknownIDError)
 from qiita_db.sql_connection import SQLConnectionHandler
-from qiita_db.util import get_required_sample_info_status, get_mountpoint
+from qiita_db.util import get_mountpoint, convert_to_id
 from qiita_db.study import Study
 from qiita_db.data import RawData
 from .base_metadata_template import BaseSample, MetadataTemplate
 from .prep_template import PrepTemplate
+from .constants import SAMPLE_TEMPLATE_COLUMNS
 
 
 class Sample(BaseSample):
@@ -66,9 +67,9 @@ class SampleTemplate(MetadataTemplate):
     _table_prefix = "sample_"
     _column_table = "study_sample_columns"
     _id_column = "study_id"
-    translate_cols_dict = {
-        'required_sample_info_status_id': 'required_sample_info_status'}
     _sample_cls = Sample
+    _fp_id = convert_to_id("sample_template", "filepath_type")
+    _filepath_table = 'sample_template_filepath'
 
     @staticmethod
     def metadata_headers():
@@ -87,19 +88,6 @@ def metadata_headers():
                 "WHERE table_name = 'required_sample_info' "
                 "ORDER BY column_name")]
 
-    @classmethod
-    def _check_template_special_columns(cls, md_template, study_id):
-        r"""Checks for special columns based on obj type
-
-        Parameters
-        ----------
-        md_template : DataFrame
-            The metadata template file contents indexed by sample ids
-        study_id : int
-            The study to which the sample template belongs to.
-        """
-        return set()
-
     @classmethod
     def create(cls, md_template, study):
         r"""Creates the sample template in the database
@@ -123,7 +111,7 @@ def create(cls, md_template, study):
 
         # Clean and validate the metadata template given
         md_template = cls._clean_validate_template(md_template, study.id,
-                                                   study.id, conn_handler)
+                                                   SAMPLE_TEMPLATE_COLUMNS)
 
         cls._add_common_creation_steps_to_queue(md_template, study.id,
                                                 conn_handler, queue_name)
@@ -233,8 +221,7 @@ def extend(self, md_template):
         conn_handler.create_queue(queue_name)
 
         md_template = self._clean_validate_template(md_template, self.study_id,
-                                                    self.study_id,
-                                                    conn_handler)
+                                                    SAMPLE_TEMPLATE_COLUMNS)
 
         self._add_common_extend_steps_to_queue(md_template, conn_handler,
                                                queue_name)
@@ -260,7 +247,7 @@ def update(self, md_template):
 
         # Clean and validate the metadata template given
         new_map = self._clean_validate_template(md_template, self.id,
-                                                conn_handler)
+                                                SAMPLE_TEMPLATE_COLUMNS)
         # Retrieving current metadata
         current_map = self._transform_to_dict(conn_handler.execute_fetchall(
             "SELECT * FROM qiita.{0} WHERE {1}=%s".format(self._table,

diff --git a/qiita_db/metadata_template/test/test_base_metadata_template.py b/qiita_db/metadata_template/test/test_base_metadata_template.py
@@ -60,7 +60,7 @@ def test_add_common_creation_steps_to_queue(self):
     def test_clean_validate_template(self):
         """_clean_validate_template raises an error from base class"""
         with self.assertRaises(IncompetentQiitaDeveloperError):
-            MetadataTemplate._clean_validate_template(None, 1, None, None)
+            MetadataTemplate._clean_validate_template(None, 1, None)
 
 
 if __name__ == '__main__':