Fix analysis tests #1106

Merged: 17 commits, Apr 28, 2015
128 changes: 55 additions & 73 deletions qiita_db/analysis.py
@@ -24,15 +24,17 @@
from future.utils import viewitems
from biom import load_table
from biom.util import biom_open
import pandas as pd
from skbio.util import find_duplicates

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from .sql_connection import SQLConnectionHandler
from .base import QiitaStatusObject
from .data import ProcessedData, RawData
from .data import ProcessedData
from .study import Study
from .exceptions import QiitaDBStatusError # QiitaDBNotImplementedError
from .exceptions import QiitaDBStatusError, QiitaDBError
from .util import (convert_to_id, get_work_base_dir,
get_mountpoint, get_table_cols, insert_filepaths)
get_mountpoint, insert_filepaths)


class Analysis(QiitaStatusObject):
@@ -719,81 +721,61 @@ def _build_mapping_file(self, samples, conn_handler=None):
Code modified slightly from qiime.util.MetadataMap.__add__"""
conn_handler = conn_handler if conn_handler is not None \
else SQLConnectionHandler()
# We will keep track of all unique sample_ids and metadata headers
# we have seen as we go, as well as studies already seen

all_sample_ids = set()
all_headers = set(get_table_cols("required_sample_info", conn_handler))
all_studies = set()
sql = """SELECT filepath_id, filepath
FROM qiita.filepath
JOIN qiita.prep_template_filepath USING (filepath_id)
JOIN qiita.prep_template_preprocessed_data
USING (prep_template_id)
JOIN qiita.preprocessed_processed_data
USING (preprocessed_data_id)
JOIN qiita.filepath_type USING (filepath_type_id)
WHERE processed_data_id = %s
AND filepath_type = 'qiime_map'
ORDER BY filepath_id DESC"""
_id, fp = get_mountpoint('templates')[0]
to_concat = []

merged_data = defaultdict(lambda: defaultdict(lambda: None))
for pid, samples in viewitems(samples):
if any([all_sample_ids.intersection(samples),
len(set(samples)) != len(samples)]):
# duplicate samples so raise error
raise ValueError("Duplicate sample ids found: %s" %
str(all_sample_ids.intersection(samples)))
all_sample_ids.update(samples)
study_id = ProcessedData(pid).study

# create a convenience study object
s = Study(study_id)

# get the ids to retrieve the data from the sample and prep tables
sample_template_id = s.sample_template
# you can have multiple different prep templates but we are only
# using the one for 16S i. e. the last one ... sorry ;l
# see issue https://github.com/biocore/qiita/issues/465
prep_template_id = RawData(s.raw_data()[0]).prep_templates[-1]

if study_id in all_studies:
# samples already added by other processed data file
# with the study_id
continue
all_studies.add(study_id)
# add headers to set of all headers found
all_headers.update(get_table_cols("sample_%d" % sample_template_id,
conn_handler))
all_headers.update(get_table_cols("prep_%d" % prep_template_id,
conn_handler))
# NEED TO ADD COMMON PREP INFO Issue #247
sql = ("SELECT rs.*, p.*, ss.* "
"FROM qiita.required_sample_info rs JOIN qiita.sample_{0} "
"ss USING(sample_id) JOIN qiita.prep_{1} p USING(sample_id)"
" WHERE rs.sample_id IN {2} AND rs.study_id = {3}".format(
sample_template_id, prep_template_id,
"(%s)" % ",".join("'%s'" % s for s in samples),
study_id))
metadata = conn_handler.execute_fetchall(sql)
# add all the metadata to merged_data
for data in metadata:
sample_id = data['sample_id']
for header, value in viewitems(data):
if header in {'sample_id'}:
continue
merged_data[sample_id][header] = str(value)

# prep headers, making sure they follow mapping file format rules
all_headers = list(all_headers - {'linkerprimersequence',
'barcodesequence', 'description', 'sample_id'})
all_headers.sort()
all_headers = ['BarcodeSequence', 'LinkerPrimerSequence'] + all_headers
all_headers.append('Description')

# write mapping file out
if len(samples) != len(set(samples)):
duplicates = find_duplicates(samples)
raise QiitaDBError("Duplicate sample ids found: %s"
% ', '.join(duplicates))
# Get the QIIME mapping file
qiime_map_fp = conn_handler.execute_fetchall(sql, (pid,))[0][1]
# Parse the mapping file
qiime_map = pd.read_csv(
Contributor:
Loading the mapping file with pandas may not be the best idea here: the datatypes of some columns may be inferred, and that will cause problems down the road. The simplest example is a column whose values are 1001, 0001, 0002 and 0003; pandas will cast them to ints and turn them into 1001, 1, 2 and 3. If there's any way we can load only string values and then do all the needed transformations, that would be ideal.
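A minimal sketch of that inference problem (toy data, not from this PR):

    from io import StringIO

    import pandas as pd

    tsv = "#SampleID\tplate\nS1\t1001\nS2\t0001\nS3\t0002\n"

    # With default settings pandas infers an integer column for 'plate',
    # so the leading zeros are silently lost.
    df = pd.read_csv(StringIO(tsv), sep='\t')
    print(df['plate'].tolist())  # [1001, 1, 2]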

Contributor:
I'm personally still wary about using the files vs using the database values, but that's a separate conversation.

Contributor Author:
@ElDeveloper I think I'm going to pull out a wrapper function for pandas so the parsing is always done consistently; see the sketch below.

@squirrelo With the changes that I've made in the metadata template, the files and the DB are always in sync. If we are not going to trust our files, why store them?
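A sketch of what such a wrapper could look like (the name load_qiime_map and the dtype=str approach are illustrative assumptions, not code from this PR; dtype=str forces string parsing in recent pandas):

    import pandas as pd

    def load_qiime_map(fp):
        """Parse a QIIME mapping file, keeping every column as text."""
        # dtype=str stops pandas from inferring ints/floats, so values
        # like '0001' survive a read/write round trip unchanged.
        qiime_map = pd.read_csv(fp, sep='\t', dtype=str,
                                keep_default_na=False,
                                na_values=['unknown'], index_col=False)
        qiime_map.set_index('#SampleID', inplace=True, drop=True)
        return qiime_map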

Contributor:
@josenavas, as long as we can guarantee that all values are loaded as strings, then that should work just fine. Searching on the intertubes I came across this: http://stackoverflow.com/questions/12101113/prevent-pandas-from-automatically-infering-type-in-read-csv

Contributor Author:
I've been thinking more about this (because we already have the template parsing centralized, and it is smarter). Thanks for the link, but I think it is overkill. I think the easier way is to use the converters parameter like this: converters=defaultdict(lambda: str), so every column is always parsed as str, which is what we want here 😄
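A sketch of the converters mechanism (toy data, not from this PR). An explicit per-column dict is shown here, since whether pandas consults a defaultdict for columns that are not already keys in it can depend on the pandas version:

    from io import StringIO

    import pandas as pd

    tsv = "#SampleID\tplate\nS1\t1001\nS2\t0001\n"

    # A converter receives the raw cell text, so str keeps it intact.
    df = pd.read_csv(StringIO(tsv), sep='\t', converters={'plate': str})
    print(df['plate'].tolist())  # ['1001', '0001']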

Contributor:
Got it, that makes sense. If that's the case then we don't need to infer datetimes or parse dates; would you mind setting those to False or just removing them altogether?

Contributor Author:
Done!

join(fp, qiime_map_fp), sep='\t', keep_default_na=False,
na_values=['unknown'], index_col=False,
converters=defaultdict(lambda: str))
qiime_map.set_index('#SampleID', inplace=True, drop=True)
qiime_map = qiime_map.loc[samples]

duplicates = all_sample_ids.intersection(qiime_map.index)
if duplicates or len(samples) != len(set(samples)):
# Duplicate samples so raise error
raise QiitaDBError("Duplicate sample ids found: %s"
% ', '.join(duplicates))
all_sample_ids.update(qiime_map.index)
to_concat.append(qiime_map)

merged_map = pd.concat(to_concat)

cols = merged_map.columns.values.tolist()
cols.remove('BarcodeSequence')
cols.remove('LinkerPrimerSequence')
cols.remove('Description')
new_cols = ['BarcodeSequence', 'LinkerPrimerSequence']
new_cols.extend(cols)
new_cols.append('Description')
merged_map = merged_map[new_cols]
Contributor:
Ah, this is super awesome! ✨


# Save the mapping file
_, base_fp = get_mountpoint(self._table)[0]
mapping_fp = join(base_fp, "%d_analysis_mapping.txt" % self._id)
with open(mapping_fp, 'w') as f:
f.write("#SampleID\t%s\n" % '\t'.join(all_headers))
for sample, metadata in viewitems(merged_data):
data = [sample]
for header in all_headers:
l_head = header.lower()
data.append(metadata[l_head] if
metadata[l_head] is not None else "no_data")
f.write("%s\n" % "\t".join(data))

self._add_file("%d_analysis_mapping.txt" % self._id,
"plain_text", conn_handler=conn_handler)
merged_map.to_csv(mapping_fp, index_label='#SampleID',
na_rep='unknown', sep='\t')

def _add_file(self, filename, filetype, data_type=None, conn_handler=None):
"""adds analysis item to database
20 changes: 12 additions & 8 deletions qiita_db/metadata_template/test/test_prep_template.py
@@ -892,7 +892,7 @@ def test_create_error_cleanup(self):

self.assertFalse(exists_table("prep_%d" % exp_id, self.conn_handler))

def _common_creation_checks(self, new_id, pt):
def _common_creation_checks(self, new_id, pt, fp_count):
# The returned object has the correct id
self.assertEqual(pt.id, new_id)

@@ -981,23 +981,25 @@ def _common_creation_checks(self, new_id, pt):
# prep and qiime files have been created
filepaths = pt.get_filepaths()
self.assertEqual(len(filepaths), 2)
self.assertEqual(filepaths[0][0], 22)
self.assertEqual(filepaths[1][0], 21)
self.assertEqual(filepaths[0][0], fp_count + 2)
self.assertEqual(filepaths[1][0], fp_count + 1)

def test_create(self):
"""Creates a new PrepTemplate"""
fp_count = get_count('qiita.filepath')
new_id = get_count('qiita.prep_template') + 1
pt = PrepTemplate.create(self.metadata, self.new_raw_data,
self.test_study, self.data_type)
self._common_creation_checks(new_id, pt)
self._common_creation_checks(new_id, pt, fp_count)

def test_create_already_prefixed_samples(self):
"""Creates a new PrepTemplate"""
fp_count = get_count('qiita.filepath')
new_id = get_count('qiita.prep_template') + 1
pt = npt.assert_warns(QiitaDBWarning, PrepTemplate.create,
self.metadata_prefixed, self.new_raw_data,
self.test_study, self.data_type)
self._common_creation_checks(new_id, pt)
self._common_creation_checks(new_id, pt, fp_count)

def test_generate_files(self):
fp_count = get_count("qiita.filepath")
@@ -1025,14 +1027,16 @@ def test_create_qiime_mapping_file(self):

def test_create_data_type_id(self):
"""Creates a new PrepTemplate passing the data_type_id"""
fp_count = get_count('qiita.filepath')
new_id = get_count('qiita.prep_template') + 1
pt = PrepTemplate.create(self.metadata, self.new_raw_data,
self.test_study, self.data_type_id)
self._common_creation_checks(new_id, pt)
self._common_creation_checks(new_id, pt, fp_count)

def test_create_warning(self):
"""Warns if a required columns is missing for a given functionality
"""
fp_count = get_count("qiita.filepath")
new_id = get_count('qiita.prep_template') + 1
del self.metadata['barcode']
pt = npt.assert_warns(QiitaDBWarning, PrepTemplate.create,
@@ -1123,8 +1127,8 @@ def test_create_warning(self):
# prep and qiime files have been created
filepaths = pt.get_filepaths()
self.assertEqual(len(filepaths), 2)
self.assertEqual(filepaths[0][0], 22)
self.assertEqual(filepaths[1][0], 21)
self.assertEqual(filepaths[0][0], fp_count + 2)
self.assertEqual(filepaths[1][0], fp_count + 1)

def test_create_investigation_type_error(self):
"""Create raises an error if the investigation_type does not exists"""
7 changes: 6 additions & 1 deletion qiita_db/support_files/populate_test_db.sql
@@ -449,4 +449,9 @@ INSERT INTO qiita.collection_users (email, collection_id) VALUES ('shared@foo.ba
INSERT INTO qiita.analysis (email, name, description, dflt, analysis_status_id) VALUES ('test@foo.bar', 'test@foo.bar-dflt', 'dflt', true, 1), ('admin@foo.bar', 'admin@foo.bar-dflt', 'dflt', true, 1), ('shared@foo.bar', 'shared@foo.bar-dflt', 'dflt', true, 1), ('demo@microbio.me', 'demo@microbio.me-dflt', 'dflt', true, 1);

-- Attach samples to analysis
INSERT INTO qiita.analysis_sample (analysis_id, processed_data_id, sample_id) VALUES (3,1,'1.SKD8.640184'), (3,1,'1.SKB7.640196'), (3,1,'1.SKM9.640192'), (3,1,'1.SKM4.640180')
INSERT INTO qiita.analysis_sample (analysis_id, processed_data_id, sample_id) VALUES (3,1,'1.SKD8.640184'), (3,1,'1.SKB7.640196'), (3,1,'1.SKM9.640192'), (3,1,'1.SKM4.640180');

-- Create the new prep_template_filepath
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES ('1_prep_1_19700101-000000.txt', 15, '3703494589', 1, 9);
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES ('1_prep_1_qiime_19700101-000000.txt', 16, '3703494589', 1, 9);
INSERT INTO qiita.prep_template_filepath VALUES (1, 19), (1, 20);
@@ -0,0 +1,4 @@
#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil Description
1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.16399999999999998 Cannabis Soil Microbiome
1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.17800000000000002 Cannabis Soil Microbiome
1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.16399999999999998 Cannabis Soil Microbiome