Skip to content

Commit b604ae1

Browse files
committed
Merge pull request #1106 from josenavas/fix-analysis-tests
Fix analysis tests
2 parents b26a867 + 8be914f commit b604ae1

File tree

10 files changed

+143
-132
lines changed

10 files changed

+143
-132
lines changed

qiita_db/analysis.py

Lines changed: 55 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,17 @@
2424
from future.utils import viewitems
2525
from biom import load_table
2626
from biom.util import biom_open
27+
import pandas as pd
28+
from skbio.util import find_duplicates
2729

2830
from qiita_core.exceptions import IncompetentQiitaDeveloperError
2931
from .sql_connection import SQLConnectionHandler
3032
from .base import QiitaStatusObject
31-
from .data import ProcessedData, RawData
33+
from .data import ProcessedData
3234
from .study import Study
33-
from .exceptions import QiitaDBStatusError # QiitaDBNotImplementedError
35+
from .exceptions import QiitaDBStatusError, QiitaDBError
3436
from .util import (convert_to_id, get_work_base_dir,
35-
get_mountpoint, get_table_cols, insert_filepaths)
37+
get_mountpoint, insert_filepaths)
3638

3739

3840
class Analysis(QiitaStatusObject):
@@ -719,81 +721,61 @@ def _build_mapping_file(self, samples, conn_handler=None):
719721
Code modified slightly from qiime.util.MetadataMap.__add__"""
720722
conn_handler = conn_handler if conn_handler is not None \
721723
else SQLConnectionHandler()
722-
# We will keep track of all unique sample_ids and metadata headers
723-
# we have seen as we go, as well as studies already seen
724+
724725
all_sample_ids = set()
725-
all_headers = set(get_table_cols("required_sample_info", conn_handler))
726-
all_studies = set()
726+
sql = """SELECT filepath_id, filepath
727+
FROM qiita.filepath
728+
JOIN qiita.prep_template_filepath USING (filepath_id)
729+
JOIN qiita.prep_template_preprocessed_data
730+
USING (prep_template_id)
731+
JOIN qiita.preprocessed_processed_data
732+
USING (preprocessed_data_id)
733+
JOIN qiita.filepath_type USING (filepath_type_id)
734+
WHERE processed_data_id = %s
735+
AND filepath_type = 'qiime_map'
736+
ORDER BY filepath_id DESC"""
737+
_id, fp = get_mountpoint('templates')[0]
738+
to_concat = []
727739

728-
merged_data = defaultdict(lambda: defaultdict(lambda: None))
729740
for pid, samples in viewitems(samples):
730-
if any([all_sample_ids.intersection(samples),
731-
len(set(samples)) != len(samples)]):
732-
# duplicate samples so raise error
733-
raise ValueError("Duplicate sample ids found: %s" %
734-
str(all_sample_ids.intersection(samples)))
735-
all_sample_ids.update(samples)
736-
study_id = ProcessedData(pid).study
737-
738-
# create a convenience study object
739-
s = Study(study_id)
740-
741-
# get the ids to retrieve the data from the sample and prep tables
742-
sample_template_id = s.sample_template
743-
# you can have multiple different prep templates but we are only
744-
# using the one for 16S i. e. the last one ... sorry ;l
745-
# see issue https://github.com/biocore/qiita/issues/465
746-
prep_template_id = RawData(s.raw_data()[0]).prep_templates[-1]
747-
748-
if study_id in all_studies:
749-
# samples already added by other processed data file
750-
# with the study_id
751-
continue
752-
all_studies.add(study_id)
753-
# add headers to set of all headers found
754-
all_headers.update(get_table_cols("sample_%d" % sample_template_id,
755-
conn_handler))
756-
all_headers.update(get_table_cols("prep_%d" % prep_template_id,
757-
conn_handler))
758-
# NEED TO ADD COMMON PREP INFO Issue #247
759-
sql = ("SELECT rs.*, p.*, ss.* "
760-
"FROM qiita.required_sample_info rs JOIN qiita.sample_{0} "
761-
"ss USING(sample_id) JOIN qiita.prep_{1} p USING(sample_id)"
762-
" WHERE rs.sample_id IN {2} AND rs.study_id = {3}".format(
763-
sample_template_id, prep_template_id,
764-
"(%s)" % ",".join("'%s'" % s for s in samples),
765-
study_id))
766-
metadata = conn_handler.execute_fetchall(sql)
767-
# add all the metadata to merged_data
768-
for data in metadata:
769-
sample_id = data['sample_id']
770-
for header, value in viewitems(data):
771-
if header in {'sample_id'}:
772-
continue
773-
merged_data[sample_id][header] = str(value)
774-
775-
# prep headers, making sure they follow mapping file format rules
776-
all_headers = list(all_headers - {'linkerprimersequence',
777-
'barcodesequence', 'description', 'sample_id'})
778-
all_headers.sort()
779-
all_headers = ['BarcodeSequence', 'LinkerPrimerSequence'] + all_headers
780-
all_headers.append('Description')
781-
782-
# write mapping file out
741+
if len(samples) != len(set(samples)):
742+
duplicates = find_duplicates(samples)
743+
raise QiitaDBError("Duplicate sample ids found: %s"
744+
% ', '.join(duplicates))
745+
# Get the QIIME mapping file
746+
qiime_map_fp = conn_handler.execute_fetchall(sql, (pid,))[0][1]
747+
# Parse the mapping file
748+
qiime_map = pd.read_csv(
749+
join(fp, qiime_map_fp), sep='\t', keep_default_na=False,
750+
na_values=['unknown'], index_col=False,
751+
converters=defaultdict(lambda: str))
752+
qiime_map.set_index('#SampleID', inplace=True, drop=True)
753+
qiime_map = qiime_map.loc[samples]
754+
755+
duplicates = all_sample_ids.intersection(qiime_map.index)
756+
if duplicates or len(samples) != len(set(samples)):
757+
# Duplicate samples so raise error
758+
raise QiitaDBError("Duplicate sample ids found: %s"
759+
% ', '.join(duplicates))
760+
all_sample_ids.update(qiime_map.index)
761+
to_concat.append(qiime_map)
762+
763+
merged_map = pd.concat(to_concat)
764+
765+
cols = merged_map.columns.values.tolist()
766+
cols.remove('BarcodeSequence')
767+
cols.remove('LinkerPrimerSequence')
768+
cols.remove('Description')
769+
new_cols = ['BarcodeSequence', 'LinkerPrimerSequence']
770+
new_cols.extend(cols)
771+
new_cols.append('Description')
772+
merged_map = merged_map[new_cols]
773+
774+
# Save the mapping file
783775
_, base_fp = get_mountpoint(self._table)[0]
784776
mapping_fp = join(base_fp, "%d_analysis_mapping.txt" % self._id)
785-
with open(mapping_fp, 'w') as f:
786-
f.write("#SampleID\t%s\n" % '\t'.join(all_headers))
787-
for sample, metadata in viewitems(merged_data):
788-
data = [sample]
789-
for header in all_headers:
790-
l_head = header.lower()
791-
data.append(metadata[l_head] if
792-
metadata[l_head] is not None else "no_data")
793-
f.write("%s\n" % "\t".join(data))
794-
795-
self._add_file("%d_analysis_mapping.txt" % self._id,
796-
"plain_text", conn_handler=conn_handler)
777+
merged_map.to_csv(mapping_fp, index_label='#SampleID',
778+
na_rep='unknown', sep='\t')
797779

798780
def _add_file(self, filename, filetype, data_type=None, conn_handler=None):
799781
"""adds analysis item to database

qiita_db/metadata_template/test/test_prep_template.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ def test_create_error_cleanup(self):
892892

893893
self.assertFalse(exists_table("prep_%d" % exp_id, self.conn_handler))
894894

895-
def _common_creation_checks(self, new_id, pt):
895+
def _common_creation_checks(self, new_id, pt, fp_count):
896896
# The returned object has the correct id
897897
self.assertEqual(pt.id, new_id)
898898

@@ -981,23 +981,25 @@ def _common_creation_checks(self, new_id, pt):
981981
# prep and qiime files have been created
982982
filepaths = pt.get_filepaths()
983983
self.assertEqual(len(filepaths), 2)
984-
self.assertEqual(filepaths[0][0], 22)
985-
self.assertEqual(filepaths[1][0], 21)
984+
self.assertEqual(filepaths[0][0], fp_count + 2)
985+
self.assertEqual(filepaths[1][0], fp_count + 1)
986986

987987
def test_create(self):
988988
"""Creates a new PrepTemplate"""
989+
fp_count = get_count('qiita.filepath')
989990
new_id = get_count('qiita.prep_template') + 1
990991
pt = PrepTemplate.create(self.metadata, self.new_raw_data,
991992
self.test_study, self.data_type)
992-
self._common_creation_checks(new_id, pt)
993+
self._common_creation_checks(new_id, pt, fp_count)
993994

994995
def test_create_already_prefixed_samples(self):
995996
"""Creates a new PrepTemplate"""
997+
fp_count = get_count('qiita.filepath')
996998
new_id = get_count('qiita.prep_template') + 1
997999
pt = npt.assert_warns(QiitaDBWarning, PrepTemplate.create,
9981000
self.metadata_prefixed, self.new_raw_data,
9991001
self.test_study, self.data_type)
1000-
self._common_creation_checks(new_id, pt)
1002+
self._common_creation_checks(new_id, pt, fp_count)
10011003

10021004
def test_generate_files(self):
10031005
fp_count = get_count("qiita.filepath")
@@ -1025,14 +1027,16 @@ def test_create_qiime_mapping_file(self):
10251027

10261028
def test_create_data_type_id(self):
10271029
"""Creates a new PrepTemplate passing the data_type_id"""
1030+
fp_count = get_count('qiita.filepath')
10281031
new_id = get_count('qiita.prep_template') + 1
10291032
pt = PrepTemplate.create(self.metadata, self.new_raw_data,
10301033
self.test_study, self.data_type_id)
1031-
self._common_creation_checks(new_id, pt)
1034+
self._common_creation_checks(new_id, pt, fp_count)
10321035

10331036
def test_create_warning(self):
10341037
"""Warns if a required column is missing for a given functionality
10351038
"""
1039+
fp_count = get_count("qiita.filepath")
10361040
new_id = get_count('qiita.prep_template') + 1
10371041
del self.metadata['barcode']
10381042
pt = npt.assert_warns(QiitaDBWarning, PrepTemplate.create,
@@ -1123,8 +1127,8 @@ def test_create_warning(self):
11231127
# prep and qiime files have been created
11241128
filepaths = pt.get_filepaths()
11251129
self.assertEqual(len(filepaths), 2)
1126-
self.assertEqual(filepaths[0][0], 22)
1127-
self.assertEqual(filepaths[1][0], 21)
1130+
self.assertEqual(filepaths[0][0], fp_count + 2)
1131+
self.assertEqual(filepaths[1][0], fp_count + 1)
11281132

11291133
def test_create_investigation_type_error(self):
11301134
"""Create raises an error if the investigation_type does not exist"""

qiita_db/support_files/populate_test_db.sql

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,4 +449,9 @@ INSERT INTO qiita.collection_users (email, collection_id) VALUES ('shared@foo.ba
449449
INSERT INTO qiita.analysis (email, name, description, dflt, analysis_status_id) VALUES ('test@foo.bar', 'test@foo.bar-dflt', 'dflt', true, 1), ('admin@foo.bar', 'admin@foo.bar-dflt', 'dflt', true, 1), ('shared@foo.bar', 'shared@foo.bar-dflt', 'dflt', true, 1), ('demo@microbio.me', 'demo@microbio.me-dflt', 'dflt', true, 1);
450450

451451
-- Attach samples to analysis
452-
INSERT INTO qiita.analysis_sample (analysis_id, processed_data_id, sample_id) VALUES (3,1,'1.SKD8.640184'), (3,1,'1.SKB7.640196'), (3,1,'1.SKM9.640192'), (3,1,'1.SKM4.640180')
452+
INSERT INTO qiita.analysis_sample (analysis_id, processed_data_id, sample_id) VALUES (3,1,'1.SKD8.640184'), (3,1,'1.SKB7.640196'), (3,1,'1.SKM9.640192'), (3,1,'1.SKM4.640180');
453+
454+
-- Create the new prep_template_filepath
455+
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES ('1_prep_1_19700101-000000.txt', 15, '3703494589', 1, 9);
456+
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES ('1_prep_1_qiime_19700101-000000.txt', 16, '3703494589', 1, 9);
457+
INSERT INTO qiita.prep_template_filepath VALUES (1, 19), (1, 20);
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil Description
2+
1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.16399999999999998 Cannabis Soil Microbiome
3+
1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.17800000000000002 Cannabis Soil Microbiome
4+
1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.16399999999999998 Cannabis Soil Microbiome

0 commit comments

Comments
 (0)