Skip to content

Commit 79256ff

Browse files
committed
Merge pull request #1075 from josenavas/fix-metadata-obj
Fix metadata obj
2 parents 0cfab56 + 2d3df08 commit 79256ff

File tree

12 files changed

+1222
-1154
lines changed

12 files changed

+1222
-1154
lines changed

qiita_db/metadata_template/base_metadata_template.py

Lines changed: 66 additions & 190 deletions
Large diffs are not rendered by default.

qiita_db/metadata_template/constants.py

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,74 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# -----------------------------------------------------------------------------
88

9+
from collections import namedtuple
10+
from future.utils import viewkeys, viewvalues
11+
12+
Restriction = namedtuple('Restriction', ['columns', 'error_msg'])
13+
14+
# A dict containing the restrictions that apply to the sample templates
15+
SAMPLE_TEMPLATE_COLUMNS = {
16+
# The following columns are required by EBI for submission
17+
'EBI': Restriction(columns={'collection_timestamp': 'timestamp',
18+
'physical_specimen_location': 'varchar'},
19+
error_msg="EBI submission disabled"),
20+
# The following columns are required for the official main QIITA site
21+
'qiita_main': Restriction(columns={'sample_type': 'varchar',
22+
'description': 'varchar',
23+
'physical_specimen_remaining': 'bool',
24+
'dna_extracted': 'bool',
25+
'latitude': 'float8',
26+
'longitude': 'float8',
27+
'host_subject_id': 'varchar'},
28+
error_msg="Processed data approval disabled")
29+
}
30+
31+
# A dict containing the restrictions that apply to the prep templates
32+
PREP_TEMPLATE_COLUMNS = {
33+
# The following columns are required by EBI for submission
34+
'EBI': Restriction(
35+
columns={'primer': 'varchar',
36+
'center_name': 'varchar',
37+
'platform': 'varchar',
38+
'library_construction_protocol': 'varchar',
39+
'experiment_design_description': 'varchar'},
40+
error_msg="EBI submission disabled")
41+
}
42+
43+
# Different prep templates have different requirements depending on the data
44+
# type. We create a dictionary for each of these special datatypes
945

1046
TARGET_GENE_DATA_TYPES = ['16S', '18S', 'ITS']
11-
REQUIRED_TARGET_GENE_COLS = {'barcodesequence', 'linkerprimersequence',
12-
'run_prefix', 'library_construction_protocol',
13-
'experiment_design_description', 'platform'}
14-
RENAME_COLS_DICT = {'barcode': 'barcodesequence',
15-
'primer': 'linkerprimersequence'}
47+
48+
PREP_TEMPLATE_COLUMNS_TARGET_GENE = {
49+
# The following columns are required by QIIME to execute split libraries
50+
'demultiplex': Restriction(
51+
columns={'barcode': 'varchar',
52+
'primer': 'varchar'},
53+
error_msg="Demultiplexing disabled. You will not be able to "
54+
"preprocess your raw data"),
55+
# The following columns are required by Qiita to know how to execute split
56+
# libraries using QIIME over a study with multiple illumina lanes
57+
'demultiplex_multiple': Restriction(
58+
columns={'barcode': 'varchar',
59+
'primer': 'varchar',
60+
'run_prefix': 'varchar'},
61+
error_msg="Demultiplexing with multiple input files disabled. If your "
62+
"raw data includes multiple raw input files, you will not "
63+
"be able to preprocess your raw data")
64+
}
65+
66+
# This list is useful to have if we want to loop through all the restrictions
67+
# in a template-independent manner
68+
ALL_RESTRICTIONS = [SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS,
69+
PREP_TEMPLATE_COLUMNS_TARGET_GENE]
70+
71+
72+
# Build a set holding all the controlled columns once, to avoid recalculating it
73+
def _col_iterator():
74+
for r_set in ALL_RESTRICTIONS:
75+
for restriction in viewvalues(r_set):
76+
for cols in viewkeys(restriction.columns):
77+
yield cols
78+
79+
CONTROLLED_COLS = set(col for col in _col_iterator())

qiita_db/metadata_template/prep_template.py

Lines changed: 57 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,26 @@
77
# -----------------------------------------------------------------------------
88

99
from __future__ import division
10+
from future.utils import viewvalues
1011
from os.path import join
1112
from time import strftime
13+
from copy import deepcopy
14+
import warnings
15+
16+
import pandas as pd
1217

1318
from qiita_core.exceptions import IncompetentQiitaDeveloperError
1419
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
15-
QiitaDBError, QiitaDBExecutionError)
20+
QiitaDBError, QiitaDBExecutionError,
21+
QiitaDBWarning)
1622
from qiita_db.sql_connection import SQLConnectionHandler
1723
from qiita_db.ontology import Ontology
1824
from qiita_db.util import (convert_to_id,
1925
convert_from_id, get_mountpoint, infer_status)
2026
from .base_metadata_template import BaseSample, MetadataTemplate
2127
from .util import load_template_to_dataframe
22-
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
23-
REQUIRED_TARGET_GENE_COLS)
28+
from .constants import (TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS,
29+
PREP_TEMPLATE_COLUMNS_TARGET_GENE)
2430

2531

2632
class PrepSample(BaseSample):
@@ -66,8 +72,9 @@ class PrepTemplate(MetadataTemplate):
6672
_table_prefix = "prep_"
6773
_column_table = "prep_columns"
6874
_id_column = "prep_template_id"
69-
translate_cols_dict = {'emp_status_id': 'emp_status'}
7075
_sample_cls = PrepSample
76+
_fp_id = convert_to_id("prep_template", "filepath_type")
77+
_filepath_table = 'prep_template_filepath'
7178

7279
@classmethod
7380
def create(cls, md_template, raw_data, study, data_type,
@@ -116,8 +123,13 @@ def create(cls, md_template, raw_data, study, data_type,
116123
data_type_id = convert_to_id(data_type, "data_type", conn_handler)
117124
data_type_str = data_type
118125

126+
pt_cols = PREP_TEMPLATE_COLUMNS
127+
if data_type_str in TARGET_GENE_DATA_TYPES:
128+
pt_cols = deepcopy(PREP_TEMPLATE_COLUMNS)
129+
pt_cols.update(PREP_TEMPLATE_COLUMNS_TARGET_GENE)
130+
119131
md_template = cls._clean_validate_template(md_template, study.id,
120-
data_type_str, conn_handler)
132+
pt_cols)
121133

122134
# Insert the metadata template
123135
# We need the prep_id for multiple calls below, which currently is not
@@ -140,7 +152,7 @@ def create(cls, md_template, raw_data, study, data_type,
140152
"{0} = %s".format(cls._id_column), (prep_id,))
141153

142154
# Check if sample IDs present here but not in sample template
143-
sql = ("SELECT sample_id from qiita.required_sample_info WHERE "
155+
sql = ("SELECT sample_id from qiita.study_sample WHERE "
144156
"study_id = %s")
145157
# Get list of study sample IDs, prep template study IDs,
146158
# and their intersection
@@ -181,40 +193,6 @@ def validate_investigation_type(self, investigation_type):
181193
"Choose from: %s" % (investigation_type,
182194
', '.join(terms)))
183195

184-
@classmethod
185-
def _check_template_special_columns(cls, md_template, data_type):
186-
r"""Checks for special columns based on obj type
187-
188-
Parameters
189-
----------
190-
md_template : DataFrame
191-
The metadata template file contents indexed by sample ids
192-
data_type : str
193-
The data_type of the template.
194-
195-
Returns
196-
-------
197-
set
198-
The set of missing columns
199-
200-
Notes
201-
-----
202-
Sometimes people use different names for the same columns. We just
203-
rename them to use the naming that we expect, so this is normalized
204-
across studies.
205-
"""
206-
# We only have column requirements if the data type of the raw data
207-
# is one of the target gene types
208-
missing_cols = set()
209-
if data_type in TARGET_GENE_DATA_TYPES:
210-
md_template.rename(columns=RENAME_COLS_DICT, inplace=True)
211-
212-
# Check for all required columns for target genes studies
213-
missing_cols = REQUIRED_TARGET_GENE_COLS.difference(
214-
md_template.columns)
215-
216-
return missing_cols
217-
218196
@classmethod
219197
def delete(cls, id_):
220198
r"""Deletes the table from the database
@@ -412,17 +390,11 @@ def generate_files(self):
412390
self.add_filepath(fp)
413391

414392
# creating QIIME mapping file
415-
self.create_qiime_mapping_file(fp)
393+
self.create_qiime_mapping_file()
416394

417-
def create_qiime_mapping_file(self, prep_template_fp):
395+
def create_qiime_mapping_file(self):
418396
"""This creates the QIIME mapping file and links it in the db.
419397
420-
Parameters
421-
----------
422-
prep_template_fp : str
423-
The prep template filepath that should be concatenated to the
424-
sample template go used to generate a new QIIME mapping file
425-
426398
Returns
427399
-------
428400
filepath : str
@@ -432,12 +404,20 @@ def create_qiime_mapping_file(self, prep_template_fp):
432404
------
433405
ValueError
434406
If the prep template is not a subset of the sample template
407+
QiitaDBWarning
408+
If the QIIME-required columns are not present in the template
409+
410+
Notes
411+
-----
412+
We cannot ensure that the QIIME-required columns are present in the
413+
metadata map. However, we have to generate a QIIME-compliant mapping
414+
file. Since the user may need a QIIME mapping file, but not these
415+
QIIME-required columns, we are going to create them and
416+
populate them with the value XXQIITAXX.
435417
"""
436418
rename_cols = {
437419
'barcode': 'BarcodeSequence',
438-
'barcodesequence': 'BarcodeSequence',
439420
'primer': 'LinkerPrimerSequence',
440-
'linkerprimersequence': 'LinkerPrimerSequence',
441421
'description': 'Description',
442422
}
443423

@@ -456,19 +436,38 @@ def create_qiime_mapping_file(self, prep_template_fp):
456436

457437
# reading files via pandas
458438
st = load_template_to_dataframe(sample_template_fp)
459-
pt = load_template_to_dataframe(prep_template_fp)
439+
pt = self.to_dataframe()
440+
460441
st_sample_names = set(st.index)
461442
pt_sample_names = set(pt.index)
462443

463444
if not pt_sample_names.issubset(st_sample_names):
464445
raise ValueError(
465-
"Prep template is not a sub set of the sample template, files:"
466-
"%s %s - samples: %s" % (sample_template_fp, prep_template_fp,
467-
str(pt_sample_names-st_sample_names)))
446+
"Prep template is not a sub set of the sample template, files"
447+
"%s - samples: %s"
448+
% (sample_template_fp,
449+
', '.join(pt_sample_names-st_sample_names)))
468450

469451
mapping = pt.join(st, lsuffix="_prep")
470452
mapping.rename(columns=rename_cols, inplace=True)
471453

454+
# Pre-populate the QIIME-required columns with the value XXQIITAXX
455+
index = mapping.index
456+
placeholder = ['XXQIITAXX'] * len(index)
457+
missing = []
458+
for val in viewvalues(rename_cols):
459+
if val not in mapping:
460+
missing.append(val)
461+
mapping[val] = pd.Series(placeholder, index=index)
462+
463+
if missing:
464+
warnings.warn(
465+
"Some columns required to generate a QIIME-compliant mapping "
466+
"file are not present in the template. A placeholder value "
467+
"(XXQIITAXX) has been used to populate these columns. Missing "
468+
"columns: %s" % ', '.join(missing),
469+
QiitaDBWarning)
470+
472471
# Gets the original mapping columns and readjusts the order to comply
473472
# with QIIME requirements
474473
cols = mapping.columns.values.tolist()
@@ -486,11 +485,13 @@ def create_qiime_mapping_file(self, prep_template_fp):
486485
self.id, strftime("%Y%m%d-%H%M%S")))
487486

488487
# Save the mapping file
489-
mapping.to_csv(filepath, index_label='#SampleID', na_rep='unknown',
488+
mapping.to_csv(filepath, index_label='#SampleID', na_rep='',
490489
sep='\t')
491490

492491
# adding the fp to the object
493-
self.add_filepath(filepath)
492+
self.add_filepath(
493+
filepath, conn_handler=conn_handler,
494+
fp_id=convert_to_id("qiime_map", "filepath_type"))
494495

495496
return filepath
496497

qiita_db/metadata_template/sample_template.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
1717
QiitaDBUnknownIDError)
1818
from qiita_db.sql_connection import SQLConnectionHandler
19-
from qiita_db.util import get_required_sample_info_status, get_mountpoint
19+
from qiita_db.util import get_mountpoint, convert_to_id
2020
from qiita_db.study import Study
2121
from qiita_db.data import RawData
2222
from .base_metadata_template import BaseSample, MetadataTemplate
2323
from .prep_template import PrepTemplate
24+
from .constants import SAMPLE_TEMPLATE_COLUMNS
2425

2526

2627
class Sample(BaseSample):
@@ -66,9 +67,9 @@ class SampleTemplate(MetadataTemplate):
6667
_table_prefix = "sample_"
6768
_column_table = "study_sample_columns"
6869
_id_column = "study_id"
69-
translate_cols_dict = {
70-
'required_sample_info_status_id': 'required_sample_info_status'}
7170
_sample_cls = Sample
71+
_fp_id = convert_to_id("sample_template", "filepath_type")
72+
_filepath_table = 'sample_template_filepath'
7273

7374
@staticmethod
7475
def metadata_headers():
@@ -87,19 +88,6 @@ def metadata_headers():
8788
"WHERE table_name = 'required_sample_info' "
8889
"ORDER BY column_name")]
8990

90-
@classmethod
91-
def _check_template_special_columns(cls, md_template, study_id):
92-
r"""Checks for special columns based on obj type
93-
94-
Parameters
95-
----------
96-
md_template : DataFrame
97-
The metadata template file contents indexed by sample ids
98-
study_id : int
99-
The study to which the sample template belongs to.
100-
"""
101-
return set()
102-
10391
@classmethod
10492
def create(cls, md_template, study):
10593
r"""Creates the sample template in the database
@@ -123,7 +111,7 @@ def create(cls, md_template, study):
123111

124112
# Clean and validate the metadata template given
125113
md_template = cls._clean_validate_template(md_template, study.id,
126-
study.id, conn_handler)
114+
SAMPLE_TEMPLATE_COLUMNS)
127115

128116
cls._add_common_creation_steps_to_queue(md_template, study.id,
129117
conn_handler, queue_name)
@@ -233,8 +221,7 @@ def extend(self, md_template):
233221
conn_handler.create_queue(queue_name)
234222

235223
md_template = self._clean_validate_template(md_template, self.study_id,
236-
self.study_id,
237-
conn_handler)
224+
SAMPLE_TEMPLATE_COLUMNS)
238225

239226
self._add_common_extend_steps_to_queue(md_template, conn_handler,
240227
queue_name)
@@ -260,7 +247,7 @@ def update(self, md_template):
260247

261248
# Clean and validate the metadata template given
262249
new_map = self._clean_validate_template(md_template, self.id,
263-
conn_handler)
250+
SAMPLE_TEMPLATE_COLUMNS)
264251
# Retrieving current metadata
265252
current_map = self._transform_to_dict(conn_handler.execute_fetchall(
266253
"SELECT * FROM qiita.{0} WHERE {1}=%s".format(self._table,

qiita_db/metadata_template/test/test_base_metadata_template.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def test_add_common_creation_steps_to_queue(self):
6060
def test_clean_validate_template(self):
6161
"""_clean_validate_template raises an error from base class"""
6262
with self.assertRaises(IncompetentQiitaDeveloperError):
63-
MetadataTemplate._clean_validate_template(None, 1, None, None)
63+
MetadataTemplate._clean_validate_template(None, 1, None)
6464

6565

6666
if __name__ == '__main__':

0 commit comments

Comments
 (0)