Skip to content

Fix metadata obj #1075

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Apr 28, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
256 changes: 66 additions & 190 deletions qiita_db/metadata_template/base_metadata_template.py

Large diffs are not rendered by default.

74 changes: 69 additions & 5 deletions qiita_db/metadata_template/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,74 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from collections import namedtuple
from future.utils import viewkeys, viewvalues

Restriction = namedtuple('Restriction', ['columns', 'error_msg'])

# A dict containing the restrictions that apply to the sample templates
SAMPLE_TEMPLATE_COLUMNS = {
# The following columns are required by EBI for submission
'EBI': Restriction(columns={'collection_timestamp': 'timestamp',
'physical_specimen_location': 'varchar'},
error_msg="EBI submission disabled"),
# The following columns are required for the official main QIITA site
'qiita_main': Restriction(columns={'sample_type': 'varchar',
'description': 'varchar',
'physical_specimen_remaining': 'bool',
'dna_extracted': 'bool',
'latitude': 'float8',
'longitude': 'float8',
'host_subject_id': 'varchar'},
error_msg="Processed data approval disabled")
}

# A dict containing the restrictions that apply to the prep templates
PREP_TEMPLATE_COLUMNS = {
# The following columns are required by EBI for submission
'EBI': Restriction(
columns={'primer': 'varchar',
'center_name': 'varchar',
'platform': 'varchar',
'library_construction_protocol': 'varchar',
'experiment_design_description': 'varchar'},
error_msg="EBI submission disabled")
}

# Different prep templates have different requirements depending on the data
# type. We create a dictionary for each of these special datatypes

TARGET_GENE_DATA_TYPES = ['16S', '18S', 'ITS']
REQUIRED_TARGET_GENE_COLS = {'barcodesequence', 'linkerprimersequence',
'run_prefix', 'library_construction_protocol',
'experiment_design_description', 'platform'}
RENAME_COLS_DICT = {'barcode': 'barcodesequence',
'primer': 'linkerprimersequence'}

PREP_TEMPLATE_COLUMNS_TARGET_GENE = {
# The following columns are required by QIIME to execute split libraries
'demultiplex': Restriction(
columns={'barcode': 'varchar',
'primer': 'varchar'},
error_msg="Demultiplexing disabled. You will not be able to "
"preprocess your raw data"),
# The following columns are required by Qiita to know how to execute split
# libraries using QIIME over a study with multiple illumina lanes
'demultiplex_multiple': Restriction(
columns={'barcode': 'varchar',
'primer': 'varchar',
'run_prefix': 'varchar'},
error_msg="Demultiplexing with multiple input files disabled. If your "
"raw data includes multiple raw input files, you will not "
"be able to preprocess your raw data")
}

# This list is useful to have if we want to loop through all the restrictions
# in a template-independent manner
ALL_RESTRICTIONS = [SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS,
PREP_TEMPLATE_COLUMNS_TARGET_GENE]


# A set holding all the controlled columns, useful to avoid recalculating it
def _col_iterator():
for r_set in ALL_RESTRICTIONS:
for restriction in viewvalues(r_set):
for cols in viewkeys(restriction.columns):
yield cols

CONTROLLED_COLS = set(col for col in _col_iterator())
113 changes: 57 additions & 56 deletions qiita_db/metadata_template/prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,26 @@
# -----------------------------------------------------------------------------

from __future__ import division
from future.utils import viewvalues
from os.path import join
from time import strftime
from copy import deepcopy
import warnings

import pandas as pd

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
QiitaDBError, QiitaDBExecutionError)
QiitaDBError, QiitaDBExecutionError,
QiitaDBWarning)
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.ontology import Ontology
from qiita_db.util import (convert_to_id,
convert_from_id, get_mountpoint, infer_status)
from .base_metadata_template import BaseSample, MetadataTemplate
from .util import load_template_to_dataframe
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
REQUIRED_TARGET_GENE_COLS)
from .constants import (TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS,
PREP_TEMPLATE_COLUMNS_TARGET_GENE)


class PrepSample(BaseSample):
Expand Down Expand Up @@ -66,8 +72,9 @@ class PrepTemplate(MetadataTemplate):
_table_prefix = "prep_"
_column_table = "prep_columns"
_id_column = "prep_template_id"
translate_cols_dict = {'emp_status_id': 'emp_status'}
_sample_cls = PrepSample
_fp_id = convert_to_id("prep_template", "filepath_type")
_filepath_table = 'prep_template_filepath'

@classmethod
def create(cls, md_template, raw_data, study, data_type,
Expand Down Expand Up @@ -116,8 +123,13 @@ def create(cls, md_template, raw_data, study, data_type,
data_type_id = convert_to_id(data_type, "data_type", conn_handler)
data_type_str = data_type

pt_cols = PREP_TEMPLATE_COLUMNS
if data_type_str in TARGET_GENE_DATA_TYPES:
pt_cols = deepcopy(PREP_TEMPLATE_COLUMNS)
pt_cols.update(PREP_TEMPLATE_COLUMNS_TARGET_GENE)

md_template = cls._clean_validate_template(md_template, study.id,
data_type_str, conn_handler)
pt_cols)

# Insert the metadata template
# We need the prep_id for multiple calls below, which currently is not
Expand All @@ -140,7 +152,7 @@ def create(cls, md_template, raw_data, study, data_type,
"{0} = %s".format(cls._id_column), (prep_id,))

# Check if sample IDs present here but not in sample template
sql = ("SELECT sample_id from qiita.required_sample_info WHERE "
sql = ("SELECT sample_id from qiita.study_sample WHERE "
"study_id = %s")
# Get list of study sample IDs, prep template study IDs,
# and their intersection
Expand Down Expand Up @@ -181,40 +193,6 @@ def validate_investigation_type(self, investigation_type):
"Choose from: %s" % (investigation_type,
', '.join(terms)))

@classmethod
def _check_template_special_columns(cls, md_template, data_type):
r"""Checks for special columns based on obj type

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
data_type : str
The data_type of the template.

Returns
-------
set
The set of missing columns

Notes
-----
Sometimes people use different names for the same columns. We just
rename them to use the naming that we expect, so this is normalized
across studies.
"""
# We only have column requirements if the data type of the raw data
# is one of the target gene types
missing_cols = set()
if data_type in TARGET_GENE_DATA_TYPES:
md_template.rename(columns=RENAME_COLS_DICT, inplace=True)

# Check for all required columns for target genes studies
missing_cols = REQUIRED_TARGET_GENE_COLS.difference(
md_template.columns)

return missing_cols

@classmethod
def delete(cls, id_):
r"""Deletes the table from the database
Expand Down Expand Up @@ -412,17 +390,11 @@ def generate_files(self):
self.add_filepath(fp)

# creating QIIME mapping file
self.create_qiime_mapping_file(fp)
self.create_qiime_mapping_file()

def create_qiime_mapping_file(self, prep_template_fp):
def create_qiime_mapping_file(self):
"""This creates the QIIME mapping file and links it in the db.

Parameters
----------
prep_template_fp : str
The prep template filepath that should be concatenated to the
sample template go used to generate a new QIIME mapping file

Returns
-------
filepath : str
Expand All @@ -432,12 +404,20 @@ def create_qiime_mapping_file(self, prep_template_fp):
------
ValueError
If the prep template is not a subset of the sample template
QiitaDBWarning
If the QIIME-required columns are not present in the template

Notes
-----
We cannot ensure that the QIIME-required columns are present in the
metadata map. However, we have to generate a QIIME-compliant mapping
file. Since the user may need a QIIME mapping file, but not these
QIIME-required columns, we are going to create them and
populate them with the value XXQIITAXX.
"""
rename_cols = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to do this? I'm not sure but it seems unnecessary at this point.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ping! I do not think we need to patch this way for this specific fields, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooops! Good call, we still need it, but not as extensive. Instead of:

rename_cols = {
    'barcode': 'BarcodeSequence',
    'barcodesequence': 'BarcodeSequence',
    'primer': 'LinkerPrimerSequence',
    'linkerprimersequence': 'LinkerPrimerSequence',
    'description': 'Description',
}

We need to use:

rename_cols = {
    'barcode': 'BarcodeSequence',
    'primer': 'LinkerPrimerSequence',
    'description': 'Description',
}

The reason is that we are using different names on the database than on QIIME. Does this seams reasonable? The other solution is to require the user to introduce those values directly. This will require changing the restrictions. What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reduced version makes more sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, while changing this I noticed that the prep template already created in the database is not fully correct. I'm changing it and it will probably require a couple of more changes in the rest of the tests.

'barcode': 'BarcodeSequence',
'barcodesequence': 'BarcodeSequence',
'primer': 'LinkerPrimerSequence',
'linkerprimersequence': 'LinkerPrimerSequence',
'description': 'Description',
}

Expand All @@ -456,19 +436,38 @@ def create_qiime_mapping_file(self, prep_template_fp):

# reading files via pandas
st = load_template_to_dataframe(sample_template_fp)
pt = load_template_to_dataframe(prep_template_fp)
pt = self.to_dataframe()

st_sample_names = set(st.index)
pt_sample_names = set(pt.index)

if not pt_sample_names.issubset(st_sample_names):
raise ValueError(
"Prep template is not a sub set of the sample template, files:"
"%s %s - samples: %s" % (sample_template_fp, prep_template_fp,
str(pt_sample_names-st_sample_names)))
"Prep template is not a sub set of the sample template, files"
"%s - samples: %s"
% (sample_template_fp,
', '.join(pt_sample_names-st_sample_names)))

mapping = pt.join(st, lsuffix="_prep")
mapping.rename(columns=rename_cols, inplace=True)

# Pre-populate the QIIME-required columns with the value XXQIITAXX
index = mapping.index
placeholder = ['XXQIITAXX'] * len(index)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably thought about this, but why not just setting an empty string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty strings are easy to miss. On previous discussions, @rob-knight suggested that any value that we populate in the templates should be make clear that it is wrong and needs to be fixed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I remember, thanks!

On (Apr-27-15|16:24), josenavas wrote:

     mapping = pt.join(st, lsuffix="_prep")
     mapping.rename(columns=rename_cols, inplace=True)
  •    # Pre-populate the QIIME-required columns with the value XXQIITAXX
    
  •    index = mapping.index
    
  •    placeholder = ['XXQIITAXX'] \* len(index)
    

Empty strings are easy to miss. On previous discussions, @rob-knight suggested that any value that we populate in the templates should be make clear that it is wrong and needs to be fixed.


Reply to this email directly or view it on GitHub:
https://github.com/biocore/qiita/pull/1075/files#r29202141

missing = []
for val in viewvalues(rename_cols):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it would make sense to raise a warning in this case? The warning would make a note of the columns that would have place-holders.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, adding.

if val not in mapping:
missing.append(val)
mapping[val] = pd.Series(placeholder, index=index)

if missing:
warnings.warn(
"Some columns required to generate a QIIME-compliant mapping "
"file are not present in the template. A placeholder value "
"(XXQIITAXX) has been used to populate these columns. Missing "
"columns: %s" % ', '.join(missing),
QiitaDBWarning)

# Gets the orginal mapping columns and readjust the order to comply
# with QIIME requirements
cols = mapping.columns.values.tolist()
Expand All @@ -486,11 +485,13 @@ def create_qiime_mapping_file(self, prep_template_fp):
self.id, strftime("%Y%m%d-%H%M%S")))

# Save the mapping file
mapping.to_csv(filepath, index_label='#SampleID', na_rep='unknown',
mapping.to_csv(filepath, index_label='#SampleID', na_rep='',
sep='\t')

# adding the fp to the object
self.add_filepath(filepath)
self.add_filepath(
filepath, conn_handler=conn_handler,
fp_id=convert_to_id("qiime_map", "filepath_type"))

return filepath

Expand Down
27 changes: 7 additions & 20 deletions qiita_db/metadata_template/sample_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
QiitaDBUnknownIDError)
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.util import get_required_sample_info_status, get_mountpoint
from qiita_db.util import get_mountpoint, convert_to_id
from qiita_db.study import Study
from qiita_db.data import RawData
from .base_metadata_template import BaseSample, MetadataTemplate
from .prep_template import PrepTemplate
from .constants import SAMPLE_TEMPLATE_COLUMNS


class Sample(BaseSample):
Expand Down Expand Up @@ -66,9 +67,9 @@ class SampleTemplate(MetadataTemplate):
_table_prefix = "sample_"
_column_table = "study_sample_columns"
_id_column = "study_id"
translate_cols_dict = {
'required_sample_info_status_id': 'required_sample_info_status'}
_sample_cls = Sample
_fp_id = convert_to_id("sample_template", "filepath_type")
_filepath_table = 'sample_template_filepath'

@staticmethod
def metadata_headers():
Expand All @@ -87,19 +88,6 @@ def metadata_headers():
"WHERE table_name = 'required_sample_info' "
"ORDER BY column_name")]

@classmethod
def _check_template_special_columns(cls, md_template, study_id):
r"""Checks for special columns based on obj type

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
study_id : int
The study to which the sample template belongs to.
"""
return set()

@classmethod
def create(cls, md_template, study):
r"""Creates the sample template in the database
Expand All @@ -123,7 +111,7 @@ def create(cls, md_template, study):

# Clean and validate the metadata template given
md_template = cls._clean_validate_template(md_template, study.id,
study.id, conn_handler)
SAMPLE_TEMPLATE_COLUMNS)

cls._add_common_creation_steps_to_queue(md_template, study.id,
conn_handler, queue_name)
Expand Down Expand Up @@ -233,8 +221,7 @@ def extend(self, md_template):
conn_handler.create_queue(queue_name)

md_template = self._clean_validate_template(md_template, self.study_id,
self.study_id,
conn_handler)
SAMPLE_TEMPLATE_COLUMNS)

self._add_common_extend_steps_to_queue(md_template, conn_handler,
queue_name)
Expand All @@ -260,7 +247,7 @@ def update(self, md_template):

# Clean and validate the metadata template given
new_map = self._clean_validate_template(md_template, self.id,
conn_handler)
SAMPLE_TEMPLATE_COLUMNS)
# Retrieving current metadata
current_map = self._transform_to_dict(conn_handler.execute_fetchall(
"SELECT * FROM qiita.{0} WHERE {1}=%s".format(self._table,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_add_common_creation_steps_to_queue(self):
def test_clean_validate_template(self):
"""_clean_validate_template raises an error from base class"""
with self.assertRaises(IncompetentQiitaDeveloperError):
MetadataTemplate._clean_validate_template(None, 1, None, None)
MetadataTemplate._clean_validate_template(None, 1, None)


if __name__ == '__main__':
Expand Down
Loading