-
Notifications
You must be signed in to change notification settings - Fork 80
Fix metadata obj #1075
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix metadata obj #1075
Changes from all commits
8ec3b27
a13a7b8
c64ab6c
5f23a55
ef284f3
694f4ab
3728f63
b145fca
78f763a
013d126
ba49dc5
57ed605
2a29012
62f0d3e
f81caa9
134678d
6aa5335
1cc26e9
2d3df08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,20 +7,26 @@ | |
# ----------------------------------------------------------------------------- | ||
|
||
from __future__ import division | ||
from future.utils import viewvalues | ||
from os.path import join | ||
from time import strftime | ||
from copy import deepcopy | ||
import warnings | ||
|
||
import pandas as pd | ||
|
||
from qiita_core.exceptions import IncompetentQiitaDeveloperError | ||
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError, | ||
QiitaDBError, QiitaDBExecutionError) | ||
QiitaDBError, QiitaDBExecutionError, | ||
QiitaDBWarning) | ||
from qiita_db.sql_connection import SQLConnectionHandler | ||
from qiita_db.ontology import Ontology | ||
from qiita_db.util import (convert_to_id, | ||
convert_from_id, get_mountpoint, infer_status) | ||
from .base_metadata_template import BaseSample, MetadataTemplate | ||
from .util import load_template_to_dataframe | ||
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT, | ||
REQUIRED_TARGET_GENE_COLS) | ||
from .constants import (TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS, | ||
PREP_TEMPLATE_COLUMNS_TARGET_GENE) | ||
|
||
|
||
class PrepSample(BaseSample): | ||
|
@@ -66,8 +72,9 @@ class PrepTemplate(MetadataTemplate): | |
_table_prefix = "prep_" | ||
_column_table = "prep_columns" | ||
_id_column = "prep_template_id" | ||
translate_cols_dict = {'emp_status_id': 'emp_status'} | ||
_sample_cls = PrepSample | ||
_fp_id = convert_to_id("prep_template", "filepath_type") | ||
_filepath_table = 'prep_template_filepath' | ||
|
||
@classmethod | ||
def create(cls, md_template, raw_data, study, data_type, | ||
|
@@ -116,8 +123,13 @@ def create(cls, md_template, raw_data, study, data_type, | |
data_type_id = convert_to_id(data_type, "data_type", conn_handler) | ||
data_type_str = data_type | ||
|
||
pt_cols = PREP_TEMPLATE_COLUMNS | ||
if data_type_str in TARGET_GENE_DATA_TYPES: | ||
pt_cols = deepcopy(PREP_TEMPLATE_COLUMNS) | ||
pt_cols.update(PREP_TEMPLATE_COLUMNS_TARGET_GENE) | ||
|
||
md_template = cls._clean_validate_template(md_template, study.id, | ||
data_type_str, conn_handler) | ||
pt_cols) | ||
|
||
# Insert the metadata template | ||
# We need the prep_id for multiple calls below, which currently is not | ||
|
@@ -140,7 +152,7 @@ def create(cls, md_template, raw_data, study, data_type, | |
"{0} = %s".format(cls._id_column), (prep_id,)) | ||
|
||
# Check if sample IDs present here but not in sample template | ||
sql = ("SELECT sample_id from qiita.required_sample_info WHERE " | ||
sql = ("SELECT sample_id from qiita.study_sample WHERE " | ||
"study_id = %s") | ||
# Get list of study sample IDs, prep template study IDs, | ||
# and their intersection | ||
|
@@ -181,40 +193,6 @@ def validate_investigation_type(self, investigation_type): | |
"Choose from: %s" % (investigation_type, | ||
', '.join(terms))) | ||
|
||
@classmethod | ||
def _check_template_special_columns(cls, md_template, data_type): | ||
r"""Checks for special columns based on obj type | ||
|
||
Parameters | ||
---------- | ||
md_template : DataFrame | ||
The metadata template file contents indexed by sample ids | ||
data_type : str | ||
The data_type of the template. | ||
|
||
Returns | ||
------- | ||
set | ||
The set of missing columns | ||
|
||
Notes | ||
----- | ||
Sometimes people use different names for the same columns. We just | ||
rename them to use the naming that we expect, so this is normalized | ||
across studies. | ||
""" | ||
# We only have column requirements if the data type of the raw data | ||
# is one of the target gene types | ||
missing_cols = set() | ||
if data_type in TARGET_GENE_DATA_TYPES: | ||
md_template.rename(columns=RENAME_COLS_DICT, inplace=True) | ||
|
||
# Check for all required columns for target genes studies | ||
missing_cols = REQUIRED_TARGET_GENE_COLS.difference( | ||
md_template.columns) | ||
|
||
return missing_cols | ||
|
||
@classmethod | ||
def delete(cls, id_): | ||
r"""Deletes the table from the database | ||
|
@@ -412,17 +390,11 @@ def generate_files(self): | |
self.add_filepath(fp) | ||
|
||
# creating QIIME mapping file | ||
self.create_qiime_mapping_file(fp) | ||
self.create_qiime_mapping_file() | ||
|
||
def create_qiime_mapping_file(self, prep_template_fp): | ||
def create_qiime_mapping_file(self): | ||
"""This creates the QIIME mapping file and links it in the db. | ||
|
||
Parameters | ||
---------- | ||
prep_template_fp : str | ||
The prep template filepath that should be concatenated to the | ||
sample template go used to generate a new QIIME mapping file | ||
|
||
Returns | ||
------- | ||
filepath : str | ||
|
@@ -432,12 +404,20 @@ def create_qiime_mapping_file(self, prep_template_fp): | |
------ | ||
ValueError | ||
If the prep template is not a subset of the sample template | ||
QiitaDBWarning | ||
If the QIIME-required columns are not present in the template | ||
|
||
Notes | ||
----- | ||
We cannot ensure that the QIIME-required columns are present in the | ||
metadata map. However, we have to generate a QIIME-compliant mapping | ||
file. Since the user may need a QIIME mapping file, but not these | ||
QIIME-required columns, we are going to create them and | ||
populate them with the value XXQIITAXX. | ||
""" | ||
rename_cols = { | ||
'barcode': 'BarcodeSequence', | ||
'barcodesequence': 'BarcodeSequence', | ||
'primer': 'LinkerPrimerSequence', | ||
'linkerprimersequence': 'LinkerPrimerSequence', | ||
'description': 'Description', | ||
} | ||
|
||
|
@@ -456,19 +436,38 @@ def create_qiime_mapping_file(self, prep_template_fp): | |
|
||
# reading files via pandas | ||
st = load_template_to_dataframe(sample_template_fp) | ||
pt = load_template_to_dataframe(prep_template_fp) | ||
pt = self.to_dataframe() | ||
|
||
st_sample_names = set(st.index) | ||
pt_sample_names = set(pt.index) | ||
|
||
if not pt_sample_names.issubset(st_sample_names): | ||
raise ValueError( | ||
"Prep template is not a sub set of the sample template, files:" | ||
"%s %s - samples: %s" % (sample_template_fp, prep_template_fp, | ||
str(pt_sample_names-st_sample_names))) | ||
"Prep template is not a sub set of the sample template, files" | ||
"%s - samples: %s" | ||
% (sample_template_fp, | ||
', '.join(pt_sample_names-st_sample_names))) | ||
|
||
mapping = pt.join(st, lsuffix="_prep") | ||
mapping.rename(columns=rename_cols, inplace=True) | ||
|
||
# Pre-populate the QIIME-required columns with the value XXQIITAXX | ||
index = mapping.index | ||
placeholder = ['XXQIITAXX'] * len(index) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably thought about this, but why not just setting an empty string? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Empty strings are easy to miss. On previous discussions, @rob-knight suggested that any value that we populate in the templates should be make clear that it is wrong and needs to be fixed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now I remember, thanks! On (Apr-27-15|16:24), josenavas wrote:
|
||
missing = [] | ||
for val in viewvalues(rename_cols): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think it would make sense to raise a warning in this case? The warning would make a note of the columns that would have place-holders. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good call, adding. |
||
if val not in mapping: | ||
missing.append(val) | ||
mapping[val] = pd.Series(placeholder, index=index) | ||
|
||
if missing: | ||
warnings.warn( | ||
"Some columns required to generate a QIIME-compliant mapping " | ||
"file are not present in the template. A placeholder value " | ||
"(XXQIITAXX) has been used to populate these columns. Missing " | ||
"columns: %s" % ', '.join(missing), | ||
QiitaDBWarning) | ||
|
||
# Gets the orginal mapping columns and readjust the order to comply | ||
# with QIIME requirements | ||
cols = mapping.columns.values.tolist() | ||
|
@@ -486,11 +485,13 @@ def create_qiime_mapping_file(self, prep_template_fp): | |
self.id, strftime("%Y%m%d-%H%M%S"))) | ||
|
||
# Save the mapping file | ||
mapping.to_csv(filepath, index_label='#SampleID', na_rep='unknown', | ||
mapping.to_csv(filepath, index_label='#SampleID', na_rep='', | ||
sep='\t') | ||
|
||
# adding the fp to the object | ||
self.add_filepath(filepath) | ||
self.add_filepath( | ||
filepath, conn_handler=conn_handler, | ||
fp_id=convert_to_id("qiime_map", "filepath_type")) | ||
|
||
return filepath | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still need to do this? I'm not sure but it seems unnecessary at this point.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ping! I do not think we need to patch this way for this specific fields, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oooops! Good call, we still need it, but not as extensive. Instead of:
We need to use:
The reason is that we are using different names on the database than on QIIME. Does this seams reasonable? The other solution is to require the user to introduce those values directly. This will require changing the restrictions. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The reduced version makes more sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, while changing this I noticed that the prep template already created in the database is not fully correct. I'm changing it and it will probably require a couple of more changes in the rest of the tests.