Skip to content

Unify template validation #1043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 9, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 85 additions & 6 deletions qiita_db/metadata_template/base_metadata_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,24 @@
from future.utils import viewitems
from os.path import join
from functools import partial
from copy import deepcopy

import pandas as pd
from skbio.util import find_duplicates

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_db.exceptions import (QiitaDBUnknownIDError,

from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError,
QiitaDBNotImplementedError,
QiitaDBColumnError)
QiitaDBDuplicateHeaderError)
from qiita_db.base import QiitaObject
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.util import (exists_table, get_table_cols,
convert_to_id,
get_mountpoint, insert_filepaths)
from qiita_db.logger import LogEntry
from .util import as_python_types, get_datatypes
from .util import (as_python_types, get_datatypes, get_invalid_sample_names,
prefix_sample_names_with_id)


class BaseSample(QiitaObject):
Expand Down Expand Up @@ -566,9 +570,8 @@ def _check_special_columns(cls, md_template, obj):
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
obj : Study or RawData
The obj to which the metadata template belongs to. Study in case
of SampleTemplate and RawData in case of PrepTemplate
obj : object
Any extra object needed by the template to perform any extra check
"""
# Check required columns
missing = set(cls.translate_cols_dict.values()).difference(md_template)
Expand All @@ -584,6 +587,82 @@ def _check_special_columns(cls, md_template, obj):
return missing.union(
cls._check_template_special_columns(md_template, obj))

@classmethod
def _clean_validate_template(cls, md_template, study_id, obj,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you validate that the passed study_id is actually attached to this template?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. If the template being tested is a prep template, the process will fail later because the samples are not found in the sample template. If it is the study_id, there is no way, since the object still doesn't exist. But 2 things: (1) this was already like that in the previous implementation; and (2) this is a private function that should not be called from outside the class, so if somebody is messing up with this function they should know what they're doing (and this is completely pythonic).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get another dev to chime in on this? It's a little disconcerting to me to just take it on faith the study is correct and wait until it fails to see otherwise.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline and 👍 will merge when tests pass

conn_handler=None):
"""Takes care of all validation and cleaning of metadata templates

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
study_id : int
The study to which the metadata template belongs to.
obj : object
Any extra object needed by the template to perform any extra check

Returns
-------
md_template : DataFrame
Cleaned copy of the input md_template

Raises
------
QiitaDBColumnError
If the sample names in md_template contains invalid names
QiitaDBDuplicateHeaderError
If md_template contains duplicate headers
QiitaDBColumnError
If md_template is missing a required column
"""
cls._check_subclass()
invalid_ids = get_invalid_sample_names(md_template.index)
if invalid_ids:
raise QiitaDBColumnError("The following sample names in the "
"template contain invalid characters "
"(only alphanumeric characters or periods"
" are allowed): %s." %
", ".join(invalid_ids))
# We are going to modify the md_template. We create a copy so
# we don't modify the user one
md_template = deepcopy(md_template)

# Prefix the sample names with the study_id
prefix_sample_names_with_id(md_template, study_id)

# In the database, all the column headers are lowercase
md_template.columns = [c.lower() for c in md_template.columns]

# Check that we don't have duplicate columns
if len(set(md_template.columns)) != len(md_template.columns):
raise QiitaDBDuplicateHeaderError(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, just noticed that this error message is not going to be very information (and it is not your fault, the code was like this already). If possible can you change to something along the lines of: 'The following header names are duplicated: %s' % ', '.join(find_duplicates(md_template.columns))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nix that, QiitaDBDuplicateHeaderError already takes care of that formatting.

find_duplicates(md_template.columns))

# We need to check for some special columns, that are not present on
# the database, but depending on the data type are required.
missing = cls._check_special_columns(md_template, obj)

conn_handler = conn_handler if conn_handler else SQLConnectionHandler()

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)

# Remove the sample_id and study_id columns
db_cols.remove('sample_id')
db_cols.remove(cls._id_column)

# Retrieve the headers of the metadata template
headers = list(md_template.keys())

# Check that md_template has the required columns
remaining = set(db_cols).difference(headers)
missing = missing.union(remaining)
missing = missing.difference(cls.translate_cols_dict)
if missing:
raise QiitaDBColumnError("Missing columns: %s"
% ', '.join(missing))
return md_template

@classmethod
def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
conn_handler, queue_name):
Expand Down
57 changes: 5 additions & 52 deletions qiita_db/metadata_template/prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,18 @@
# -----------------------------------------------------------------------------

from __future__ import division
from copy import deepcopy
from os.path import join
from time import strftime

from skbio.util import find_duplicates

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
QiitaDBDuplicateHeaderError, QiitaDBError,
QiitaDBExecutionError)
QiitaDBError, QiitaDBExecutionError)
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.ontology import Ontology
from qiita_db.util import (get_table_cols, get_emp_status, convert_to_id,
from qiita_db.util import (get_emp_status, convert_to_id,
convert_from_id, get_mountpoint, infer_status)
from .base_metadata_template import BaseSample, MetadataTemplate
from .util import (get_invalid_sample_names, prefix_sample_names_with_id,
load_template_to_dataframe)
from .util import load_template_to_dataframe
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
REQUIRED_TARGET_GENE_COLS)

Expand Down Expand Up @@ -109,29 +104,6 @@ def create(cls, md_template, raw_data, study, data_type,
if investigation_type is not None:
cls.validate_investigation_type(investigation_type)

invalid_ids = get_invalid_sample_names(md_template.index)
if invalid_ids:
raise QiitaDBColumnError("The following sample names in the prep"
" template contain invalid characters "
"(only alphanumeric characters or periods"
" are allowed): %s." %
", ".join(invalid_ids))

# We are going to modify the md_template. We create a copy so
# we don't modify the user one
md_template = deepcopy(md_template)

# Prefix the sample names with the study_id
prefix_sample_names_with_id(md_template, study.id)

# In the database, all the column headers are lowercase
md_template.columns = [c.lower() for c in md_template.columns]

# Check that we don't have duplicate columns
if len(set(md_template.columns)) != len(md_template.columns):
raise QiitaDBDuplicateHeaderError(
find_duplicates(md_template.columns))

# Get a connection handler
conn_handler = SQLConnectionHandler()
queue_name = "CREATE_PREP_TEMPLATE_%d" % raw_data.id
Expand All @@ -146,27 +118,8 @@ def create(cls, md_template, raw_data, study, data_type,
data_type_id = convert_to_id(data_type, "data_type", conn_handler)
data_type_str = data_type

# We need to check for some special columns, that are not present on
# the database, but depending on the data type are required.
missing = cls._check_special_columns(md_template, data_type_str)

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)

# Remove the sample_id and study_id columns
db_cols.remove('sample_id')
db_cols.remove(cls._id_column)

# Retrieve the headers of the metadata template
headers = list(md_template.keys())

# Check that md_template has the required columns
remaining = set(db_cols).difference(headers)
missing = missing.union(remaining)
missing = missing.difference(cls.translate_cols_dict)
if missing:
raise QiitaDBColumnError("Missing columns: %s"
% ', '.join(missing))
md_template = cls._clean_validate_template(md_template, study.id,
data_type_str, conn_handler)

# Insert the metadata template
# We need the prep_id for multiple calls below, which currently is not
Expand Down
74 changes: 3 additions & 71 deletions qiita_db/metadata_template/sample_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@

from __future__ import division
from future.builtins import zip
from copy import deepcopy
from os.path import join
from time import strftime
from os.path import basename

import pandas as pd
import warnings
from skbio.util import find_duplicates

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBColumnError,
QiitaDBDuplicateHeaderError, QiitaDBError,
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
QiitaDBWarning)
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.util import (get_table_cols, get_required_sample_info_status,
get_mountpoint, scrub_data)
from qiita_db.study import Study
from qiita_db.data import RawData
from .base_metadata_template import BaseSample, MetadataTemplate
from .util import (get_invalid_sample_names, prefix_sample_names_with_id,
as_python_types, get_datatypes)
from .util import as_python_types, get_datatypes
from .prep_template import PrepTemplate


Expand Down Expand Up @@ -114,70 +110,6 @@ def _check_template_special_columns(cls, md_template, study_id):
"""
return set()

@classmethod
def _clean_validate_template(cls, md_template, study_id,
conn_handler=None):
"""Takes care of all validation and cleaning of sample templates

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
study_id : int
The study to which the sample template belongs to.

Returns
-------
md_template : DataFrame
Cleaned copy of the input md_template
"""
invalid_ids = get_invalid_sample_names(md_template.index)
if invalid_ids:
raise QiitaDBColumnError("The following sample names in the sample"
" template contain invalid characters "
"(only alphanumeric characters or periods"
" are allowed): %s." %
", ".join(invalid_ids))
# We are going to modify the md_template. We create a copy so
# we don't modify the user one
md_template = deepcopy(md_template)

# Prefix the sample names with the study_id
prefix_sample_names_with_id(md_template, study_id)

# In the database, all the column headers are lowercase
md_template.columns = [c.lower() for c in md_template.columns]

# Check that we don't have duplicate columns
if len(set(md_template.columns)) != len(md_template.columns):
raise QiitaDBDuplicateHeaderError(
find_duplicates(md_template.columns))

# We need to check for some special columns, that are not present on
# the database, but depending on the data type are required.
missing = cls._check_special_columns(md_template, study_id)

conn_handler = conn_handler if conn_handler else SQLConnectionHandler()

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)

# Remove the sample_id and study_id columns
db_cols.remove('sample_id')
db_cols.remove(cls._id_column)

# Retrieve the headers of the metadata template
headers = list(md_template.keys())

# Check that md_template has the required columns
remaining = set(db_cols).difference(headers)
missing = missing.union(remaining)
missing = missing.difference(cls.translate_cols_dict)
if missing:
raise QiitaDBColumnError("Missing columns: %s"
% ', '.join(missing))
return md_template

@classmethod
def create(cls, md_template, study):
r"""Creates the sample template in the database
Expand All @@ -201,7 +133,7 @@ def create(cls, md_template, study):

# Clean and validate the metadata template given
md_template = cls._clean_validate_template(md_template, study.id,
conn_handler)
study.id, conn_handler)

cls._add_common_creation_steps_to_queue(md_template, study.id,
conn_handler, queue_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ def test_add_common_creation_steps_to_queue(self):
MetadataTemplate._add_common_creation_steps_to_queue(
None, 1, None, "")

def test_clean_validate_template(self):
"""_clean_validate_template raises an error from base class"""
with self.assertRaises(IncompetentQiitaDeveloperError):
MetadataTemplate._clean_validate_template(None, 1, None, None)


@qiita_test_checker()
class TestMetadataTemplateReadWrite(TestCase):
Expand Down
Loading