-
Notifications
You must be signed in to change notification settings - Fork 80
Unify template validation #1043
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
29fe8d5
3c5b9e1
1c8f63b
0d10d7f
040d9fe
77afb1c
e2f52c7
418a71e
fbff71c
acb76ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,20 +39,24 @@ | |
from future.utils import viewitems | ||
from os.path import join | ||
from functools import partial | ||
from copy import deepcopy | ||
|
||
import pandas as pd | ||
from skbio.util import find_duplicates | ||
|
||
from qiita_core.exceptions import IncompetentQiitaDeveloperError | ||
from qiita_db.exceptions import (QiitaDBUnknownIDError, | ||
|
||
from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError, | ||
QiitaDBNotImplementedError, | ||
QiitaDBColumnError) | ||
QiitaDBDuplicateHeaderError) | ||
from qiita_db.base import QiitaObject | ||
from qiita_db.sql_connection import SQLConnectionHandler | ||
from qiita_db.util import (exists_table, get_table_cols, | ||
convert_to_id, | ||
get_mountpoint, insert_filepaths) | ||
from qiita_db.logger import LogEntry | ||
from .util import as_python_types, get_datatypes | ||
from .util import (as_python_types, get_datatypes, get_invalid_sample_names, | ||
prefix_sample_names_with_id) | ||
|
||
|
||
class BaseSample(QiitaObject): | ||
|
@@ -566,9 +570,8 @@ def _check_special_columns(cls, md_template, obj): | |
---------- | ||
md_template : DataFrame | ||
The metadata template file contents indexed by sample ids | ||
obj : Study or RawData | ||
The obj to which the metadata template belongs to. Study in case | ||
of SampleTemplate and RawData in case of PrepTemplate | ||
obj : object | ||
Any extra object needed by the template to perform any extra check | ||
""" | ||
# Check required columns | ||
missing = set(cls.translate_cols_dict.values()).difference(md_template) | ||
|
@@ -584,6 +587,82 @@ def _check_special_columns(cls, md_template, obj): | |
return missing.union( | ||
cls._check_template_special_columns(md_template, obj)) | ||
|
||
@classmethod | ||
def _clean_validate_template(cls, md_template, study_id, obj, | ||
conn_handler=None): | ||
"""Takes care of all validation and cleaning of metadata templates | ||
|
||
Parameters | ||
---------- | ||
md_template : DataFrame | ||
The metadata template file contents indexed by sample ids | ||
study_id : int | ||
The study to which the metadata template belongs to. | ||
obj : object | ||
Any extra object needed by the template to perform any extra check | ||
|
||
Returns | ||
------- | ||
md_template : DataFrame | ||
Cleaned copy of the input md_template | ||
|
||
Raises | ||
------ | ||
QiitaDBColumnError | ||
If the sample names in md_template contains invalid names | ||
QiitaDBDuplicateHeaderError | ||
If md_template contains duplicate headers | ||
QiitaDBColumnError | ||
If md_template is missing a required column | ||
""" | ||
cls._check_subclass() | ||
invalid_ids = get_invalid_sample_names(md_template.index) | ||
if invalid_ids: | ||
raise QiitaDBColumnError("The following sample names in the " | ||
"template contain invalid characters " | ||
"(only alphanumeric characters or periods" | ||
" are allowed): %s." % | ||
", ".join(invalid_ids)) | ||
# We are going to modify the md_template. We create a copy so | ||
# we don't modify the user one | ||
md_template = deepcopy(md_template) | ||
|
||
# Prefix the sample names with the study_id | ||
prefix_sample_names_with_id(md_template, study_id) | ||
|
||
# In the database, all the column headers are lowercase | ||
md_template.columns = [c.lower() for c in md_template.columns] | ||
|
||
# Check that we don't have duplicate columns | ||
if len(set(md_template.columns)) != len(md_template.columns): | ||
raise QiitaDBDuplicateHeaderError( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey, just noticed that this error message is not going to be very information (and it is not your fault, the code was like this already). If possible can you change to something along the lines of: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nix that, QiitaDBDuplicateHeaderError already takes care of that formatting. |
||
find_duplicates(md_template.columns)) | ||
|
||
# We need to check for some special columns, that are not present on | ||
# the database, but depending on the data type are required. | ||
missing = cls._check_special_columns(md_template, obj) | ||
|
||
conn_handler = conn_handler if conn_handler else SQLConnectionHandler() | ||
|
||
# Get the required columns from the DB | ||
db_cols = get_table_cols(cls._table, conn_handler) | ||
|
||
# Remove the sample_id and study_id columns | ||
db_cols.remove('sample_id') | ||
db_cols.remove(cls._id_column) | ||
|
||
# Retrieve the headers of the metadata template | ||
headers = list(md_template.keys()) | ||
|
||
# Check that md_template has the required columns | ||
remaining = set(db_cols).difference(headers) | ||
missing = missing.union(remaining) | ||
missing = missing.difference(cls.translate_cols_dict) | ||
if missing: | ||
raise QiitaDBColumnError("Missing columns: %s" | ||
% ', '.join(missing)) | ||
return md_template | ||
|
||
@classmethod | ||
def _add_common_creation_steps_to_queue(cls, md_template, obj_id, | ||
conn_handler, queue_name): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do you validate that the passed study_id is actually attached to this template?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's a good point. If the template being tested is a prep template, the process will fail later because the samples are not found in the sample template. If it is the study_id, there is no way, since the object still doesn't exist. But 2 things: (1) this was already like that in the previous implementation; and (2) this is a private function that should not be called from outside the class, so if somebody is messing up with this function they should know what they're doing (and this is completely pythonic).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we get another dev to chime in on this? It's a little disconcerting to me to just take it on faith the study is correct and wait until it fails to see otherwise.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed offline and 👍 will merge when tests pass