Skip to content

Commit 0d65c9d

Browse files
committed
Merge pull request #1043 from josenavas/unify-template-validation
Unify template validation
2 parents 3ad2264 + acb76ff commit 0d65c9d

File tree

6 files changed

+271
-129
lines changed

6 files changed

+271
-129
lines changed

qiita_db/metadata_template/base_metadata_template.py

Lines changed: 85 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,24 @@
3939
from future.utils import viewitems
4040
from os.path import join
4141
from functools import partial
42+
from copy import deepcopy
4243

4344
import pandas as pd
45+
from skbio.util import find_duplicates
4446

4547
from qiita_core.exceptions import IncompetentQiitaDeveloperError
46-
from qiita_db.exceptions import (QiitaDBUnknownIDError,
48+
49+
from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError,
4750
QiitaDBNotImplementedError,
48-
QiitaDBColumnError)
51+
QiitaDBDuplicateHeaderError)
4952
from qiita_db.base import QiitaObject
5053
from qiita_db.sql_connection import SQLConnectionHandler
5154
from qiita_db.util import (exists_table, get_table_cols,
5255
convert_to_id,
5356
get_mountpoint, insert_filepaths)
5457
from qiita_db.logger import LogEntry
55-
from .util import as_python_types, get_datatypes
58+
from .util import (as_python_types, get_datatypes, get_invalid_sample_names,
59+
prefix_sample_names_with_id)
5660

5761

5862
class BaseSample(QiitaObject):
@@ -566,9 +570,8 @@ def _check_special_columns(cls, md_template, obj):
566570
----------
567571
md_template : DataFrame
568572
The metadata template file contents indexed by sample ids
569-
obj : Study or RawData
570-
The obj to which the metadata template belongs to. Study in case
571-
of SampleTemplate and RawData in case of PrepTemplate
573+
obj : object
574+
Any extra object needed by the template to perform any extra check
572575
"""
573576
# Check required columns
574577
missing = set(cls.translate_cols_dict.values()).difference(md_template)
@@ -584,6 +587,82 @@ def _check_special_columns(cls, md_template, obj):
584587
return missing.union(
585588
cls._check_template_special_columns(md_template, obj))
586589

590+
@classmethod
def _clean_validate_template(cls, md_template, study_id, obj,
                             conn_handler=None):
    """Takes care of all validation and cleaning of metadata templates

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    study_id : int
        The study to which the metadata template belongs
    obj : object
        Any extra object needed by the template to perform any extra check
    conn_handler : SQLConnectionHandler, optional
        The connection handler used to retrieve the table columns from
        the database. If None, a new SQLConnectionHandler is created

    Returns
    -------
    md_template : DataFrame
        Cleaned copy of the input md_template

    Raises
    ------
    QiitaDBColumnError
        If md_template contains invalid sample names, or if it is missing
        a required column
    QiitaDBDuplicateHeaderError
        If md_template contains duplicate headers
    """
    # Presumably rejects calls made directly on the base class; verify
    # against _check_subclass
    cls._check_subclass()

    invalid_ids = get_invalid_sample_names(md_template.index)
    if invalid_ids:
        raise QiitaDBColumnError("The following sample names in the "
                                 "template contain invalid characters "
                                 "(only alphanumeric characters or periods"
                                 " are allowed): %s." %
                                 ", ".join(invalid_ids))

    # We are going to modify the md_template. We create a copy so
    # we don't modify the user one
    md_template = deepcopy(md_template)

    # Prefix the sample names with the study_id
    prefix_sample_names_with_id(md_template, study_id)

    # In the database, all the column headers are lowercase
    md_template.columns = [c.lower() for c in md_template.columns]

    # Lowercasing can make two distinct headers collide, so the duplicate
    # check has to run after the normalization above
    if len(set(md_template.columns)) != len(md_template.columns):
        raise QiitaDBDuplicateHeaderError(
            find_duplicates(md_template.columns))

    # We need to check for some special columns, that are not present on
    # the database, but depending on the data type are required.
    missing = cls._check_special_columns(md_template, obj)

    # Only create a new handler when the caller did not supply one; an
    # explicit None test avoids discarding a handler that evaluates falsy
    if conn_handler is None:
        conn_handler = SQLConnectionHandler()

    # Get the required columns from the DB, excluding the sample_id and
    # the template id columns
    db_cols = get_table_cols(cls._table, conn_handler)
    db_cols.remove('sample_id')
    db_cols.remove(cls._id_column)

    # Check that md_template has the required columns
    missing = missing.union(set(db_cols).difference(md_template.columns))
    missing = missing.difference(cls.translate_cols_dict)
    if missing:
        # Sort so the error message is deterministic (set order is not)
        raise QiitaDBColumnError("Missing columns: %s"
                                 % ', '.join(sorted(missing)))
    return md_template
665+
587666
@classmethod
588667
def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
589668
conn_handler, queue_name):

qiita_db/metadata_template/prep_template.py

Lines changed: 5 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,18 @@
77
# -----------------------------------------------------------------------------
88

99
from __future__ import division
10-
from copy import deepcopy
1110
from os.path import join
1211
from time import strftime
1312

14-
from skbio.util import find_duplicates
15-
1613
from qiita_core.exceptions import IncompetentQiitaDeveloperError
1714
from qiita_db.exceptions import (QiitaDBColumnError, QiitaDBUnknownIDError,
18-
QiitaDBDuplicateHeaderError, QiitaDBError,
19-
QiitaDBExecutionError)
15+
QiitaDBError, QiitaDBExecutionError)
2016
from qiita_db.sql_connection import SQLConnectionHandler
2117
from qiita_db.ontology import Ontology
22-
from qiita_db.util import (get_table_cols, get_emp_status, convert_to_id,
18+
from qiita_db.util import (get_emp_status, convert_to_id,
2319
convert_from_id, get_mountpoint, infer_status)
2420
from .base_metadata_template import BaseSample, MetadataTemplate
25-
from .util import (get_invalid_sample_names, prefix_sample_names_with_id,
26-
load_template_to_dataframe)
21+
from .util import load_template_to_dataframe
2722
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
2823
REQUIRED_TARGET_GENE_COLS)
2924

@@ -109,29 +104,6 @@ def create(cls, md_template, raw_data, study, data_type,
109104
if investigation_type is not None:
110105
cls.validate_investigation_type(investigation_type)
111106

112-
invalid_ids = get_invalid_sample_names(md_template.index)
113-
if invalid_ids:
114-
raise QiitaDBColumnError("The following sample names in the prep"
115-
" template contain invalid characters "
116-
"(only alphanumeric characters or periods"
117-
" are allowed): %s." %
118-
", ".join(invalid_ids))
119-
120-
# We are going to modify the md_template. We create a copy so
121-
# we don't modify the user one
122-
md_template = deepcopy(md_template)
123-
124-
# Prefix the sample names with the study_id
125-
prefix_sample_names_with_id(md_template, study.id)
126-
127-
# In the database, all the column headers are lowercase
128-
md_template.columns = [c.lower() for c in md_template.columns]
129-
130-
# Check that we don't have duplicate columns
131-
if len(set(md_template.columns)) != len(md_template.columns):
132-
raise QiitaDBDuplicateHeaderError(
133-
find_duplicates(md_template.columns))
134-
135107
# Get a connection handler
136108
conn_handler = SQLConnectionHandler()
137109
queue_name = "CREATE_PREP_TEMPLATE_%d" % raw_data.id
@@ -146,27 +118,8 @@ def create(cls, md_template, raw_data, study, data_type,
146118
data_type_id = convert_to_id(data_type, "data_type", conn_handler)
147119
data_type_str = data_type
148120

149-
# We need to check for some special columns, that are not present on
150-
# the database, but depending on the data type are required.
151-
missing = cls._check_special_columns(md_template, data_type_str)
152-
153-
# Get the required columns from the DB
154-
db_cols = get_table_cols(cls._table, conn_handler)
155-
156-
# Remove the sample_id and study_id columns
157-
db_cols.remove('sample_id')
158-
db_cols.remove(cls._id_column)
159-
160-
# Retrieve the headers of the metadata template
161-
headers = list(md_template.keys())
162-
163-
# Check that md_template has the required columns
164-
remaining = set(db_cols).difference(headers)
165-
missing = missing.union(remaining)
166-
missing = missing.difference(cls.translate_cols_dict)
167-
if missing:
168-
raise QiitaDBColumnError("Missing columns: %s"
169-
% ', '.join(missing))
121+
md_template = cls._clean_validate_template(md_template, study.id,
122+
data_type_str, conn_handler)
170123

171124
# Insert the metadata template
172125
# We need the prep_id for multiple calls below, which currently is not

qiita_db/metadata_template/sample_template.py

Lines changed: 3 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,23 @@
88

99
from __future__ import division
1010
from future.builtins import zip
11-
from copy import deepcopy
1211
from os.path import join
1312
from time import strftime
1413
from os.path import basename
1514

1615
import pandas as pd
1716
import warnings
18-
from skbio.util import find_duplicates
1917

2018
from qiita_core.exceptions import IncompetentQiitaDeveloperError
21-
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBColumnError,
22-
QiitaDBDuplicateHeaderError, QiitaDBError,
19+
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
2320
QiitaDBWarning)
2421
from qiita_db.sql_connection import SQLConnectionHandler
2522
from qiita_db.util import (get_table_cols, get_required_sample_info_status,
2623
get_mountpoint, scrub_data)
2724
from qiita_db.study import Study
2825
from qiita_db.data import RawData
2926
from .base_metadata_template import BaseSample, MetadataTemplate
30-
from .util import (get_invalid_sample_names, prefix_sample_names_with_id,
31-
as_python_types, get_datatypes)
27+
from .util import as_python_types, get_datatypes
3228
from .prep_template import PrepTemplate
3329

3430

@@ -114,70 +110,6 @@ def _check_template_special_columns(cls, md_template, study_id):
114110
"""
115111
return set()
116112

117-
@classmethod
118-
def _clean_validate_template(cls, md_template, study_id,
119-
conn_handler=None):
120-
"""Takes care of all validation and cleaning of sample templates
121-
122-
Parameters
123-
----------
124-
md_template : DataFrame
125-
The metadata template file contents indexed by sample ids
126-
study_id : int
127-
The study to which the sample template belongs to.
128-
129-
Returns
130-
-------
131-
md_template : DataFrame
132-
Cleaned copy of the input md_template
133-
"""
134-
invalid_ids = get_invalid_sample_names(md_template.index)
135-
if invalid_ids:
136-
raise QiitaDBColumnError("The following sample names in the sample"
137-
" template contain invalid characters "
138-
"(only alphanumeric characters or periods"
139-
" are allowed): %s." %
140-
", ".join(invalid_ids))
141-
# We are going to modify the md_template. We create a copy so
142-
# we don't modify the user one
143-
md_template = deepcopy(md_template)
144-
145-
# Prefix the sample names with the study_id
146-
prefix_sample_names_with_id(md_template, study_id)
147-
148-
# In the database, all the column headers are lowercase
149-
md_template.columns = [c.lower() for c in md_template.columns]
150-
151-
# Check that we don't have duplicate columns
152-
if len(set(md_template.columns)) != len(md_template.columns):
153-
raise QiitaDBDuplicateHeaderError(
154-
find_duplicates(md_template.columns))
155-
156-
# We need to check for some special columns, that are not present on
157-
# the database, but depending on the data type are required.
158-
missing = cls._check_special_columns(md_template, study_id)
159-
160-
conn_handler = conn_handler if conn_handler else SQLConnectionHandler()
161-
162-
# Get the required columns from the DB
163-
db_cols = get_table_cols(cls._table, conn_handler)
164-
165-
# Remove the sample_id and study_id columns
166-
db_cols.remove('sample_id')
167-
db_cols.remove(cls._id_column)
168-
169-
# Retrieve the headers of the metadata template
170-
headers = list(md_template.keys())
171-
172-
# Check that md_template has the required columns
173-
remaining = set(db_cols).difference(headers)
174-
missing = missing.union(remaining)
175-
missing = missing.difference(cls.translate_cols_dict)
176-
if missing:
177-
raise QiitaDBColumnError("Missing columns: %s"
178-
% ', '.join(missing))
179-
return md_template
180-
181113
@classmethod
182114
def create(cls, md_template, study):
183115
r"""Creates the sample template in the database
@@ -201,7 +133,7 @@ def create(cls, md_template, study):
201133

202134
# Clean and validate the metadata template given
203135
md_template = cls._clean_validate_template(md_template, study.id,
204-
conn_handler)
136+
study.id, conn_handler)
205137

206138
cls._add_common_creation_steps_to_queue(md_template, study.id,
207139
conn_handler, queue_name)

qiita_db/metadata_template/test/test_base_metadata_template.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ def test_add_common_creation_steps_to_queue(self):
5858
MetadataTemplate._add_common_creation_steps_to_queue(
5959
None, 1, None, "")
6060

61+
def test_clean_validate_template(self):
    """The base class refuses to clean/validate a template"""
    # Callable form of assertRaises: the method is invoked with
    # (None, 1, None, None) and must raise the developer error
    self.assertRaises(
        IncompetentQiitaDeveloperError,
        MetadataTemplate._clean_validate_template,
        None, 1, None, None)
65+
6166

6267
@qiita_test_checker()
6368
class TestMetadataTemplateReadWrite(TestCase):

0 commit comments

Comments
 (0)