Unify metadata creation #1032

Merged: 10 commits, Apr 2, 2015
77 changes: 77 additions & 0 deletions qiita_db/metadata_template/base_metadata_template.py
@@ -51,6 +51,7 @@
convert_to_id,
get_mountpoint, insert_filepaths)
from qiita_db.logger import LogEntry
from .util import as_python_types, get_datatypes


class BaseSample(QiitaObject):
@@ -518,6 +519,82 @@ def _check_special_columns(cls, md_template, obj):
return missing.union(
cls._check_template_special_columns(md_template, obj))

@classmethod
def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
conn_handler, queue_name):
r"""Adds the common creation steps to the queue in conn_handler

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
obj_id : int
The id of the object being created
conn_handler : SQLConnectionHandler
The connection handler object connected to the DB
queue_name : str
The queue where the SQL statements will be added
"""
cls._check_subclass()
# Get some useful information from the metadata template
sample_ids = md_template.index.tolist()
num_samples = len(sample_ids)
headers = list(md_template.keys())

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)
Contributor:

This looks like the function needs the "am I instantiating the base class" test.

Contributor Author:

I was thinking about that, but this is a private function and it should only be used in the create functions, which already have that test. If you feel strongly about it, I can add it, though...

Contributor:

It would probably be good to add it as a safety layer, since it will also mean a more interpretable error.
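As a rough, self-contained sketch of the safety layer under discussion (illustrative only, not part of this diff; the real _check_subclass is defined elsewhere in qiita_db and its implementation may differ), the idea is simply to fail fast with a readable error when the abstract base class is used directly:

class IncompetentQiitaDeveloperError(Exception):
    """Raised when an abstract base class is used directly."""


class MetadataTemplate(object):
    # Concrete subclasses (e.g. PrepTemplate, SampleTemplate) define these.
    _table = None
    _id_column = None

    @classmethod
    def _check_subclass(cls):
        # Fail early with an interpretable error instead of letting a later
        # SQL statement blow up because _table / _id_column are undefined.
        if cls is MetadataTemplate:
            raise IncompetentQiitaDeveloperError(
                "%s should not be used directly; use a subclass"
                % cls.__name__)

    @classmethod
    def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
                                            conn_handler, queue_name):
        cls._check_subclass()
        # ... queue the shared INSERT / CREATE TABLE statements ...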

# Remove the sample_id and _id_column columns
db_cols.remove('sample_id')
db_cols.remove(cls._id_column)
Contributor:

Is there any way _id_column can be sample_id?

Contributor Author:

No, cls._id_column is either study_id or prep_template_id.

Contributor:

Cool.


# Insert values on required columns
values = as_python_types(md_template, db_cols)
values.insert(0, sample_ids)
values.insert(0, [obj_id] * num_samples)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
"VALUES (%s, %s, {3})".format(cls._table, cls._id_column,
', '.join(db_cols),
', '.join(['%s'] * len(db_cols))),
values, many=True)

# Insert rows on *_columns table
headers = list(set(headers).difference(db_cols))
datatypes = get_datatypes(md_template.ix[:, headers])
# psycopg2 requires a list of tuples, in which each tuple is a set
# of values to use in the string formatting of the query. We have all
# the values in different lists (but in the same order) so use zip
# to create the list of tuples that psycopg2 requires.
values = [
v for v in zip([obj_id] * len(headers), headers, datatypes)]
conn_handler.add_to_queue(
Contributor:

Philosophical thing, but I'd add this to the queue after the table is made and filled.

Contributor Author:

It's in a queue, so either all of the changes happen or none of them do; I don't see the added value. Do you have strong feelings about it?

Contributor:

No, not blocking, just philosophical.
queue_name,
"INSERT INTO qiita.{0} ({1}, column_name, column_type) "
"VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
values, many=True)

# Create table with custom columns
table_name = cls._table_name(obj_id)
column_datatype = ["%s %s" % (col, dtype)
for col, dtype in zip(headers, datatypes)]
conn_handler.add_to_queue(
queue_name,
"CREATE TABLE qiita.{0} (sample_id varchar NOT NULL, {1})".format(
table_name, ', '.join(column_datatype)))

# Insert values on custom table
values = as_python_types(md_template, headers)
values.insert(0, sample_ids)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} (sample_id, {1}) "
"VALUES (%s, {2})".format(table_name, ", ".join(headers),
', '.join(["%s"] * len(headers))),
values, many=True)

@classmethod
def delete(cls, id_):
r"""Deletes the table from the database
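For context on the queue pattern used throughout this change (and debated in the comments above): SQL statements are only accumulated under a queue name here, and nothing touches the database until execute_queue runs, at which point, per the author's comment, either all of the queued changes happen or none of them do. A toy model of that contract (illustrative only; the real SQLConnectionHandler is psycopg2-backed and its parameter names may differ):

from collections import defaultdict


class ToyConnectionHandler(object):
    """Illustrative stand-in for SQLConnectionHandler's queue API."""

    def __init__(self):
        self.queues = defaultdict(list)

    def create_queue(self, queue_name):
        self.queues[queue_name] = []

    def add_to_queue(self, queue_name, sql, sql_args=None, many=False):
        # With many=True, sql_args is a list of parameter tuples and the
        # statement is queued once per tuple; this matches the test below,
        # which expects queues[name] to be a list of (sql, params) pairs.
        if many:
            self.queues[queue_name].extend(
                (sql, params) for params in sql_args)
        else:
            self.queues[queue_name].append((sql, sql_args))

    def execute_queue(self, queue_name):
        # The real handler runs the queued statements inside a single
        # transaction; here we just drain the queue to show the flow.
        for sql, params in self.queues.pop(queue_name):
            print(sql, params)  # stand-in for cursor.execute(sql, params)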
55 changes: 2 additions & 53 deletions qiita_db/metadata_template/prep_template.py
@@ -7,7 +7,6 @@
# -----------------------------------------------------------------------------

from __future__ import division
from future.builtins import zip
from copy import deepcopy
from os.path import join
from time import strftime
@@ -24,7 +23,6 @@
convert_from_id, get_mountpoint, infer_status)
from .base_metadata_template import BaseSample, MetadataTemplate
from .util import (get_invalid_sample_names, prefix_sample_names_with_id,
as_python_types, get_datatypes,
load_template_to_dataframe)
from .constants import (TARGET_GENE_DATA_TYPES, RENAME_COLS_DICT,
REQUIRED_TARGET_GENE_COLS)
@@ -152,10 +150,6 @@ def create(cls, md_template, raw_data, study, data_type,
# the database, but depending on the data type are required.
missing = cls._check_special_columns(md_template, data_type_str)

# Get some useful information from the metadata template
sample_ids = md_template.index.tolist()
num_samples = len(sample_ids)

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)

@@ -183,53 +177,8 @@ def create(cls, md_template, raw_data, study, data_type,
"prep_template_id", (data_type_id, raw_data.id,
investigation_type))[0]

# Insert values on required columns
values = as_python_types(md_template, db_cols)
values.insert(0, sample_ids)
values.insert(0, [prep_id] * num_samples)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
"VALUES (%s, %s, {3})".format(
cls._table, cls._id_column, ', '.join(db_cols),
', '.join(['%s'] * len(db_cols))),
values, many=True)

# Insert rows on *_columns table
headers = list(set(headers).difference(db_cols))
datatypes = get_datatypes(md_template.ix[:, headers])
# psycopg2 requires a list of tuples, in which each tuple is a set
# of values to use in the string formatting of the query. We have all
# the values in different lists (but in the same order) so use zip
# to create the list of tuples that psycopg2 requires.
values = [
v for v in zip([prep_id] * len(headers), headers, datatypes)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, column_name, column_type) "
"VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
values, many=True)

# Create table with custom columns
table_name = cls._table_name(prep_id)
column_datatype = ["%s %s" % (col, dtype)
for col, dtype in zip(headers, datatypes)]
conn_handler.add_to_queue(
queue_name,
"CREATE TABLE qiita.{0} (sample_id varchar, "
"{1})".format(table_name, ', '.join(column_datatype)))

# Insert values on custom table
values = as_python_types(md_template, headers)
values.insert(0, sample_ids)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} (sample_id, {1}) "
"VALUES (%s, {2})".format(table_name, ", ".join(headers),
', '.join(["%s"] * len(headers))),
values, many=True)
cls._add_common_creation_steps_to_queue(md_template, prep_id,
conn_handler, queue_name)

try:
conn_handler.execute_queue(queue_name)
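Condensed from this file and the matching change in sample_template.py below, the net effect is that both create paths now delegate the shared work and then flush the queue in the same way:

# Inside PrepTemplate.create, once the prep_template row exists:
cls._add_common_creation_steps_to_queue(md_template, prep_id,
                                        conn_handler, queue_name)
conn_handler.execute_queue(queue_name)

# Inside SampleTemplate.create, with the study id as the object id:
cls._add_common_creation_steps_to_queue(md_template, study.id,
                                        conn_handler, queue_name)
conn_handler.execute_queue(queue_name)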
59 changes: 2 additions & 57 deletions qiita_db/metadata_template/sample_template.py
@@ -279,64 +279,9 @@ def create(cls, md_template, study):
md_template = cls._clean_validate_template(md_template, study.id,
conn_handler)

# Get some useful information from the metadata template
sample_ids = md_template.index.tolist()
num_samples = len(sample_ids)
headers = list(md_template.keys())

# Get the required columns from the DB
db_cols = get_table_cols(cls._table, conn_handler)
# Remove the sample_id and study_id columns
db_cols.remove('sample_id')
db_cols.remove(cls._id_column)

# Insert values on required columns
values = as_python_types(md_template, db_cols)
values.insert(0, sample_ids)
values.insert(0, [study.id] * num_samples)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
"VALUES (%s, %s, {3})".format(cls._table, cls._id_column,
', '.join(db_cols),
', '.join(['%s'] * len(db_cols))),
values, many=True)

# Insert rows on *_columns table
headers = list(set(headers).difference(db_cols))
datatypes = get_datatypes(md_template.ix[:, headers])
# psycopg2 requires a list of tuples, in which each tuple is a set
# of values to use in the string formatting of the query. We have all
# the values in different lists (but in the same order) so use zip
# to create the list of tuples that psycopg2 requires.
values = [
v for v in zip([study.id] * len(headers), headers, datatypes)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, column_name, column_type) "
"VALUES (%s, %s, %s)".format(cls._column_table, cls._id_column),
values, many=True)

# Create table with custom columns
table_name = cls._table_name(study.id)
column_datatype = ["%s %s" % (col, dtype)
for col, dtype in zip(headers, datatypes)]
conn_handler.add_to_queue(
queue_name,
"CREATE TABLE qiita.{0} (sample_id varchar NOT NULL, {1})".format(
table_name, ', '.join(column_datatype)))
cls._add_common_creation_steps_to_queue(md_template, study.id,
conn_handler, queue_name)

# Insert values on custom table
values = as_python_types(md_template, headers)
values.insert(0, sample_ids)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} (sample_id, {1}) "
"VALUES (%s, {2})".format(table_name, ", ".join(headers),
', '.join(["%s"] * len(headers))),
values, many=True)
conn_handler.execute_queue(queue_name)

# figuring out the filepath of the backup
@@ -51,6 +51,13 @@ def test_table_name(self):
with self.assertRaises(IncompetentQiitaDeveloperError):
MetadataTemplate._table_name(self.study)

def test_add_common_creation_steps_to_queue(self):
"""_add_common_creation_steps_to_queue raises an error from base class
"""
with self.assertRaises(IncompetentQiitaDeveloperError):
MetadataTemplate._add_common_creation_steps_to_queue(
None, 1, None, "")


@qiita_test_checker()
class TestMetadataTemplateReadWrite(TestCase):
88 changes: 88 additions & 0 deletions qiita_db/metadata_template/test/test_prep_template.py
@@ -533,6 +533,94 @@ def test_to_dataframe(self):
u'samp_size', u'sequencing_meth', u'illumina_technology',
u'sample_center', u'pcr_primers', u'study_center'})

def test_add_common_creation_steps_to_queue(self):
"""add_common_creation_steps_to_queue adds the correct sql statements
"""
metadata_dict = {
'2.SKB8.640193': {'center_name': 'ANL',
'center_project_name': 'Test Project',
'emp_status_id': 1,
'str_column': 'Value for sample 1',
'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
'barcodesequence': 'GTCCGCAAGTTA',
'run_prefix': "s_G1_L001_sequences",
'platform': 'ILLUMINA',
'library_construction_protocol': 'AAAA',
'experiment_design_description': 'BBBB'},
'2.SKD8.640184': {'center_name': 'ANL',
'center_project_name': 'Test Project',
'emp_status_id': 1,
'str_column': 'Value for sample 2',
'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
'barcodesequence': 'CGTAGAGCTCTC',
'run_prefix': "s_G1_L001_sequences",
'platform': 'ILLUMINA',
'library_construction_protocol': 'AAAA',
'experiment_design_description': 'BBBB'},
}
metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')

conn_handler = SQLConnectionHandler()
queue_name = "TEST_QUEUE"
conn_handler.create_queue(queue_name)
PrepTemplate._add_common_creation_steps_to_queue(
metadata, 2, conn_handler, queue_name)

sql_insert_common = (
'INSERT INTO qiita.common_prep_info '
'(prep_template_id, sample_id, center_name, center_project_name, '
'emp_status_id) '
'VALUES (%s, %s, %s, %s, %s)')
sql_insert_common_params_1 = (2, '2.SKB8.640193', 'ANL',
'Test Project', 1)
sql_insert_common_params_2 = (2, '2.SKD8.640184', 'ANL',
'Test Project', 1)

sql_insert_prep_columns = (
'INSERT INTO qiita.prep_columns '
'(prep_template_id, column_name, column_type) '
'VALUES (%s, %s, %s)')

sql_create_table = (
'CREATE TABLE qiita.prep_2 '
'(sample_id varchar NOT NULL, str_column varchar, '
'run_prefix varchar, barcodesequence varchar, platform varchar, '
'linkerprimersequence varchar, '
'experiment_design_description varchar, '
'library_construction_protocol varchar)')

sql_insert_dynamic = (
'INSERT INTO qiita.prep_2 '
'(sample_id, str_column, run_prefix, barcodesequence, platform, '
'linkerprimersequence, experiment_design_description, '
'library_construction_protocol) '
'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')

sql_insert_dynamic_params_1 = (
'2.SKB8.640193', 'Value for sample 1', 's_G1_L001_sequences',
'GTCCGCAAGTTA', 'ILLUMINA', 'GTGCCAGCMGCCGCGGTAA', 'BBBB', 'AAAA')
sql_insert_dynamic_params_2 = (
'2.SKD8.640184', 'Value for sample 2', 's_G1_L001_sequences',
'CGTAGAGCTCTC', 'ILLUMINA', 'GTGCCAGCMGCCGCGGTAA', 'BBBB', 'AAAA')

exp = [
(sql_insert_common, sql_insert_common_params_1),
(sql_insert_common, sql_insert_common_params_2),
(sql_insert_prep_columns, (2, 'str_column', 'varchar')),
(sql_insert_prep_columns, (2, 'run_prefix', 'varchar')),
(sql_insert_prep_columns, (2, 'barcodesequence', 'varchar')),
(sql_insert_prep_columns, (2, 'platform', 'varchar')),
(sql_insert_prep_columns, (2, 'linkerprimersequence', 'varchar')),
(sql_insert_prep_columns,
(2, 'experiment_design_description', 'varchar')),
(sql_insert_prep_columns,
(2, 'library_construction_protocol', 'varchar')),
(sql_create_table, None),
(sql_insert_dynamic, sql_insert_dynamic_params_1),
(sql_insert_dynamic, sql_insert_dynamic_params_2)]

self.assertEqual(conn_handler.queues[queue_name], exp)


@qiita_test_checker()
class TestPrepTemplateReadWrite(BaseTestPrepTemplate):