Skip to content

Fix extend functionality #1072

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 21, 2015
Merged
131 changes: 126 additions & 5 deletions qiita_db/metadata_template/base_metadata_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,25 @@

from __future__ import division
from future.utils import viewitems, viewvalues
from future.builtins import zip
from os.path import join
from functools import partial
from collections import defaultdict
from copy import deepcopy

import pandas as pd
from skbio.util import find_duplicates
import warnings

from qiita_core.exceptions import IncompetentQiitaDeveloperError

from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError,
QiitaDBNotImplementedError,
QiitaDBExecutionError,
QiitaDBNotImplementedError, QiitaDBError,
QiitaDBExecutionError, QiitaDBWarning,
QiitaDBDuplicateHeaderError)
from qiita_db.base import QiitaObject
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.util import (exists_table, get_table_cols,
convert_to_id,
from qiita_db.util import (exists_table, get_table_cols, convert_to_id,
get_mountpoint, insert_filepaths)
from qiita_db.logger import LogEntry
from .util import (as_python_types, get_datatypes, get_invalid_sample_names,
Expand Down Expand Up @@ -772,6 +773,126 @@ def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
', '.join(["%s"] * len(headers))),
values, many=True)

def _add_common_extend_steps_to_queue(self, md_template, conn_handler,
queue_name):
r"""Adds the common extend steps to the queue in conn_handler

Parameters
----------
md_template : DataFrame
The metadata template file contents indexed by sample ids
conn_handler : SQLConnectionHandler
The connection handler object connected to the DB
queue_name : str
The queue where the SQL statements will be added

Raises
------
QiitaDBError
If no new samples or new columns are present in `md_template`
"""
# Check if we are adding new samples
sample_ids = md_template.index.tolist()
curr_samples = set(self.keys())
existing_samples = curr_samples.intersection(sample_ids)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@josenavas, please correct me if I'm wrong but I believe these two lines can be simplified with the symmetric_difference operation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're correct, however I need both variables (existing_samples and new_samples). With symmetric_difference I only get new_samples.

new_samples = set(sample_ids).difference(existing_samples)

# Check if we are adding new columns, by getting all the columns from
# the database
table_name = self._table_name(self._id)
db_cols = get_table_cols(self._table, conn_handler)
db_cols.remove('sample_id')
db_cols.remove(self._id_column)
curr_cols = set(
get_table_cols(table_name, conn_handler)).union(db_cols)
headers = md_template.keys().tolist()
existing_cols = curr_cols.intersection(headers)
new_cols = set(headers).difference(existing_cols)

if not new_cols and not new_samples:
raise QiitaDBError(
"No new samples or new columns found in the template. If you "
"want to update existing values, you should use the 'update' "
"functionality.")

if new_cols:
# If we are adding new columns, add them first (simplifies code)
# Sorting the new columns to enforce an order
new_cols = sorted(new_cols)
datatypes = get_datatypes(md_template.ix[:, new_cols])
sql_cols = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
VALUES (%s, %s, %s)""".format(self._column_table,
self._id_column)
sql_alter = """ALTER TABLE qiita.{0} ADD COLUMN {1} {2}"""
for category, dtype in zip(new_cols, datatypes):
conn_handler.add_to_queue(
queue_name, sql_cols, (self._id, category, dtype))
conn_handler.add_to_queue(
queue_name, sql_alter.format(table_name, category, dtype))

if existing_samples:
warnings.warn(
"No values have been modified for samples '%s'. However, "
"the following columns have been added to them: '%s'"
% (", ".join(existing_samples), ", ".join(new_cols)),
QiitaDBWarning)
# The values for the new columns are the only ones that get
# added to the database. None of the existing values will be
# modified (see update for that functionality)
min_md_template = md_template[new_cols].loc[existing_samples]
values = as_python_types(min_md_template, new_cols)
values.append(existing_samples)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an explanation for this and the following line? It's a bit confusing.

# psycopg2 requires a list of tuples, in which each tuple is a
# set of values to use in the string formatting of the query.
# We have all the values in different lists (but in the same
# order) so use zip to create the list of tuples that psycopg2
# requires.
values = [v for v in zip(*values)]
set_str = ["{0} = %s".format(col) for col in new_cols]
sql = """UPDATE qiita.{0}
SET {1}
WHERE sample_id=%s""".format(table_name,
",".join(set_str))
conn_handler.add_to_queue(queue_name, sql, values, many=True)
elif existing_samples:
warnings.warn(
"The following samples already exist in the template and "
"will be ignored: %s" % ", ".join(existing_samples),
QiitaDBWarning)

if new_samples:
num_samples = len(new_samples)
new_samples = sorted(new_samples)
# At this point we only want the information from the new samples
md_template = md_template.loc[new_samples]

# Insert values on required columns
values = as_python_types(md_template, db_cols)
values.insert(0, new_samples)
values.insert(0, [self._id] * num_samples)
# psycopg2 requires a list of tuples, in which each tuple is a
# tuple of values to use in the string formatting of the query. We
# have all the values in different lists (but in the same order) so
# use zip to create the list of tuples that psycopg2 requires.
values = [v for v in zip(*values)]
sql = """INSERT INTO qiita.{0} ({1}, sample_id, {2})
VALUES (%s, %s, {3})""".format(
self._table, self._id_column, ', '.join(db_cols),
', '.join(['%s'] * len(db_cols)))
conn_handler.add_to_queue(queue_name, sql, values, many=True)

headers = sorted(set(headers).difference(db_cols))

# Insert values on custom table
values = as_python_types(md_template, headers)
values.insert(0, new_samples)
values = [v for v in zip(*values)]
sql = """INSERT INTO qiita.{0} (sample_id, {1})
VALUES (%s, {2})""".format(
table_name, ", ".join(headers),
', '.join(["%s"] * len(headers)))
conn_handler.add_to_queue(queue_name, sql, values, many=True)

@classmethod
def exists(cls, obj_id):
r"""Checks if already exists a MetadataTemplate for the provided object
Expand Down Expand Up @@ -1193,7 +1314,7 @@ def update_category(self, category, samples_and_values):
"""
if not set(self.keys()).issuperset(samples_and_values):
missing = set(self.keys()) - set(samples_and_values)
table_name = self._table_name(self.study_id)
table_name = self._table_name(self._id)
raise QiitaDBUnknownIDError(missing, table_name)

conn_handler = SQLConnectionHandler()
Expand Down
79 changes: 5 additions & 74 deletions qiita_db/metadata_template/sample_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,19 @@
# -----------------------------------------------------------------------------

from __future__ import division
from future.builtins import zip
from os.path import join
from time import strftime

import pandas as pd
import warnings

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
QiitaDBWarning, QiitaDBUnknownIDError)
QiitaDBUnknownIDError)
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.util import (get_table_cols, get_required_sample_info_status,
get_mountpoint, scrub_data)
from qiita_db.util import get_required_sample_info_status, get_mountpoint
from qiita_db.study import Study
from qiita_db.data import RawData
from .base_metadata_template import BaseSample, MetadataTemplate
from .util import as_python_types, get_datatypes
from .prep_template import PrepTemplate


Expand Down Expand Up @@ -242,76 +238,11 @@ def extend(self, md_template):
conn_handler.create_queue(queue_name)

md_template = self._clean_validate_template(md_template, self.study_id,
self.study_id,
conn_handler)

# Raise warning and filter out existing samples
sample_ids = md_template.index.tolist()
sql = ("SELECT sample_id FROM qiita.required_sample_info WHERE "
"study_id = %d" % self.id)
curr_samples = set(s[0] for s in conn_handler.execute_fetchall(sql))
existing_samples = curr_samples.intersection(sample_ids)
if existing_samples:
warnings.warn(
"The following samples already exist and will be ignored: "
"%s" % ", ".join(curr_samples.intersection(
sorted(existing_samples))), QiitaDBWarning)
md_template.drop(existing_samples, inplace=True)

# Get some useful information from the metadata template
sample_ids = md_template.index.tolist()
num_samples = len(sample_ids)
headers = list(md_template.keys())

# Get the required columns from the DB
db_cols = get_table_cols(self._table, conn_handler)
# Remove the sample_id and study_id columns
db_cols.remove('sample_id')
db_cols.remove(self._id_column)

# Insert values on required columns
values = as_python_types(md_template, db_cols)
values.insert(0, sample_ids)
values.insert(0, [self.study_id] * num_samples)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
"VALUES (%s, %s, {3})".format(self._table, self._id_column,
', '.join(db_cols),
', '.join(['%s'] * len(db_cols))),
values, many=True)

# Add missing columns to the sample template dynamic table
headers = list(set(headers).difference(db_cols))
datatypes = get_datatypes(md_template.ix[:, headers])
table_name = self._table_name(self.study_id)
new_cols = set(md_template.columns).difference(
set(self.metadata_headers()))
dtypes_dict = dict(zip(md_template.ix[:, headers], datatypes))
for category in new_cols:
# Insert row on *_columns table
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} ({1}, column_name, column_type) "
"VALUES (%s, %s, %s)".format(self._column_table,
self._id_column),
(self.study_id, category, dtypes_dict[category]))
# Insert row on dynamic table
conn_handler.add_to_queue(
queue_name,
"ALTER TABLE qiita.{0} ADD COLUMN {1} {2}".format(
table_name, scrub_data(category), dtypes_dict[category]))

# Insert values on custom table
values = as_python_types(md_template, headers)
values.insert(0, sample_ids)
values = [v for v in zip(*values)]
conn_handler.add_to_queue(
queue_name,
"INSERT INTO qiita.{0} (sample_id, {1}) "
"VALUES (%s, {2})".format(table_name, ", ".join(headers),
', '.join(["%s"] * len(headers))),
values, many=True)
self._add_common_extend_steps_to_queue(md_template, conn_handler,
queue_name)
conn_handler.execute_queue(queue_name)

self.generate_files()
Expand Down
Loading