Skip to content

Commit 162144c

Browse files
committed
Merge pull request #1072 from josenavas/fix-extend
Fix extend functionality
2 parents 4bed8a5 + f8f0a6d commit 162144c

File tree

3 files changed

+542
-170
lines changed

3 files changed

+542
-170
lines changed

qiita_db/metadata_template/base_metadata_template.py

Lines changed: 126 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,24 +37,25 @@
3737

3838
from __future__ import division
3939
from future.utils import viewitems, viewvalues
40+
from future.builtins import zip
4041
from os.path import join
4142
from functools import partial
4243
from collections import defaultdict
4344
from copy import deepcopy
4445

4546
import pandas as pd
4647
from skbio.util import find_duplicates
48+
import warnings
4749

4850
from qiita_core.exceptions import IncompetentQiitaDeveloperError
4951

5052
from qiita_db.exceptions import (QiitaDBUnknownIDError, QiitaDBColumnError,
51-
QiitaDBNotImplementedError,
52-
QiitaDBExecutionError,
53+
QiitaDBNotImplementedError, QiitaDBError,
54+
QiitaDBExecutionError, QiitaDBWarning,
5355
QiitaDBDuplicateHeaderError)
5456
from qiita_db.base import QiitaObject
5557
from qiita_db.sql_connection import SQLConnectionHandler
56-
from qiita_db.util import (exists_table, get_table_cols,
57-
convert_to_id,
58+
from qiita_db.util import (exists_table, get_table_cols, convert_to_id,
5859
get_mountpoint, insert_filepaths)
5960
from qiita_db.logger import LogEntry
6061
from .util import (as_python_types, get_datatypes, get_invalid_sample_names,
@@ -772,6 +773,126 @@ def _add_common_creation_steps_to_queue(cls, md_template, obj_id,
772773
', '.join(["%s"] * len(headers))),
773774
values, many=True)
774775

776+
def _add_common_extend_steps_to_queue(self, md_template, conn_handler,
                                      queue_name):
    r"""Adds the common extend steps to the queue in conn_handler

    New columns are registered and added to the dynamic table first; the
    values of those new columns are then set for the samples that already
    exist, and finally the rows for the new samples are inserted. The SQL
    statements are only added to the queue; this method does not execute
    the queue.

    Parameters
    ----------
    md_template : DataFrame
        The metadata template file contents indexed by sample ids
    conn_handler : SQLConnectionHandler
        The connection handler object connected to the DB
    queue_name : str
        The queue where the SQL statements will be added

    Raises
    ------
    QiitaDBError
        If no new samples or new columns are present in `md_template`

    Warns
    -----
    QiitaDBWarning
        If `md_template` contains samples that already exist in the
        template. Existing values of those samples are never modified;
        only the values of newly added columns are set for them.
    """
    # Check if we are adding new samples. Sorted lists are used instead of
    # sets so that the rows selected from the DataFrame, the values sent
    # to the DB and the warning text all share one deterministic order
    # (sets are also not accepted as indexers by recent pandas releases).
    sample_ids = md_template.index.tolist()
    curr_samples = set(self.keys())
    existing_samples = sorted(curr_samples.intersection(sample_ids))
    new_samples = sorted(set(sample_ids).difference(curr_samples))

    # Check if we are adding new columns, by getting all the columns from
    # the database
    table_name = self._table_name(self._id)
    db_cols = get_table_cols(self._table, conn_handler)
    db_cols.remove('sample_id')
    db_cols.remove(self._id_column)
    curr_cols = set(
        get_table_cols(table_name, conn_handler)).union(db_cols)
    headers = md_template.keys().tolist()
    # Sorting the new columns to enforce an order
    new_cols = sorted(set(headers).difference(curr_cols))

    if not new_cols and not new_samples:
        raise QiitaDBError(
            "No new samples or new columns found in the template. If you "
            "want to update existing values, you should use the 'update' "
            "functionality.")

    if new_cols:
        # If we are adding new columns, add them first (simplifies code)
        datatypes = get_datatypes(md_template.loc[:, new_cols])
        sql_cols = """INSERT INTO qiita.{0} ({1}, column_name, column_type)
                      VALUES (%s, %s, %s)""".format(self._column_table,
                                                    self._id_column)
        # NOTE(review): the column name is interpolated directly into the
        # DDL statement; presumably it has been validated upstream (e.g.
        # by _clean_validate_template) - confirm.
        sql_alter = """ALTER TABLE qiita.{0} ADD COLUMN {1} {2}"""
        for category, dtype in zip(new_cols, datatypes):
            conn_handler.add_to_queue(
                queue_name, sql_cols, (self._id, category, dtype))
            conn_handler.add_to_queue(
                queue_name, sql_alter.format(table_name, category, dtype))

        if existing_samples:
            warnings.warn(
                "No values have been modified for samples '%s'. However, "
                "the following columns have been added to them: '%s'"
                % (", ".join(existing_samples), ", ".join(new_cols)),
                QiitaDBWarning)
            # The values for the new columns are the only ones that get
            # added to the database. None of the existing values will be
            # modified (see update for that functionality)
            min_md_template = md_template.loc[existing_samples, new_cols]
            values = as_python_types(min_md_template, new_cols)
            # The sample id goes last because it feeds the WHERE clause
            values.append(existing_samples)
            # psycopg2 requires a list of tuples, in which each tuple is a
            # set of values to use in the string formatting of the query.
            # We have all the values in different lists (but in the same
            # order) so use zip to create the list of tuples that psycopg2
            # requires.
            values = list(zip(*values))
            set_str = ["{0} = %s".format(col) for col in new_cols]
            sql = """UPDATE qiita.{0}
                     SET {1}
                     WHERE sample_id=%s""".format(table_name,
                                                  ",".join(set_str))
            conn_handler.add_to_queue(queue_name, sql, values, many=True)
    elif existing_samples:
        warnings.warn(
            "The following samples already exist in the template and "
            "will be ignored: %s" % ", ".join(existing_samples),
            QiitaDBWarning)

    if new_samples:
        num_samples = len(new_samples)
        # At this point we only want the information from the new samples
        md_template = md_template.loc[new_samples]

        # Insert values on required columns
        values = as_python_types(md_template, db_cols)
        values.insert(0, new_samples)
        values.insert(0, [self._id] * num_samples)
        # psycopg2 requires a list of tuples, in which each tuple is a
        # tuple of values to use in the string formatting of the query. We
        # have all the values in different lists (but in the same order) so
        # use zip to create the list of tuples that psycopg2 requires.
        values = list(zip(*values))
        sql = """INSERT INTO qiita.{0} ({1}, sample_id, {2})
                 VALUES (%s, %s, {3})""".format(
            self._table, self._id_column, ', '.join(db_cols),
            ', '.join(['%s'] * len(db_cols)))
        conn_handler.add_to_queue(queue_name, sql, values, many=True)

        # Columns that live in the dynamic (per-template) table
        headers = sorted(set(headers).difference(db_cols))

        # Insert values on custom table. The column list is built as a
        # whole (sample_id + headers) so the statement is still valid SQL
        # when the template has no dynamic columns at all.
        values = as_python_types(md_template, headers)
        values.insert(0, new_samples)
        values = list(zip(*values))
        dyn_cols = ['sample_id'] + headers
        sql = """INSERT INTO qiita.{0} ({1})
                 VALUES ({2})""".format(
            table_name, ", ".join(dyn_cols),
            ', '.join(["%s"] * len(dyn_cols)))
        conn_handler.add_to_queue(queue_name, sql, values, many=True)
895+
775896
@classmethod
776897
def exists(cls, obj_id):
777898
r"""Checks if already exists a MetadataTemplate for the provided object
@@ -1193,7 +1314,7 @@ def update_category(self, category, samples_and_values):
11931314
"""
11941315
if not set(self.keys()).issuperset(samples_and_values):
11951316
missing = set(self.keys()) - set(samples_and_values)
1196-
table_name = self._table_name(self.study_id)
1317+
table_name = self._table_name(self._id)
11971318
raise QiitaDBUnknownIDError(missing, table_name)
11981319

11991320
conn_handler = SQLConnectionHandler()

qiita_db/metadata_template/sample_template.py

Lines changed: 5 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,19 @@
77
# -----------------------------------------------------------------------------
88

99
from __future__ import division
10-
from future.builtins import zip
1110
from os.path import join
1211
from time import strftime
1312

1413
import pandas as pd
15-
import warnings
1614

1715
from qiita_core.exceptions import IncompetentQiitaDeveloperError
1816
from qiita_db.exceptions import (QiitaDBDuplicateError, QiitaDBError,
19-
QiitaDBWarning, QiitaDBUnknownIDError)
17+
QiitaDBUnknownIDError)
2018
from qiita_db.sql_connection import SQLConnectionHandler
21-
from qiita_db.util import (get_table_cols, get_required_sample_info_status,
22-
get_mountpoint, scrub_data)
19+
from qiita_db.util import get_required_sample_info_status, get_mountpoint
2320
from qiita_db.study import Study
2421
from qiita_db.data import RawData
2522
from .base_metadata_template import BaseSample, MetadataTemplate
26-
from .util import as_python_types, get_datatypes
2723
from .prep_template import PrepTemplate
2824

2925

@@ -242,76 +238,11 @@ def extend(self, md_template):
242238
conn_handler.create_queue(queue_name)
243239

244240
md_template = self._clean_validate_template(md_template, self.study_id,
241+
self.study_id,
245242
conn_handler)
246243

247-
# Raise warning and filter out existing samples
248-
sample_ids = md_template.index.tolist()
249-
sql = ("SELECT sample_id FROM qiita.required_sample_info WHERE "
250-
"study_id = %d" % self.id)
251-
curr_samples = set(s[0] for s in conn_handler.execute_fetchall(sql))
252-
existing_samples = curr_samples.intersection(sample_ids)
253-
if existing_samples:
254-
warnings.warn(
255-
"The following samples already exist and will be ignored: "
256-
"%s" % ", ".join(curr_samples.intersection(
257-
sorted(existing_samples))), QiitaDBWarning)
258-
md_template.drop(existing_samples, inplace=True)
259-
260-
# Get some useful information from the metadata template
261-
sample_ids = md_template.index.tolist()
262-
num_samples = len(sample_ids)
263-
headers = list(md_template.keys())
264-
265-
# Get the required columns from the DB
266-
db_cols = get_table_cols(self._table, conn_handler)
267-
# Remove the sample_id and study_id columns
268-
db_cols.remove('sample_id')
269-
db_cols.remove(self._id_column)
270-
271-
# Insert values on required columns
272-
values = as_python_types(md_template, db_cols)
273-
values.insert(0, sample_ids)
274-
values.insert(0, [self.study_id] * num_samples)
275-
values = [v for v in zip(*values)]
276-
conn_handler.add_to_queue(
277-
queue_name,
278-
"INSERT INTO qiita.{0} ({1}, sample_id, {2}) "
279-
"VALUES (%s, %s, {3})".format(self._table, self._id_column,
280-
', '.join(db_cols),
281-
', '.join(['%s'] * len(db_cols))),
282-
values, many=True)
283-
284-
# Add missing columns to the sample template dynamic table
285-
headers = list(set(headers).difference(db_cols))
286-
datatypes = get_datatypes(md_template.ix[:, headers])
287-
table_name = self._table_name(self.study_id)
288-
new_cols = set(md_template.columns).difference(
289-
set(self.metadata_headers()))
290-
dtypes_dict = dict(zip(md_template.ix[:, headers], datatypes))
291-
for category in new_cols:
292-
# Insert row on *_columns table
293-
conn_handler.add_to_queue(
294-
queue_name,
295-
"INSERT INTO qiita.{0} ({1}, column_name, column_type) "
296-
"VALUES (%s, %s, %s)".format(self._column_table,
297-
self._id_column),
298-
(self.study_id, category, dtypes_dict[category]))
299-
# Insert row on dynamic table
300-
conn_handler.add_to_queue(
301-
queue_name,
302-
"ALTER TABLE qiita.{0} ADD COLUMN {1} {2}".format(
303-
table_name, scrub_data(category), dtypes_dict[category]))
304-
305-
# Insert values on custom table
306-
values = as_python_types(md_template, headers)
307-
values.insert(0, sample_ids)
308-
values = [v for v in zip(*values)]
309-
conn_handler.add_to_queue(
310-
queue_name,
311-
"INSERT INTO qiita.{0} (sample_id, {1}) "
312-
"VALUES (%s, {2})".format(table_name, ", ".join(headers),
313-
', '.join(["%s"] * len(headers))),
314-
values, many=True)
244+
self._add_common_extend_steps_to_queue(md_template, conn_handler,
245+
queue_name)
315246
conn_handler.execute_queue(queue_name)
316247

317248
self.generate_files()

0 commit comments

Comments
 (0)