Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions qiita_db/metadata_template/base_metadata_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,13 +647,24 @@ def delete_column(self, column_name):
If the `column_name` doesn't exist
QiitaDBOperationNotPermittedError
If the info file can't be updated
If the column_name is selected as a specimen_id_column in the
study.
"""
if column_name not in self.categories():
raise qdb.exceptions.QiitaDBColumnError(
"'%s' not in info file %d" % (column_name, self._id))
if not self.can_be_updated(columns={column_name}):
raise qdb.exceptions.QiitaDBOperationNotPermittedError(
'%s cannot be deleted' % column_name)

# if a tube identifier column is selected disallow its deletion
specimen_id_column = qdb.study.Study(self.study_id).specimen_id_column
if specimen_id_column == column_name:
raise qdb.exceptions.QiitaDBOperationNotPermittedError(
'"%s" cannot be deleted, this column is currently selected'
' as the tube identifier (specimen_id_column)' %
column_name)

with qdb.sql_connection.TRN:
sql = 'ALTER TABLE qiita.%s%d DROP COLUMN %s' % (
self._table_prefix, self._id, column_name)
Expand Down Expand Up @@ -1516,3 +1527,17 @@ def validate(self, restriction_dict):
"columns:\n\t%s.\nSee the Templates tutorial for a description"
" of these fields." % ";\n\t".join(warning_msg),
qdb.exceptions.QiitaDBWarning)

def unique_columns(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is gonna be extremely slow for large info files. Examples from the real system:

from qiita_db.metadata_template.sample_template import SampleTemplate

def unique_columns(st):
    df = st.to_dataframe()
    n = len(df)
    return {k for k, v in df.nunique().iteritems() if v == n}

In [22]: %timeit unique_columns(SampleTemplate(550)) # moving pics
1 loop, best of 3: 662 ms per loop

In [23]: %timeit unique_columns(SampleTemplate(10317)) # AGP
1 loop, best of 3: 1min 16s per loop

As you can imagine, something blocking for 1min 16s is not desirable, even more so in the sample info page, which is visited very often and where we normally expect fast replies.

Options:

  • Do everything in DB - preferred but not sure if much faster
  • Do not display only unique values but all columns - an option, but I don't love it
  • Store unique values in redis and change every time the sample id is replaced as part of the update - kind of cool to store all summaries as part of the update
  • Other?

"""Get the unique columns in a template object

Returns
-------
set of str
Returns a set of the columns that are unique in a template object.
"""

df = self.to_dataframe()

n = len(df)
return {k for k, v in df.nunique().items() if v == n}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

items -> iteritems, because items doesn't exist in the pandas version we have in qiita

7 changes: 7 additions & 0 deletions qiita_db/metadata_template/test/test_prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -1508,6 +1508,13 @@ def test_name_setter(self):
pt.name = 'Prep information 1'
self.assertEqual(pt.name, 'Prep information 1')

def test_unique(self):
    """unique_columns of prep template 1 reports exactly {'barcode'}"""
    prep = qdb.metadata_template.prep_template.PrepTemplate(1)

    self.assertEqual(prep.unique_columns(), {'barcode'})


EXP_PREP_TEMPLATE = (
'sample_name\tbarcode\tcenter_name\tcenter_project_name\t'
Expand Down
20 changes: 20 additions & 0 deletions qiita_db/metadata_template/test/test_sample_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -2159,6 +2159,19 @@ def test_delete_column(self):
st.delete_column('dna_extracted')
self.assertNotIn('dna_extracted', st.categories())

def test_delete_column_specimen_id(self):
    """Deleting the column selected as specimen_id_column is rejected

    Creates a sample template for ``self.new_study``, selects 'latitude'
    as the study's specimen_id_column and checks that ``delete_column``
    raises QiitaDBOperationNotPermittedError with the expected message.
    """
    st = qdb.metadata_template.sample_template.SampleTemplate.create(
        self.metadata, self.new_study)
    self.new_study.specimen_id_column = 'latitude'

    # try/finally: the selection has to be cleared even if an assertion
    # below fails, otherwise it leaks into whatever runs afterwards
    try:
        # raw strings so that \( and \) reach the regex engine untouched
        # instead of relying on an invalid string escape sequence
        with self.assertRaisesRegexp(
                qdb.exceptions.QiitaDBOperationNotPermittedError,
                r'"latitude" cannot be deleted, this column is currently '
                r'selected as the tube identifier \(specimen_id_column\)'):
            st.delete_column('latitude')

        # the rejected deletion must leave the column in place
        self.assertIn('latitude', st.categories())
    finally:
        self.new_study.specimen_id_column = None

def test_delete_sample(self):
QE = qdb.exceptions
st = qdb.metadata_template.sample_template.SampleTemplate(1)
Expand Down Expand Up @@ -2187,6 +2200,13 @@ def test_delete_sample(self):
with self.assertRaises(QE.QiitaDBOperationNotPermittedError):
st.delete_sample('1.SKM5.640177')

def test_unique(self):
    """unique_columns of sample template 1 reports the expected set"""
    sample_info = qdb.metadata_template.sample_template.SampleTemplate(1)

    self.assertEqual(
        sample_info.unique_columns(),
        {'anonymized_name', 'host_subject_id', 'longitude'})


EXP_SAMPLE_TEMPLATE = (
"sample_name\tcollection_timestamp\tdescription\tdna_extracted\t"
Expand Down
60 changes: 60 additions & 0 deletions qiita_db/study.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class Study(qdb.base.QiitaObject):
status
title
owner
specimen_id_column

Methods
-------
Expand Down Expand Up @@ -627,6 +628,65 @@ def publications(self, values):
qdb.sql_connection.TRN.add(sql, sql_args, many=True)
qdb.sql_connection.TRN.execute()

@property
def specimen_id_column(self):
    """The sample information column selected as specimen identifier

    Returns
    -------
    str
        The value stored in `qiita.study.specimen_id_column` for this
        study
    """
    query = ("SELECT specimen_id_column\n"
             "FROM qiita.study\n"
             "WHERE study_id = %s")

    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(query, [self._id])
        return qdb.sql_connection.TRN.execute_fetchlast()

@specimen_id_column.setter
def specimen_id_column(self, value):
    """Selects which sample information column holds the specimen ids

    Parameters
    ----------
    value : str or None
        The name of the column with the specimen identifiers. When None,
        the column checks are skipped and None is written to the study
        row.

    Raises
    ------
    QiitaDBLookupError
        If the study does not have sample information (checked first,
        also when `value` is None).
        If `value` is not one of the sample information categories.
    QiitaDBColumnError
        If the values of the category are not unique.
    """
    sample_info = self.sample_template
    if sample_info is None:
        raise qdb.exceptions.QiitaDBLookupError(
            "Study does not have a sample information.")

    if value is not None:
        if value not in sample_info.categories():
            raise qdb.exceptions.QiitaDBLookupError(
                "Category '%s' is not present in the sample information."
                % value)

        # a usable identifier needs as many distinct values as entries
        per_sample = sample_info.get_category(value)
        distinct = set(per_sample.values())
        if len(per_sample) != len(distinct):
            raise qdb.exceptions.QiitaDBColumnError(
                "The category does not contain unique values.")

    update = ("UPDATE qiita.study SET\n"
              "specimen_id_column = %s\n"
              "WHERE study_id = %s")

    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(update, [value, self._id])
        qdb.sql_connection.TRN.execute()

@property
def investigation(self):
""" Returns Investigation this study is part of
Expand Down
7 changes: 7 additions & 0 deletions qiita_db/support_files/patches/66.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- August 22, 2018
-- Adds specimen_id_column to the study table; labman needs it to plate
-- samples.

ALTER TABLE qiita.study
    ADD COLUMN specimen_id_column varchar(256);

COMMENT ON COLUMN qiita.study.specimen_id_column
    IS 'The name of the column that describes the specimen identifiers (such as what is written on the tubes).';

5 changes: 4 additions & 1 deletion qiita_db/support_files/qiita-db.dbs
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,9 @@ Controlled Vocabulary]]></comment>
<column name="study_abstract" type="text" jt="12" mandatory="y" />
<column name="vamps_id" type="varchar" jt="12" />
<column name="ebi_study_accession" type="varchar" jt="12" />
<column name="specimen_id_column" type="varchar" length="256" jt="12" >
<comment><![CDATA[The name of the column that describes the specimen identifiers (such as what is written on the tubes).]]></comment>
</column>
<index name="pk_study" unique="PRIMARY_KEY" >
<column name="study_id" />
</index>
Expand Down Expand Up @@ -1691,7 +1694,6 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="study_environmental_package" color="b2cdf7" x="2085" y="45" />
<entity schema="qiita" name="environmental_package" color="b2cdf7" x="2160" y="180" />
<entity schema="qiita" name="study_person" color="c0d4f3" x="2010" y="135" />
<entity schema="qiita" name="study" color="d0def5" x="1785" y="240" />
<entity schema="qiita" name="per_study_tags" color="b2cdf7" x="2145" y="450" />
<entity schema="qiita" name="study_tags" color="b2cdf7" x="2295" y="480" />
<entity schema="qiita" name="timeseries_type" color="c0d4f3" x="2010" y="540" />
Expand Down Expand Up @@ -1725,6 +1727,7 @@ Controlled Vocabulary]]></comment>
<entity schema="qiita" name="software_command" color="b2cdf7" x="2160" y="1245" />
<entity schema="qiita" name="processing_job_validator" color="b2cdf7" x="2160" y="1410" />
<entity schema="qiita" name="processing_job" color="b2cdf7" x="1935" y="1140" />
<entity schema="qiita" name="study" color="d0def5" x="1785" y="240" />
<group name="Group_analyses" color="c4e0f9" >
<comment>analysis tables</comment>
<entity schema="qiita" name="analysis" />
Expand Down
Loading