Skip to content

Search stopgap #1014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 1, 2015
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fnish stopgap
  • Loading branch information
squirrelo committed Mar 25, 2015
commit 695fa2e5636697a49fd1fa4db2ad4785be1525b3
21 changes: 21 additions & 0 deletions qiita_db/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,3 +1173,24 @@ def processed_date(self):
return conn_handler.execute_fetchone(
"SELECT processed_date FROM qiita.{0} WHERE "
"processed_data_id=%s".format(self._table), (self.id,))[0]

@property
def samples(self):
"""Return the samples available according to prep template

Returns
-------
set
all sample_ids available for the processed data
"""
conn_handler = SQLConnectionHandler()
# Get the prep template id for teh dynamic table lookup
sql = """SELECT ptp.prep_template_id FROM
qiita.prep_template_preprocessed_data ptp JOIN
qiita.preprocessed_processed_data ppd USING (preprocessed_data_id)
WHERE ppd.processed_data_id = %s"""
prep_id = conn_handler.execute_fetchone(sql, [self._id])[0]

# Get samples from dynamic table
sql = "SELECT sample_id FROM qiita.prep_%d" % prep_id
return set(s[0] for s in conn_handler.execute_fetchall(sql))
12 changes: 12 additions & 0 deletions qiita_db/test/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,18 @@ def test_processed_date(self):
pd = ProcessedData(1)
self.assertEqual(pd.processed_date, datetime(2012, 10, 1, 9, 30, 27))

def test_samples(self):
pd = ProcessedData(1)
obs = pd.samples
exp = {
'1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
'1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
'1.SKM2.640199', '1.SKD2.640178', '1.SKB7.640196', '1.SKD4.640185',
'1.SKB8.640193', '1.SKM3.640197', '1.SKD5.640186', '1.SKB1.640202',
'1.SKM1.640183', '1.SKD1.640179', '1.SKD3.640198', '1.SKB5.640181',
'1.SKB4.640189', '1.SKB9.640200', '1.SKM9.640192', '1.SKD8.640184',
'1.SKM5.640177', '1.SKM7.640188', '1.SKD7.640191'}
self.assertEqual(obs, exp)

if __name__ == '__main__':
main()
27 changes: 13 additions & 14 deletions qiita_ware/search.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import Counter, defaultdict

from future.utils import viewvalues
from future.utils import viewvalues, viewitems

from qiita_db.study import Study
from qiita_db.data import ProcessedData
Expand Down Expand Up @@ -64,34 +64,33 @@ def filter_by_processed_data(results, datatypes=None):
study_proc_ids : dict of dicts of lists
Processed data ids with samples for each study, in the format
{study_id: {datatype: [proc_id, proc_id, ...], ...}, ...}
proc_data_samples : dict of lists of lists
proc_data_samples : dict of lists
Samples available in each processed data id, in the format
{proc_data_id: [[samp_id1, meta1, meta2, ...],
[samp_id2, meta1, meta2, ...], ...}
dtcount : count of all samples available for each datatype
{proc_data_id: [samp_id1, samp_id2, ...], ...}
samples_meta : dict of lists
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from the code below, the sample IDs contained in here will be a superset of the sample IDs described in proc_data_samples. Are those objects expected to be 1-1? If not, it would be useful to include a mention of this in the docstring

sample metadata in same order as the metadata given by search
Format {samp_id: [meta1, meta2, ...], ...}
"""
study_proc_ids = {}
proc_data_samples = {}
dtcount = defaultdict(int)
for study_id in results:
samples_meta = {}
for study_id, study_samples in viewitems(results):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is study_samples an iterable of the sample ids? it looks like it is the metadata values from its use

study = Study(study_id)
samples_meta.update({s[0]: s[1:] for s in study_samples})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this structure holds {sample_id: [metadata_values]} without any correspondence to the metadata category the value is associated with?

study_proc_ids[study_id] = defaultdict(list)
for proc_data_id in study.processed_data():
proc_data = ProcessedData(proc_data_id)
datatype = proc_data.data_type()
# skip processed data if it doesn't fit the given datatypes
if datatypes is not None and datatype not in datatypes:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this conditional can be simplified if datatypes is made the empty set at the beginning of the method in the case where datatypes is None

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, that would not work based on the description in the docstring. You would instead want to set datatypes = set(all_datatypes)

continue
samps_available = set(proc_data.samples)
proc_data_samples[proc_data_id] = [s for s in results[study_id]
samps_available = proc_data.samples
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this variable lookup friendly? E.g., set or dict or something comparable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it's a set.

proc_data_samples[proc_data_id] = [s[0] for s in study_samples
if s[0] in samps_available]
# Add number of samples left to the total for that datatype
pidlen = len(proc_data_samples[proc_data_id])
dtcount[datatype] += pidlen
if pidlen == 0:
if len(proc_data_samples[proc_data_id]) == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be better to set this if its created, instead of setting empty first

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, sorry, misunderstood what's going on

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...but still, why set in proc_data_samples if you're only going to delete?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't know if all samples for the processed data gets filtered out or not when filling the key, so delete it if nothing is there. Can probably make the list, then attach though and be better.

# all samples filtered so remove it as a result
del(proc_data_samples[proc_data_id])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no assurance that proc_data_id has been set in proc_data_samples from what I can see which would lead to a KeyError

else:
# add the processed data to the list for the study
study_proc_ids[study_id][datatype].append(proc_data_id)
return study_proc_ids, proc_data_samples, dtcount
return study_proc_ids, proc_data_samples, samples_meta
42 changes: 24 additions & 18 deletions qiita_ware/test/test_search.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from unittest import TestCase, main

from qiita_core.util import qiita_test_checker
from qiita_db.search import QiitaStudySearch
from qiita_db.user import User
from qiita_ware.search import search, count_metadata, filter_by_processed_data
from qiita_ware.search import count_metadata, filter_by_processed_data


@qiita_test_checker()
class SearchTest(TestCase):
"""Tests that the search helpers work as expected"""
def test_count_metadata(self):
search = QiitaStudySearch()
results, meta_cols = search('study_id = 1 AND ph > 0',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be much more useful if the inputs to count_metadata where explicitly described instead of fetching from the database. It is difficult to evaluate this test without knowing the inputs.

User('test@foo.bar'))
fullcount, studycount = count_metadata(results, meta_cols)
Expand All @@ -19,28 +21,32 @@ def test_count_metadata(self):
self.assertEqual(studycount, expstudy)

def test_filter_by_processed_data(self):
search = QiitaStudySearch()
results, meta_cols = search('study_id = 1', User('test@foo.bar'))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment here

study_proc_ids, proc_data_samples, dtcounts = filter_by_processed_data(
study_proc_ids, proc_data_samples, meta = filter_by_processed_data(
results)
exp_spid = {1: {'18S': [1]}}
exp_pds = {1: [['1.SKM7.640188', 1], ['1.SKD9.640182', 1],
['1.SKM8.640201', 1], ['1.SKB8.640193', 1],
['1.SKD2.640178', 1], ['1.SKM3.640197', 1],
['1.SKM4.640180', 1], ['1.SKB9.640200', 1],
['1.SKB4.640189', 1], ['1.SKB5.640181', 1],
['1.SKB6.640176', 1], ['1.SKM2.640199', 1],
['1.SKM5.640177', 1], ['1.SKB1.640202', 1],
['1.SKD8.640184', 1], ['1.SKD4.640185', 1],
['1.SKB3.640195', 1], ['1.SKM1.640183', 1],
['1.SKB7.640196', 1], ['1.SKD3.640198', 1],
['1.SKD7.640191', 1], ['1.SKD6.640190', 1],
['1.SKB2.640194', 1], ['1.SKM9.640192', 1],
['1.SKM6.640187', 1], ['1.SKD5.640186', 1],
['1.SKD1.640179', 1]]}
exp_dtc = {'18S': 27}
exp_pds = {1: [
'1.SKM7.640188','1.SKD9.640182', '1.SKM8.640201', '1.SKB8.640193',
'1.SKD2.640178', '1.SKM3.640197', '1.SKM4.640180', '1.SKB9.640200',
'1.SKB4.640189', '1.SKB5.640181', '1.SKB6.640176', '1.SKM2.640199',
'1.SKM5.640177', '1.SKB1.640202', '1.SKD8.640184', '1.SKD4.640185',
'1.SKB3.640195', '1.SKM1.640183', '1.SKB7.640196', '1.SKD3.640198',
'1.SKD7.640191', '1.SKD6.640190', '1.SKB2.640194', '1.SKM9.640192',
'1.SKM6.640187', '1.SKD5.640186', '1.SKD1.640179']}
exp_meta = {
'1.SKM7.640188': [1],'1.SKD9.640182': [1], '1.SKM8.640201': [1],
'1.SKB8.640193': [1], '1.SKD2.640178': [1], '1.SKM3.640197': [1],
'1.SKM4.640180': [1], '1.SKB9.640200': [1], '1.SKB4.640189': [1],
'1.SKB5.640181': [1], '1.SKB6.640176': [1], '1.SKM2.640199': [1],
'1.SKM5.640177': [1], '1.SKB1.640202': [1], '1.SKD8.640184': [1],
'1.SKD4.640185': [1], '1.SKB3.640195': [1], '1.SKM1.640183': [1],
'1.SKB7.640196': [1], '1.SKD3.640198': [1], '1.SKD7.640191': [1],
'1.SKD6.640190': [1], '1.SKB2.640194': [1], '1.SKM9.640192': [1],
'1.SKM6.640187': [1], '1.SKD5.640186': [1], '1.SKD1.640179': [1]}
self.assertEqual(study_proc_ids, exp_spid)
self.assertEqual(proc_data_samples, exp_pds)
self.assertEqual(dtcounts, exp_dtc)
self.assertEqual(meta, exp_meta)


if __name__ == '__main__':
Expand Down