-
Notifications
You must be signed in to change notification settings - Fork 80
Search stopgap #1014
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Search stopgap #1014
Changes from 1 commit
0ef232d
695fa2e
99ea8ee
b673505
7a30ab6
632127c
5a4af3b
8edfa56
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from collections import Counter, defaultdict | ||
|
||
from future.utils import viewvalues | ||
from future.utils import viewvalues, viewitems | ||
|
||
from qiita_db.study import Study | ||
from qiita_db.data import ProcessedData | ||
|
@@ -64,34 +64,33 @@ def filter_by_processed_data(results, datatypes=None): | |
study_proc_ids : dict of dicts of lists | ||
Processed data ids with samples for each study, in the format | ||
{study_id: {datatype: [proc_id, proc_id, ...], ...}, ...} | ||
proc_data_samples : dict of lists of lists | ||
proc_data_samples : dict of lists | ||
Samples available in each processed data id, in the format | ||
{proc_data_id: [[samp_id1, meta1, meta2, ...], | ||
[samp_id2, meta1, meta2, ...], ...} | ||
dtcount : count of all samples available for each datatype | ||
{proc_data_id: [samp_id1, samp_id2, ...], ...} | ||
samples_meta : dict of lists | ||
sample metadata in same order as the metadata given by search | ||
Format {samp_id: [meta1, meta2, ...], ...} | ||
""" | ||
study_proc_ids = {} | ||
proc_data_samples = {} | ||
dtcount = defaultdict(int) | ||
for study_id in results: | ||
samples_meta = {} | ||
for study_id, study_samples in viewitems(results): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is |
||
study = Study(study_id) | ||
samples_meta.update({s[0]: s[1:] for s in study_samples}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this structure holds |
||
study_proc_ids[study_id] = defaultdict(list) | ||
for proc_data_id in study.processed_data(): | ||
proc_data = ProcessedData(proc_data_id) | ||
datatype = proc_data.data_type() | ||
# skip processed data if it doesn't fit the given datatypes | ||
if datatypes is not None and datatype not in datatypes: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this conditional can be simplified if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually, that would not work based on the description in the docstring. You would instead want to set |
||
continue | ||
samps_available = set(proc_data.samples) | ||
proc_data_samples[proc_data_id] = [s for s in results[study_id] | ||
samps_available = proc_data.samples | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this variable lookup friendly? E.g., There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it's a set. |
||
proc_data_samples[proc_data_id] = [s[0] for s in study_samples | ||
if s[0] in samps_available] | ||
# Add number of samples left to the total for that datatype | ||
pidlen = len(proc_data_samples[proc_data_id]) | ||
dtcount[datatype] += pidlen | ||
if pidlen == 0: | ||
if len(proc_data_samples[proc_data_id]) == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be better to set this if its created, instead of setting empty first There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh, sorry, misunderstood what's going on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ...but still, why set in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't know if all samples for the processed data gets filtered out or not when filling the key, so delete it if nothing is there. Can probably make the list, then attach though and be better. |
||
# all samples filtered so remove it as a result | ||
del(proc_data_samples[proc_data_id]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no assurance that |
||
else: | ||
# add the processed data to the list for the study | ||
study_proc_ids[study_id][datatype].append(proc_data_id) | ||
return study_proc_ids, proc_data_samples, dtcount | ||
return study_proc_ids, proc_data_samples, samples_meta |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,16 @@ | ||
from unittest import TestCase, main | ||
|
||
from qiita_core.util import qiita_test_checker | ||
from qiita_db.search import QiitaStudySearch | ||
from qiita_db.user import User | ||
from qiita_ware.search import search, count_metadata, filter_by_processed_data | ||
from qiita_ware.search import count_metadata, filter_by_processed_data | ||
|
||
|
||
@qiita_test_checker() | ||
class SearchTest(TestCase): | ||
"""Tests that the search helpers work as expected""" | ||
def test_count_metadata(self): | ||
search = QiitaStudySearch() | ||
results, meta_cols = search('study_id = 1 AND ph > 0', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be much more useful if the inputs to |
||
User('test@foo.bar')) | ||
fullcount, studycount = count_metadata(results, meta_cols) | ||
|
@@ -19,28 +21,32 @@ def test_count_metadata(self): | |
self.assertEqual(studycount, expstudy) | ||
|
||
def test_filter_by_processed_data(self): | ||
search = QiitaStudySearch() | ||
results, meta_cols = search('study_id = 1', User('test@foo.bar')) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment here |
||
study_proc_ids, proc_data_samples, dtcounts = filter_by_processed_data( | ||
study_proc_ids, proc_data_samples, meta = filter_by_processed_data( | ||
results) | ||
exp_spid = {1: {'18S': [1]}} | ||
exp_pds = {1: [['1.SKM7.640188', 1], ['1.SKD9.640182', 1], | ||
['1.SKM8.640201', 1], ['1.SKB8.640193', 1], | ||
['1.SKD2.640178', 1], ['1.SKM3.640197', 1], | ||
['1.SKM4.640180', 1], ['1.SKB9.640200', 1], | ||
['1.SKB4.640189', 1], ['1.SKB5.640181', 1], | ||
['1.SKB6.640176', 1], ['1.SKM2.640199', 1], | ||
['1.SKM5.640177', 1], ['1.SKB1.640202', 1], | ||
['1.SKD8.640184', 1], ['1.SKD4.640185', 1], | ||
['1.SKB3.640195', 1], ['1.SKM1.640183', 1], | ||
['1.SKB7.640196', 1], ['1.SKD3.640198', 1], | ||
['1.SKD7.640191', 1], ['1.SKD6.640190', 1], | ||
['1.SKB2.640194', 1], ['1.SKM9.640192', 1], | ||
['1.SKM6.640187', 1], ['1.SKD5.640186', 1], | ||
['1.SKD1.640179', 1]]} | ||
exp_dtc = {'18S': 27} | ||
exp_pds = {1: [ | ||
'1.SKM7.640188','1.SKD9.640182', '1.SKM8.640201', '1.SKB8.640193', | ||
'1.SKD2.640178', '1.SKM3.640197', '1.SKM4.640180', '1.SKB9.640200', | ||
'1.SKB4.640189', '1.SKB5.640181', '1.SKB6.640176', '1.SKM2.640199', | ||
'1.SKM5.640177', '1.SKB1.640202', '1.SKD8.640184', '1.SKD4.640185', | ||
'1.SKB3.640195', '1.SKM1.640183', '1.SKB7.640196', '1.SKD3.640198', | ||
'1.SKD7.640191', '1.SKD6.640190', '1.SKB2.640194', '1.SKM9.640192', | ||
'1.SKM6.640187', '1.SKD5.640186', '1.SKD1.640179']} | ||
exp_meta = { | ||
'1.SKM7.640188': [1],'1.SKD9.640182': [1], '1.SKM8.640201': [1], | ||
'1.SKB8.640193': [1], '1.SKD2.640178': [1], '1.SKM3.640197': [1], | ||
'1.SKM4.640180': [1], '1.SKB9.640200': [1], '1.SKB4.640189': [1], | ||
'1.SKB5.640181': [1], '1.SKB6.640176': [1], '1.SKM2.640199': [1], | ||
'1.SKM5.640177': [1], '1.SKB1.640202': [1], '1.SKD8.640184': [1], | ||
'1.SKD4.640185': [1], '1.SKB3.640195': [1], '1.SKM1.640183': [1], | ||
'1.SKB7.640196': [1], '1.SKD3.640198': [1], '1.SKD7.640191': [1], | ||
'1.SKD6.640190': [1], '1.SKB2.640194': [1], '1.SKM9.640192': [1], | ||
'1.SKM6.640187': [1], '1.SKD5.640186': [1], '1.SKD1.640179': [1]} | ||
self.assertEqual(study_proc_ids, exp_spid) | ||
self.assertEqual(proc_data_samples, exp_pds) | ||
self.assertEqual(dtcounts, exp_dtc) | ||
self.assertEqual(meta, exp_meta) | ||
|
||
|
||
if __name__ == '__main__': | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
from the code below, the sample IDs contained in here will be a superset of the sample IDs described in
proc_data_samples
. Are those objects expected to be 1-1? If not, it would be useful to include a mention of this in the docstring