Skip to content

Search stopgap #1014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 1, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions qiita_db/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,6 +1310,27 @@ def processed_date(self):
"SELECT processed_date FROM qiita.{0} WHERE "
"processed_data_id=%s".format(self._table), (self.id,))[0]

@property
def samples(self):
    """Return the samples available according to prep template

    Returns
    -------
    set
        all sample_ids available for the processed data
    """
    conn = SQLConnectionHandler()
    # Walk back from processed data to its prep template so we know
    # which dynamic prep table holds the sample ids
    sql = """SELECT ptp.prep_template_id FROM
        qiita.prep_template_preprocessed_data ptp JOIN
        qiita.preprocessed_processed_data ppd USING (preprocessed_data_id)
        WHERE ppd.processed_data_id = %s"""
    prep_template_id = conn.execute_fetchone(sql, [self._id])[0]

    # Pull the sample ids out of the per-prep dynamic table
    samples_sql = "SELECT sample_id FROM qiita.prep_%d" % prep_template_id
    return {row[0] for row in conn.execute_fetchall(samples_sql)}

@property
def status(self):
conn_handler = SQLConnectionHandler()
Expand Down
62 changes: 60 additions & 2 deletions qiita_db/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,15 @@
from pyparsing import (alphas, nums, Word, dblQuotedString, oneOf, Optional,
opAssoc, CaselessLiteral, removeQuotes, Group,
operatorPrecedence, stringEnd)
from collections import defaultdict

import pandas as pd
from future.utils import viewitems

from qiita_db.util import scrub_data, convert_type, get_table_cols
from qiita_db.sql_connection import SQLConnectionHandler
from qiita_db.study import Study
from qiita_db.data import ProcessedData
from qiita_db.exceptions import QiitaDBIncompatibleDatatypeError


Expand Down Expand Up @@ -211,6 +216,8 @@ def __call__(self, searchstr, user):
if study_res:
# only add study to results if actually has samples in results
results[sid] = study_res
self.results = results
self.meta_headers = meta_headers
return results, meta_headers

def _parse_study_search_string(self, searchstr,
Expand Down Expand Up @@ -304,8 +311,8 @@ def _parse_study_search_string(self, searchstr,

# create the study finding SQL
# remove metadata headers that are in required_sample_info table
meta_headers = meta_headers.difference(self.required_cols).difference(
self.study_cols)
meta_headers = tuple(meta_headers.difference(
self.required_cols).difference(self.study_cols))

# get all study ids that contain all metadata categories searched for
sql = []
Expand Down Expand Up @@ -347,3 +354,54 @@ def _parse_study_search_string(self, searchstr,
"r.study_id WHERE %s" %
(','.join(header_info), sql_where))
return study_sql, sample_sql, meta_header_type_lookup.keys()

def filter_by_processed_data(self, datatypes=None):
    """Filters results to what is available in each processed data

    Parameters
    ----------
    datatypes : list of str, optional
        Datatypes to selectively return. Default all datatypes available

    Returns
    -------
    study_proc_ids : dict of dicts of lists
        Processed data ids with samples for each study, in the format
        {study_id: {datatype: [proc_id, proc_id, ...], ...}, ...}
    proc_data_samples : dict of lists
        Samples available in each processed data id, in the format
        {proc_data_id: [samp_id1, samp_id2, ...], ...}
    samples_meta : dict of pandas DataFrames
        metadata for the found samples, keyed by study. Pandas indexed on
        sample_id, column headers are the metadata categories searched
        over
    """
    # a set gives cheap membership tests for the datatype filter below;
    # None means "no filter", keep every datatype
    wanted_types = set(datatypes) if datatypes is not None else None
    study_proc_ids = {}
    proc_data_samples = {}
    samples_meta = {}
    # positional column index -> metadata category name, for renaming
    col_names = dict(enumerate(self.meta_headers))
    for study_id, study_meta in viewitems(self.results):
        # build the per-study metadata frame; from_dict is used because
        # pandas doesn't like cursor objects
        meta_dict = {row[0]: row[1:] for row in study_meta}
        frame = pd.DataFrame.from_dict(meta_dict, orient='index')
        frame.rename(columns=col_names, inplace=True)
        samples_meta[study_id] = frame

        # per-study bookkeeping for the processed-data scan
        found_sample_ids = {row[0] for row in study_meta}
        study_proc_ids[study_id] = defaultdict(list)
        for proc_data_id in Study(study_id).processed_data():
            proc_data = ProcessedData(proc_data_id)
            datatype = proc_data.data_type()
            # skip processed data whose datatype was not requested
            if wanted_types is not None and datatype not in wanted_types:
                continue
            shared = proc_data.samples.intersection(found_sample_ids)
            if shared:
                proc_data_samples[proc_data_id] = sorted(shared)
                study_proc_ids[study_id][datatype].append(proc_data_id)

    return study_proc_ids, proc_data_samples, samples_meta
13 changes: 13 additions & 0 deletions qiita_db/test/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,19 @@ def test_processed_date(self):
pd = ProcessedData(1)
self.assertEqual(pd.processed_date, datetime(2012, 10, 1, 9, 30, 27))

def test_samples(self):
    # processed data 1 should expose exactly the 27 samples of prep 1
    exp_samples = {
        '1.SKB1.640202', '1.SKB2.640194', '1.SKB3.640195', '1.SKB4.640189',
        '1.SKB5.640181', '1.SKB6.640176', '1.SKB7.640196', '1.SKB8.640193',
        '1.SKB9.640200', '1.SKD1.640179', '1.SKD2.640178', '1.SKD3.640198',
        '1.SKD4.640185', '1.SKD5.640186', '1.SKD6.640190', '1.SKD7.640191',
        '1.SKD8.640184', '1.SKD9.640182', '1.SKM1.640183', '1.SKM2.640199',
        '1.SKM3.640197', '1.SKM4.640180', '1.SKM5.640177', '1.SKM6.640187',
        '1.SKM7.640188', '1.SKM8.640201', '1.SKM9.640192'}
    self.assertEqual(ProcessedData(1).samples, exp_samples)

def test_status(self):
    # the fixture processed data is private by default
    self.assertEqual(ProcessedData(1).status, 'private')
Expand Down
27 changes: 25 additions & 2 deletions qiita_db/test/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from unittest import TestCase, main

import pandas as pd
from pandas.util.testing import assert_frame_equal

from qiita_db.user import User
from qiita_core.util import qiita_test_checker
from qiita_db.search import QiitaStudySearch


@qiita_test_checker()
class SearchTest(TestCase):
"""Tests that the search object works as expected"""

Expand Down Expand Up @@ -174,6 +175,28 @@ def test_call_no_results(self):
self.assertEqual(obs_res, {})
self.assertEqual(obs_meta, ['sample_type'])

def test_filter_by_processed_data(self):
    """filter_by_processed_data returns proc ids, samples and metadata"""
    search = QiitaStudySearch()
    results, meta_cols = search('study_id = 1', User('test@foo.bar'))
    spid, pds, meta = search.filter_by_processed_data()
    exp_spid = {1: {'18S': [1]}}
    exp_pds = {1: [
        '1.SKB1.640202', '1.SKB2.640194', '1.SKB3.640195', '1.SKB4.640189',
        '1.SKB5.640181', '1.SKB6.640176', '1.SKB7.640196', '1.SKB8.640193',
        '1.SKB9.640200', '1.SKD1.640179', '1.SKD2.640178', '1.SKD3.640198',
        '1.SKD4.640185', '1.SKD5.640186', '1.SKD6.640190', '1.SKD7.640191',
        '1.SKD8.640184', '1.SKD9.640182', '1.SKM1.640183', '1.SKM2.640199',
        '1.SKM3.640197', '1.SKM4.640180', '1.SKM5.640177', '1.SKM6.640187',
        '1.SKM7.640188', '1.SKM8.640201', '1.SKM9.640192']}
    exp_meta = pd.DataFrame.from_dict({x: 1 for x in exp_pds[1]},
                                      orient='index')
    exp_meta.rename(columns={0: 'study_id'}, inplace=True)

    self.assertEqual(spid, exp_spid)
    self.assertEqual(pds, exp_pds)
    # wrap in list() so the comparison holds on both Python 2 (keys()
    # is a list) and Python 3 (keys() is a view object, never == list)
    self.assertEqual(list(meta.keys()), [1])
    assert_frame_equal(meta[1], exp_meta)


if __name__ == "__main__":
main()