Search stopgap #1014

Merged: 8 commits, Apr 1, 2015
Changes from 5 commits
21 changes: 21 additions & 0 deletions qiita_db/data.py
@@ -1173,3 +1173,24 @@ def processed_date(self):
return conn_handler.execute_fetchone(
"SELECT processed_date FROM qiita.{0} WHERE "
"processed_data_id=%s".format(self._table), (self.id,))[0]

@property
def samples(self):
"""Return the samples available according to prep template

Returns
-------
set
all sample_ids available for the processed data
"""
conn_handler = SQLConnectionHandler()
# Get the prep template id for the dynamic table lookup
sql = """SELECT ptp.prep_template_id FROM
qiita.prep_template_preprocessed_data ptp JOIN
qiita.preprocessed_processed_data ppd USING (preprocessed_data_id)
WHERE ppd.processed_data_id = %s"""
prep_id = conn_handler.execute_fetchone(sql, [self._id])[0]

# Get samples from dynamic table
sql = "SELECT sample_id FROM qiita.prep_%d" % prep_id
return set(s[0] for s in conn_handler.execute_fetchall(sql))
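
(Hypothetical usage of the new property — the sample id assumes the demo study fixture used in the tests below:)

from qiita_db.data import ProcessedData

pd = ProcessedData(1)
'1.SKB2.640194' in pd.samples  # True; samples is a set, so membership checks are cheap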
12 changes: 12 additions & 0 deletions qiita_db/test/test_data.py
@@ -868,6 +868,18 @@ def test_processed_date(self):
pd = ProcessedData(1)
self.assertEqual(pd.processed_date, datetime(2012, 10, 1, 9, 30, 27))

def test_samples(self):
pd = ProcessedData(1)
obs = pd.samples
exp = {
'1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
'1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
'1.SKM2.640199', '1.SKD2.640178', '1.SKB7.640196', '1.SKD4.640185',
'1.SKB8.640193', '1.SKM3.640197', '1.SKD5.640186', '1.SKB1.640202',
'1.SKM1.640183', '1.SKD1.640179', '1.SKD3.640198', '1.SKB5.640181',
'1.SKB4.640189', '1.SKB9.640200', '1.SKM9.640192', '1.SKD8.640184',
'1.SKM5.640177', '1.SKM7.640188', '1.SKD7.640191'}
self.assertEqual(obs, exp)

if __name__ == '__main__':
main()
101 changes: 101 additions & 0 deletions qiita_ware/search.py
@@ -0,0 +1,101 @@
from collections import Counter, defaultdict

from future.utils import viewvalues, viewitems

from qiita_db.study import Study
from qiita_db.data import ProcessedData


def count_metadata(results, meta_cols):
"""Counts the metadata found in a search, and returns these counts

Parameters
----------
results : dict of lists of list
Contributor: it is misleading to name an input parameter as results

results in the format returned by the qiita_db search obj
Contributor: this doesn't indicate what expectations there are about the keys and values, just the types. If this method should only ever, under any circumstance, be called from the search object, then it should be a private member method of that object. Otherwise, you have to assume that you do not know who the caller is. This is important both for the documentation here and for the wide possibility of a bug on line 36.
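
(For illustration, a hypothetical input echoing the test fixture — this is the assumed shape, not something documented by the search object itself:)

# keyed by study_id; each row is [sample_id, meta_val, meta_val, ...],
# with positions 1..n aligned with meta_cols
results = {1: [['1.SKB2.640194', 1, 6.82],
               ['1.SKM4.640180', 1, 6.8]]}
meta_cols = ['study_id', 'ph']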

meta_cols : list
metadata column names searched for, as returned by qiita_db search obj

Returns
-------
fullcount : dict of dicts
counts for each found metadata value over all studies, in the format
{meta_col1: {value1: count, value2: count, ...}, ...}
studycount : dict of dict of dicts
counts for each found metadata value for each study, in the format
{study_id: {meta_col1: {value1: count, value2: count, ...}, ...}, ...}
"""
def double_comprehension(results):
for samples in viewvalues(results):
for sample in samples:
Contributor: is this actually an iterable of sample IDs or is it an iterable of metadata values?

Contributor Author: Both. First position is sample_id, rest are metadata values.

Contributor: it would be more appropriate to describe this as metadata values then, since a sample id is a metadata value for the SampleID category

yield sample

fullcount = {}
# rearrange all samples so that each metadata column found is its own list
meta_vals = zip(*double_comprehension(results))
Contributor: might as well use izip

Contributor Author: izip breaks the Counter call below
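
(A minimal Python 2 sketch of why, with hypothetical rows — zip materializes a list that the meta_vals[pos + 1] indexing below can use, while izip returns a one-shot iterator that cannot be indexed:)

from itertools import izip

rows = [['s1', 1, 6.8], ['s2', 1, 6.9]]
zip(*rows)[1]   # (1, 1) -- a real list, so indexing works
izip(*rows)[1]  # TypeError -- iterators do not support indexing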

Contributor: ah, I see. There really is an incredible amount of work being done with complex nested data structures in this method. It smells like there is quite a bit of redundant work. Given the inputs, returns, and the docstring, wouldn't the following suffice? It should require a lot less iteration and allocation:

n_cols = len(meta_cols)
studycount = {}
fullcount = {mc: defaultdict(int) for mc in meta_cols}
for study_id, meta_vals in viewitems(results):
    metadata_counts = [defaultdict(int) for i in range(n_cols)]
    for cat_idx, meta_col, meta_val in izip(range(n_cols), meta_cols, meta_vals):
        metadata_counts[cat_idx][meta_val] += 1
        fullcount[meta_col][meta_val] += 1
    studycount[study_id] = {mc: mcounts for mc, mcounts in izip(meta_cols, metadata_counts)}

The above is still placing extreme faith that meta_cols is in order with meta_vals, performing redundant work, and using a redundant representation. The latter two points are because fullcount really should be expressed as reduce(add, studycount.values()); however, dict doesn't support that. What I'm actually wondering is whether these data should be coming in as either np.array or sp.sparse.spmatrix.

Contributor Author: The reason this is so expensive is that it's something tacked onto the search object and was not originally a function expected of it. As stated, this is kind of a stopgap so the results will be formatted the way we expect the new search engine to work, without the actual refactoring that is going to happen once all the massive database changes stop happening. During that refactor we may move to numpy or some other holder, but for now we have this.

for pos, cat in enumerate(meta_cols):
Contributor: it isn't assured that meta_cols is in the same order as meta_vals

Contributor Author: It is by the search object, not this function.

Contributor: This method is putting a lot of faith in its caller. You can see why this is a giant risk for nasty bugs going forward, right?

Contributor Author: Yes, but this is a stopgap since we need it to go forward, and hopefully the refactor will come soon after. It's a standard assumption we've always made with the search, though.

Contributor: While the search method may state the values are in order on return, this method has no assurances about who is calling it and does not perform any validation on its inputs. What you're indicating is that this method ought to be a private static member method of the search object. That reduces the exposure, and the potential for a bug in this situation, but does not remove it.

Contributor: In addition, since results is a dict, no order is assured and it is mutable. What that means is that IF the object is unmolested, then the order will remain unchanged. If ANY modification happens, the order is not assured to remain the same. Since this is mutable, you cannot assume it will never be modified.

Contributor: Never mind, I see what's going on here now. The lists of list contained in results are assured to remain in the same order. This is also a dense representation of a matrix, as it isn't jagged. Numpy ops here would probably be much more natural.

Contributor Author: ok, I can move it into the search object.

# use Counter object to count all metadata values for a column
# pos+1 so we skip the sample names list
fullcount[cat] = Counter(meta_vals[pos + 1])

# Now get metadata counts for each study, removing sample ids as before
studycount = {}
for study_id in results:
hold = {}
# zip all samples for a given study so that each metadata column found
# is its own list
meta_vals = zip(*(sample for sample in results[study_id]))
Contributor: this is the second time this type of operation is being performed. It's likely going to be pretty expensive, which suggests that results is not formatted in as usable a data structure as it should be?

Contributor Author: the issue is that the info comes out as metadata value by sample_id, and it needs to get rotated 90 degrees to count it by metadata column. This is the fastest way I could think of to do that.
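
(A small sketch of that rotation, with hypothetical rows:)

rows = [['s1', 1, 6.8], ['s2', 1, 6.9]]  # one row per sample
zip(*rows)  # [('s1', 's2'), (1, 1), (6.8, 6.9)] -- one tuple per metadata column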

for pos, cat in enumerate(meta_cols):
# use Counter object to count all metadata values for a column
# pos+1 so we skip the sample names list
hold[cat] = Counter(meta_vals[pos + 1])
studycount[study_id] = hold

return fullcount, studycount


def filter_by_processed_data(results, datatypes=None):
"""Filters results to what is available in each processed data

Parameters
Contributor: space please

----------
results : dict of lists of list
Contributor: same comment here, and on its description

results in the format returned by the qiita_db search obj
datatypes : list of str, optional
Datatypes to selectively return. Default all datatypes available

Returns
Contributor: space please

-------
study_proc_ids : dict of dicts of lists
Processed data ids with samples for each study, in the format
{study_id: {datatype: [proc_id, proc_id, ...], ...}, ...}
proc_data_samples : dict of lists
Samples available in each processed data id, in the format
{proc_data_id: [samp_id1, samp_id2, ...], ...}
samples_meta : dict of lists
Contributor: from the code below, the sample IDs contained here will be a superset of the sample IDs described in proc_data_samples. Are those objects expected to be 1-1? If not, it would be useful to include a mention of this in the docstring

sample metadata in same order as the metadata given by search
Format {samp_id: [meta1, meta2, ...], ...}
"""
study_proc_ids = {}
proc_data_samples = {}
samples_meta = {}
for study_id, study_samples in viewitems(results):
Contributor: is study_samples an iterable of the sample ids? From its use, it looks like it is the metadata values.

study = Study(study_id)
samples_meta.update({s[0]: s[1:] for s in study_samples})
Contributor: this structure holds {sample_id: [metadata_values]} without any correspondence to the metadata category the value is associated with?

study_proc_ids[study_id] = defaultdict(list)
for proc_data_id in study.processed_data():
proc_data = ProcessedData(proc_data_id)
datatype = proc_data.data_type()
# skip processed data if it doesn't fit the given datatypes
if datatypes is not None and datatype not in datatypes:
Contributor: this conditional can be simplified if datatypes is made the empty set at the beginning of the method in the case where datatypes is None

Contributor: actually, that would not work based on the description in the docstring. You would instead want to set datatypes = set(all_datatypes)

continue
samps_available = proc_data.samples
Contributor: Is this variable lookup friendly? E.g., set or dict or something comparable?

Contributor Author: yes, it's a set.

hold = [s[0] for s in study_samples if s[0] in samps_available]
Contributor: so the goal here is to see if the intersection of the sample IDs in the study and the processed/available samples is not the empty set? What about:

study_sample_ids = {s[0] for s in study_samples}
if samps_available.intersection(study_sample_ids):
    proc_data_samples[proc_data_id] = study_sample_ids
    study_proc_ids[study_id][datatype].append(proc_data_id)

Contributor Author: I had it that way originally but thought a list comprehension was faster. Testing now.

Contributor: In general it is better to focus on the expressiveness of the code rather than on performance. If performance were the concern for this method, then there are quite a few other things that should be addressed first.

Contributor Author: fair point.

if len(hold) == 0:
# all samples filtered so remove it as a result
del(proc_data_samples[proc_data_id])
Contributor: There is no assurance that proc_data_id has been set in proc_data_samples from what I can see, which would lead to a KeyError

else:
proc_data_samples[proc_data_id] = hold
# add the processed data to the list for the study
study_proc_ids[study_id][datatype].append(proc_data_id)
return study_proc_ids, proc_data_samples, samples_meta
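
(For reference, a sketch of the inner loop that avoids the KeyError flagged above by assigning only on the non-empty branch — not the code merged here:)

hold = [s[0] for s in study_samples if s[0] in samps_available]
if hold:
    proc_data_samples[proc_data_id] = hold
    study_proc_ids[study_id][datatype].append(proc_data_id)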
53 changes: 53 additions & 0 deletions qiita_ware/test/test_search.py
@@ -0,0 +1,53 @@
from unittest import TestCase, main

from qiita_core.util import qiita_test_checker
from qiita_db.search import QiitaStudySearch
from qiita_db.user import User
from qiita_ware.search import count_metadata, filter_by_processed_data


@qiita_test_checker()
class SearchTest(TestCase):
"""Tests that the search helpers work as expected"""
def test_count_metadata(self):
search = QiitaStudySearch()
results, meta_cols = search('study_id = 1 AND ph > 0',
Contributor: It would be much more useful if the inputs to count_metadata were explicitly described instead of fetched from the database. It is difficult to evaluate this test without knowing the inputs.

User('test@foo.bar'))
fullcount, studycount = count_metadata(results, meta_cols)
expfull = {'study_id': {1: 27}, 'ph': {6.82: 10, 6.8: 9, 6.94: 8}}
expstudy = {1: {'study_id': {1: 27},
'ph': {6.82: 10, 6.8: 9, 6.94: 8}}}
self.assertEqual(fullcount, expfull)
self.assertEqual(studycount, expstudy)

def test_filter_by_processed_data(self):
search = QiitaStudySearch()
results, meta_cols = search('study_id = 1', User('test@foo.bar'))
Contributor: same comment here

study_proc_ids, proc_data_samples, meta = filter_by_processed_data(
results)
exp_spid = {1: {'18S': [1]}}
exp_pds = {1: [
'1.SKM7.640188', '1.SKD9.640182', '1.SKM8.640201', '1.SKB8.640193',
'1.SKD2.640178', '1.SKM3.640197', '1.SKM4.640180', '1.SKB9.640200',
'1.SKB4.640189', '1.SKB5.640181', '1.SKB6.640176', '1.SKM2.640199',
'1.SKM5.640177', '1.SKB1.640202', '1.SKD8.640184', '1.SKD4.640185',
'1.SKB3.640195', '1.SKM1.640183', '1.SKB7.640196', '1.SKD3.640198',
'1.SKD7.640191', '1.SKD6.640190', '1.SKB2.640194', '1.SKM9.640192',
'1.SKM6.640187', '1.SKD5.640186', '1.SKD1.640179']}
exp_meta = {
'1.SKM7.640188': [1], '1.SKD9.640182': [1], '1.SKM8.640201': [1],
'1.SKB8.640193': [1], '1.SKD2.640178': [1], '1.SKM3.640197': [1],
'1.SKM4.640180': [1], '1.SKB9.640200': [1], '1.SKB4.640189': [1],
'1.SKB5.640181': [1], '1.SKB6.640176': [1], '1.SKM2.640199': [1],
'1.SKM5.640177': [1], '1.SKB1.640202': [1], '1.SKD8.640184': [1],
'1.SKD4.640185': [1], '1.SKB3.640195': [1], '1.SKM1.640183': [1],
'1.SKB7.640196': [1], '1.SKD3.640198': [1], '1.SKD7.640191': [1],
'1.SKD6.640190': [1], '1.SKB2.640194': [1], '1.SKM9.640192': [1],
'1.SKM6.640187': [1], '1.SKD5.640186': [1], '1.SKD1.640179': [1]}
self.assertEqual(study_proc_ids, exp_spid)
self.assertEqual(proc_data_samples, exp_pds)
self.assertEqual(meta, exp_meta)


if __name__ == '__main__':
main()