Skip to content

Commit 2dc78a3

Browse files
committed
Merge pull request #129 from adamrp/load_processed_data
Load processed data
2 parents ed57160 + 067767b commit 2dc78a3

File tree

8 files changed

+1708
-1658
lines changed

8 files changed

+1708
-1658
lines changed

qiita_db/commands.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# -----------------------------------------------------------------------------
88

9+
from dateutil.parser import parse
910
import pandas as pd
1011
from functools import partial
1112
try:
@@ -18,7 +19,7 @@
1819
from .study import Study, StudyPerson
1920
from .user import User
2021
from .util import get_filetypes, get_filepath_types
21-
from .data import RawData
22+
from .data import RawData, PreprocessedData, ProcessedData
2223
from .metadata_template import SampleTemplate
2324

2425

@@ -108,3 +109,50 @@ def load_raw_data_cmd(filepaths, filepath_types, filetype, study_ids):
108109

109110
return RawData.create(filetype_id, list(zip(filepaths, filepath_types)),
110111
studies)
112+
113+
114+
def load_processed_data_cmd(fps, fp_types, processed_params_table_name,
                            processed_params_id, preprocessed_data_id=None,
                            processed_date=None):
    """Add a new processed data entry

    Parameters
    ----------
    fps : list of str
        Paths to the processed data files to associate with the ProcessedData
        object
    fp_types : list of str
        The types of files, one per fp. Each value is looked up in the
        mapping returned by ``get_filepath_types()``
    processed_params_table_name : str
        The name of the processed_params_ table to use
    processed_params_id : int
        The ID of the row in the processed_params_ table
    preprocessed_data_id : int, optional
        Defaults to ``None``. The ID of the row in the preprocessed_data
        table. When given, it is wrapped in a ``PreprocessedData`` object
        before being handed to ``ProcessedData.create``
    processed_date : str, optional
        Defaults to ``None``. The date and time to use as the processing date.
        When given, it is converted with ``dateutil.parser.parse``, so it must
        be interpretable as a datetime

    Returns
    -------
    qiita_db.ProcessedData
        The value returned by ``ProcessedData.create``

    Raises
    ------
    ValueError
        If `fps` and `fp_types` do not have the same length
    """
    if len(fps) != len(fp_types):
        raise ValueError("Please pass exactly one fp_type for each "
                         "and every fp")

    # Translate each file type name and pair it with its filepath
    type_lookup = get_filepath_types()
    typed_fps = [(fp, type_lookup[fp_type])
                 for fp, fp_type in zip(fps, fp_types)]

    parent = (None if preprocessed_data_id is None
              else PreprocessedData(preprocessed_data_id))
    when = None if processed_date is None else parse(processed_date)

    return ProcessedData.create(processed_params_table_name,
                                processed_params_id, typed_fps,
                                parent, when)

qiita_db/support_files/qiita-db.dbs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -635,10 +635,8 @@ Linked by y being raw_data_id from raw data table.</comment>
635635
<table name="processed_filepath" >
636636
<column name="processed_data_id" type="bigint" jt="-5" mandatory="y" />
637637
<column name="filepath_id" type="bigint" jt="-5" mandatory="y" />
638-
<index name="pk_processed_data_filepath" unique="UNIQUE" >
638+
<index name="idx_processed_filepath" unique="PRIMARY_KEY" >
639639
<column name="processed_data_id" />
640-
</index>
641-
<index name="idx_processed_data_filepath" unique="NORMAL" >
642640
<column name="filepath_id" />
643641
</index>
644642
<fk name="fk_processed_data_filepath" to_schema="qiita" to_table="processed_data" >
@@ -1262,8 +1260,8 @@ Controlled Vocabulary]]></comment>
12621260
<entity schema="qiita" name="raw_data" color="d0def5" x="1230" y="480" />
12631261
<entity schema="qiita" name="raw_preprocessed_data" color="b2cdf7" x="1230" y="585" />
12641262
<entity schema="qiita" name="preprocessed_filepath" color="c0d4f3" x="990" y="705" />
1265-
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
12661263
<entity schema="qiita" name="preprocessed_data" color="c0d4f3" x="1200" y="690" />
1264+
<entity schema="qiita" name="processed_filepath" color="c0d4f3" x="1005" y="930" />
12671265
<group name="Group_analyses" color="c4e0f9" >
12681266
<comment>analysis tables</comment>
12691267
<entity schema="qiita" name="analysis" />

qiita_db/support_files/qiita-db.html

Lines changed: 1545 additions & 1646 deletions
Large diffs are not rendered by default.

qiita_db/support_files/qiita-db.sql

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,13 +456,11 @@ CREATE INDEX idx_preprocessed_processed_data_1 ON qiita.preprocessed_processed_d
456456
CREATE TABLE qiita.processed_filepath (
457457
processed_data_id bigint NOT NULL,
458458
filepath_id bigint NOT NULL,
459-
CONSTRAINT pk_processed_data_filepath UNIQUE ( processed_data_id ) ,
459+
CONSTRAINT idx_processed_filepath PRIMARY KEY ( processed_data_id, filepath_id ),
460460
CONSTRAINT fk_processed_data_filepath FOREIGN KEY ( processed_data_id ) REFERENCES qiita.processed_data( processed_data_id ) ,
461461
CONSTRAINT fk_processed_data_filepath_0 FOREIGN KEY ( filepath_id ) REFERENCES qiita.filepath( filepath_id )
462462
);
463463

464-
CREATE INDEX idx_processed_data_filepath ON qiita.processed_filepath ( filepath_id );
465-
466464
CREATE TABLE qiita.processed_params_uclust (
467465
processed_params_id bigserial NOT NULL,
468466
reference_id bigint NOT NULL,

qiita_db/test/test_commands.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
from configparser import NoOptionError
2020

2121
from qiita_db.commands import (make_study_from_cmd, load_raw_data_cmd,
22-
sample_template_adder)
22+
sample_template_adder, load_processed_data_cmd)
2323
from qiita_db.study import Study, StudyPerson
2424
from qiita_db.user import User
2525
from qiita_db.util import get_count, check_count, get_db_files_base_dir
26+
from qiita_db.data import PreprocessedData
2627
from qiita_core.util import qiita_test_checker
2728

2829

@@ -148,6 +149,64 @@ def test_load_data_from_cmd(self):
148149
study_ids)
149150

150151

152+
@qiita_test_checker()
class TestLoadProcessedDataFromCmd(TestCase):
    """Tests for ``load_processed_data_cmd``"""

    def setUp(self):
        # Two temporary placeholder OTU tables
        handle, self.otu_table_fp = mkstemp(suffix='_otu_table.biom')
        close(handle)
        handle, self.otu_table_2_fp = mkstemp(suffix='_otu_table2.biom')
        close(handle)

        # Every path registered here is deleted again in tearDown
        self.files_to_remove = [self.otu_table_fp, self.otu_table_2_fp]

        # Each placeholder file holds a single newline
        for path in self.files_to_remove:
            with open(path, "w") as out:
                out.write("\n")

        self.db_test_processed_data_dir = join(get_db_files_base_dir(),
                                               'processed_data')

    def tearDown(self):
        for path in self.files_to_remove:
            if exists(path):
                remove(path)

    def test_load_processed_data_from_cmd(self):
        filepaths = [self.otu_table_fp, self.otu_table_2_fp]
        filepath_types = ['biom', 'biom']

        # Row counts before the command runs
        n_processed_data = get_count('qiita.processed_data')
        n_processed_fp = get_count('qiita.processed_filepath')
        n_fp = get_count('qiita.filepath')

        new = load_processed_data_cmd(filepaths, filepath_types,
                                      'processed_params_uclust', 1, 1, None)
        processed_data_id = new.id

        # Register the '<id>_<basename>' paths under the processed_data
        # directory so tearDown removes them if they exist
        for src_fp in filepaths:
            self.files_to_remove.append(
                join(self.db_test_processed_data_dir,
                     '%d_%s' % (processed_data_id, basename(src_fp))))

        # One new processed_data row and two new rows in each filepath table
        self.assertTrue(check_count('qiita.processed_data',
                                    n_processed_data + 1))
        self.assertTrue(check_count('qiita.processed_filepath',
                                    n_processed_fp + 2))
        self.assertTrue(check_count('qiita.filepath', n_fp + 2))

        # Ensure that the ValueError is raised when a filepath_type is not
        # provided for each and every filepath
        with self.assertRaises(ValueError):
            load_processed_data_cmd(filepaths, filepath_types[:-1],
                                    'processed_params_uclust', 1, 1, None)
208+
209+
151210
CONFIG_1 = """[required]
152211
timeseries_type_id = 1
153212
metadata_complete = True

qiita_db/test/test_util.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
compute_checksum, check_table_cols,
1818
check_required_columns, convert_to_id,
1919
get_table_cols, get_filetypes, get_filepath_types,
20-
get_count, check_count)
20+
get_count, check_count, get_processed_params_tables)
2121

2222

2323
@qiita_test_checker()
@@ -150,6 +150,10 @@ def test_check_count(self):
150150
self.assertTrue(check_count('qiita.study_person', 3))
151151
self.assertFalse(check_count('qiita.study_person', 2))
152152

153+
def test_get_processed_params_tables(self):
    """The test database exposes exactly one processed_params_ table"""
    expected = ['processed_params_uclust']
    self.assertEqual(get_processed_params_tables(), expected)
156+
153157

154158
class UtilTests(TestCase):
155159
"""Tests for the util functions that do not need to access the DB"""

qiita_db/util.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,3 +467,17 @@ def check_count(table, exp_count):
467467
"""
468468
obs_count = get_count(table)
469469
return obs_count == exp_count
470+
471+
472+
def get_processed_params_tables():
    """Returns a list of all tables starting with "processed_params_"

    Returns
    -------
    list of str
        The names of the tables in the ``qiita`` schema whose name begins
        with "processed_params_", sorted by name
    """
    # Select table_name explicitly rather than indexing into ``SELECT *``, so
    # the result does not depend on the column order of
    # information_schema.tables. SUBSTR is used for the prefix match (17 is
    # the length of 'processed_params_'). ORDER BY makes the returned list
    # deterministic.
    sql = ("SELECT table_name FROM information_schema.tables WHERE "
           "table_schema = 'qiita' AND "
           "SUBSTR(table_name, 1, 17) = 'processed_params_' "
           "ORDER BY table_name")

    conn = SQLConnectionHandler()
    return [row[0] for row in conn.execute_fetchall(sql)]

scripts/qiita_db

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010

1111
import click
1212

13-
from qiita_db.util import get_filetypes, get_filepath_types
13+
from qiita_db.util import (get_filetypes, get_filepath_types,
14+
get_processed_params_tables)
1415
from qiita_db.commands import (sample_template_adder, make_study_from_cmd,
15-
load_raw_data_cmd)
16+
load_raw_data_cmd, load_processed_data_cmd)
1617

1718

1819
@click.group()
@@ -39,6 +40,35 @@ def load_raw_data(fp, fp_type, filetype, study):
3940
load_raw_data_cmd(fp, fp_type, filetype, study)
4041

4142

43+
@qiita_db.command()
@click.option('--fp', required=True, type=click.Path(resolve_path=True,
              readable=True, exists=True), multiple=True, help='Path to the '
              'processed data. This option can be used multiple times if '
              'there are multiple processed data files.')
# The keys are materialised so click.Choice receives a concrete list
@click.option('--fp_type', required=True, multiple=True, help='Describes the '
              'contents of the file. Pass one fp_type per fp.',
              type=click.Choice(list(get_filepath_types().keys())))
@click.option('--processed_params_table', required=True,
              type=click.Choice(get_processed_params_tables()),
              help='The table containing the processed parameters used to '
              'generate this file')
@click.option('--processed_params_id', required=True, type=int,
              help='The ID of the row in the processed_params table')
@click.option('--preprocessed_data_id', type=int, default=None, help='The '
              'ID of the row in the preprocessed_data table from which '
              'this processed data was created')
@click.option('--processed_date', type=str, default=None,
              help='The date to use as the processed_date. Must be '
              'interpretable as a datetime. If None, then the current date '
              'and time will be used.')
def load_processed_data(fp, fp_type, processed_params_table,
                        processed_params_id, preprocessed_data_id,
                        processed_date):
    """Add a new processed data entry"""
    # Thin command-line wrapper: the parsed options are forwarded unchanged
    # to load_processed_data_cmd. --fp and --fp_type are declared with
    # multiple=True, so each may be given several times.
    load_processed_data_cmd(fp, fp_type, processed_params_table,
                            processed_params_id, preprocessed_data_id,
                            processed_date)
70+
71+
4272
@qiita_db.command()
4373
@click.option('--owner', help="The email address of the owner of the study")
4474
@click.option('--title', help="The title of the study")

0 commit comments

Comments
 (0)