Skip to content

Commit 7a398ed

Browse files
committed
Merge pull request #1008 from antgonza/454-issue
fixing new 454 issue
2 parents 1a6d125 + 954cf15 commit 7a398ed

File tree

3 files changed

+147
-16
lines changed

3 files changed

+147
-16
lines changed

qiita_db/support_files/populate_test_db.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,8 +396,8 @@ INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algor
396396
INSERT INTO qiita.sample_template_filepath VALUES (1, 16);
397397

398398
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum, checksum_algorithm_id, data_directory_id) VALUES
399-
('preprocess_test1.sff', 17, '852952723', 1, 5),
400-
('preprocess_test2.sff', 17, '852952723', 1, 5);
399+
('1_preprocess_test1.sff', 17, '852952723', 1, 5),
400+
('1_preprocess_test2.sff', 17, '852952723', 1, 5);
401401

402402
-- Insert the raw data information for study 1
403403
INSERT INTO qiita.raw_data (filetype_id) VALUES (1), (1);

qiita_ware/processing_pipeline.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,41 @@ def _get_preprocess_fasta_cmd(raw_data, prep_template, params):
254254
"-o %s" % output_dir,
255255
params_str])
256256
else:
257+
len_seqs = len(seqs)
258+
len_mapping_fps = len(mapping_fps)
259+
260+
if len_mapping_fps > len_seqs:
261+
mapping_fps = [basename(m) for m in mapping_fps]
262+
sffs = [basename(s) for s in sffs]
263+
raise ValueError(
264+
'The prep template defines: "%s" but you only have "%s" as '
265+
'sequence files' % (', '.join(mapping_fps), ', '.join(sffs)))
266+
267+
if len_seqs != len_mapping_fps:
268+
# the [:-8] slice strips the trailing '_MMF.txt' (8 characters)
269+
prefixes = {m: {'prefix': basename(m)[:-8], 'seqs': [],
270+
'quals': []} for m in mapping_fps}
271+
counter = 0
272+
for p in prefixes.values():
273+
for i, s in enumerate(seqs):
274+
# the files are prefixed with raw_data_id
275+
if basename(s).split('_', 1)[1].startswith(p['prefix']):
276+
p['seqs'].append(s)
277+
if quals:
278+
p['quals'].append(quals[i])
279+
counter = counter + 1
280+
281+
if counter != len_seqs:
282+
raise ValueError(
283+
'The run prefixes in your prep template '
284+
'"%s" do not match your file names "%s"' %
285+
(', '.join(mapping_fps), ', '.join(sffs)))
286+
287+
mapping_fps = prefixes.keys()
288+
seqs = [','.join(p['seqs']) for p in prefixes.values()]
289+
if quals:
290+
quals = [','.join(p['quals']) for p in prefixes.values()]
291+
257292
cmd, output_folders, n = [], [], 1
258293
for i, (seq, mapping) in enumerate(zip(seqs, mapping_fps)):
259294
qual_str = "-q %s -d" % quals[i] if quals else ""

qiita_ware/test/test_processing_pipeline.py

Lines changed: 110 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,8 @@ def test_get_preprocess_fasta_cmd_sff_no_run_prefix(self):
163163
raw_data, prep_template, params)
164164

165165
get_raw_path = partial(join, self.db_dir, 'raw_data')
166-
seqs_fp = [get_raw_path('preprocess_test1.sff'),
167-
get_raw_path('preprocess_test2.sff')]
166+
seqs_fp = [get_raw_path('1_preprocess_test1.sff'),
167+
get_raw_path('1_preprocess_test2.sff')]
168168

169169
exp_cmd_1 = ' '.join(["process_sff.py",
170170
"-i %s" % seqs_fp[0],
@@ -173,10 +173,12 @@ def test_get_preprocess_fasta_cmd_sff_no_run_prefix(self):
173173
"-i %s" % seqs_fp[1],
174174
"-o %s" % obs_output_dir])
175175

176-
fasta_files = ','.join([join(obs_output_dir, "preprocess_test1.fna"),
177-
join(obs_output_dir, "preprocess_test2.fna")])
178-
qual_files = ','.join([join(obs_output_dir, "preprocess_test1.qual"),
179-
join(obs_output_dir, "preprocess_test2.qual")])
176+
fasta_files = ','.join([
177+
join(obs_output_dir, "1_preprocess_test1.fna"),
178+
join(obs_output_dir, "1_preprocess_test2.fna")])
179+
qual_files = ','.join([
180+
join(obs_output_dir, "1_preprocess_test1.qual"),
181+
join(obs_output_dir, "1_preprocess_test2.qual")])
180182
exp_cmd_3a = ' '.join(["split_libraries.py",
181183
"-f %s" % fasta_files])
182184

@@ -208,8 +210,8 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix(self):
208210
# Need to alter the run_prefix of one sample so we can test the
209211
# multiple values
210212
conn_handler = SQLConnectionHandler()
211-
sql = ("UPDATE qiita.prep_1 SET run_prefix='test' WHERE "
212-
"sample_id ='1.SKM9.640192'")
213+
sql = ("UPDATE qiita.prep_1 SET run_prefix='test1' WHERE "
214+
"sample_id = '1.SKM9.640192'")
213215
conn_handler.execute(sql)
214216

215217
raw_data = RawData(3)
@@ -223,19 +225,113 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix(self):
223225
# assuming that test_get_preprocess_fasta_cmd_sff_no_run_prefix is
224226
# working we only need to test for the commands being run and
225227
# that n is valid
226-
self.assertTrue(len(obs_cmds) == 8)
228+
self.assertEqual(len(obs_cmds), 8)
227229
self.assertTrue(obs_cmds[0].startswith('process_sff.py'))
228230
self.assertTrue(obs_cmds[1].startswith('process_sff.py'))
229231
self.assertTrue(obs_cmds[2].startswith('split_libraries.py'))
230-
self.assertTrue('-n 1' in obs_cmds[2])
232+
self.assertIn('-n 1', obs_cmds[2])
231233
self.assertTrue(obs_cmds[3].startswith('split_libraries.py'))
232-
self.assertTrue('-n 800000' in obs_cmds[3])
234+
self.assertIn('-n 800000', obs_cmds[3])
233235
self.assertTrue(obs_cmds[4].startswith('cat'))
234-
self.assertTrue('split_library_log.txt' in obs_cmds[4])
236+
self.assertIn('split_library_log.txt', obs_cmds[4])
235237
self.assertTrue(obs_cmds[5].startswith('cat'))
236-
self.assertTrue('seqs.fna' in obs_cmds[5])
238+
self.assertIn('seqs.fna', obs_cmds[5])
237239
self.assertTrue(obs_cmds[6].startswith('cat'))
238-
self.assertTrue('seqs_filtered.qual' in obs_cmds[6])
240+
self.assertIn('seqs_filtered.qual', obs_cmds[6])
241+
242+
def test_get_preprocess_fasta_cmd_sff_run_prefix_match(self):
243+
# Test that the run prefixes in the prep_template and the file names
244+
# actually match and raise an error if not
245+
conn_handler = SQLConnectionHandler()
246+
sql = ("""
247+
INSERT INTO qiita.filepath (filepath_id, filepath,
248+
filepath_type_id, checksum, checksum_algorithm_id,
249+
data_directory_id) VALUES (19, '1_new.sff', 17, 852952723, 1,
250+
5);
251+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
252+
(3, 19);
253+
UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
254+
UPDATE qiita.prep_1 SET run_prefix='new' WHERE
255+
sample_id = '1.SKB8.640193';
256+
""")
257+
conn_handler.execute(sql)
258+
259+
raw_data = RawData(3)
260+
params = Preprocessed454Params(1)
261+
prep_template = PrepTemplate(1)
262+
263+
obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
264+
raw_data, prep_template, params)
265+
266+
obs_cmds = obs_cmd.split('; ')
267+
# assuming that test_get_preprocess_fasta_cmd_sff_no_run_prefix is
268+
# working we only need to test for the commands being run and
269+
# that n is valid
270+
self.assertEqual(len(obs_cmds), 9)
271+
self.assertTrue(obs_cmds[0].startswith('process_sff.py'))
272+
self.assertTrue(obs_cmds[1].startswith('process_sff.py'))
273+
self.assertTrue(obs_cmds[2].startswith('process_sff.py'))
274+
self.assertTrue(obs_cmds[3].startswith('split_libraries.py'))
275+
self.assertIn('-n 1', obs_cmds[3])
276+
self.assertTrue(obs_cmds[4].startswith('split_libraries.py'))
277+
self.assertIn('-n 800000', obs_cmds[4])
278+
self.assertTrue(obs_cmds[5].startswith('cat'))
279+
self.assertIn('split_library_log.txt', obs_cmds[5])
280+
self.assertTrue(obs_cmds[6].startswith('cat'))
281+
self.assertIn('seqs.fna', obs_cmds[6])
282+
self.assertEqual(len(obs_cmds[6].split(' ')), 5)
283+
self.assertTrue(obs_cmds[7].startswith('cat'))
284+
self.assertIn('seqs_filtered.qual', obs_cmds[7])
285+
self.assertEqual(len(obs_cmds[7].split(' ')), 5)
286+
287+
def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_1(self):
288+
# Test that the run prefixes in the prep_template and the file names
289+
# actually match and raise an error if not
290+
conn_handler = SQLConnectionHandler()
291+
sql = ("""
292+
INSERT INTO qiita.filepath (filepath_id, filepath,
293+
filepath_type_id, checksum, checksum_algorithm_id,
294+
data_directory_id) VALUES (19, '1_new.sff', 17, 852952723, 1,
295+
5);
296+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
297+
(3, 19);
298+
INSERT INTO qiita.filepath (filepath_id, filepath,
299+
filepath_type_id, checksum, checksum_algorithm_id,
300+
data_directory_id) VALUES (20, '1_error.sff', 17, 852952723,
301+
1, 5);
302+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
303+
(3, 20);
304+
UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
305+
UPDATE qiita.prep_1 SET run_prefix='new' WHERE
306+
sample_id = '1.SKB8.640193';
307+
""")
308+
conn_handler.execute(sql)
309+
310+
raw_data = RawData(3)
311+
params = Preprocessed454Params(1)
312+
prep_template = PrepTemplate(1)
313+
314+
with self.assertRaises(ValueError):
315+
_get_preprocess_fasta_cmd(raw_data, prep_template, params)
316+
317+
def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_2(self):
318+
# Should raise error
319+
conn_handler = SQLConnectionHandler()
320+
sql = ("""
321+
UPDATE qiita.prep_1 SET run_prefix='test1';
322+
UPDATE qiita.prep_1 SET run_prefix='test2' WHERE
323+
sample_id = '1.SKB2.640194';
324+
UPDATE qiita.prep_1 SET run_prefix='error' WHERE
325+
sample_id = '1.SKB8.640193';
326+
""")
327+
conn_handler.execute(sql)
328+
329+
raw_data = RawData(3)
330+
params = Preprocessed454Params(1)
331+
prep_template = PrepTemplate(1)
332+
333+
with self.assertRaises(ValueError):
334+
_get_preprocess_fasta_cmd(raw_data, prep_template, params)
239335

240336
def test_insert_preprocessed_data(self):
241337
study = Study(1)

0 commit comments

Comments
 (0)