Skip to content

Commit 84beb81

Browse files
committed
Fixing _get_qiime_minimal_mapping
1 parent d464237 commit 84beb81

File tree

2 files changed

+53
-49
lines changed

2 files changed

+53
-49
lines changed

qiita_ware/processing_pipeline.py

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,15 @@
2222

2323
def _get_qiime_minimal_mapping(prep_template, out_dir):
2424
"""Generates a minimal QIIME-compliant mapping file for split libraries
25-
2625
The columns of the generated file are, in order: SampleID, BarcodeSequence,
2726
LinkerPrimerSequence, Description. All values are taken from the prep
2827
template except for Description, which always receives the value "Qiita MMF"
29-
3028
Parameters
3129
----------
3230
prep_template : PrepTemplate
3331
The prep template from which we need to generate the minimal mapping
3432
out_dir : str
3533
Path to the output directory
36-
3734
Returns
3835
-------
3936
list of str
@@ -43,30 +40,36 @@ def _get_qiime_minimal_mapping(prep_template, out_dir):
4340
from os.path import join
4441
import pandas as pd
4542

46-
# Get the data in a pandas DataFrame, so it is easier to manage
47-
pt = prep_template.to_dataframe()
43+
# The prep template has a QIIME mapping file, get it
44+
qiime_map = pd.read_csv(prep_template.qiime_map_fp, sep='\t',
45+
keep_default_na=False, na_values=['unknown'],
46+
index_col=False)
47+
qiime_map.set_index('#SampleID', inplace=True, drop=True)
4848

49-
# We now need to rename some columns to be QIIME compliant.
50-
# Hopefully, this conversion won't be needed if QIIME relaxes its
51-
# constraints
52-
pt.rename(columns={'barcodesequence': 'BarcodeSequence',
53-
'linkerprimersequence': 'LinkerPrimerSequence'},
54-
inplace=True)
55-
pt['Description'] = pd.Series(['Qiita MMF'] * len(pt.index),
56-
index=pt.index)
49+
# We use our own description to avoid potential processing problems
50+
qiime_map['Description'] = pd.Series(['Qiita MMF'] * len(qiime_map.index),
51+
index=qiime_map.index)
5752

5853
# We ensure the order of the columns as QIIME is expecting
5954
cols = ['BarcodeSequence', 'LinkerPrimerSequence', 'Description']
6055

61-
# If the study has more than 1 lane, we should generate a qiita MMF for
62-
# each of the lanes. We know how to split the prep template based on
63-
# the run_prefix column
64-
output_fps = []
6556
path_builder = partial(join, out_dir)
66-
for prefix, df in pt.groupby('run_prefix'):
67-
df = df[cols]
68-
out_fp = path_builder("%s_MMF.txt" % prefix)
69-
output_fps.append(out_fp)
57+
if 'run_prefix' in qiime_map:
58+
# The study potentially has more than 1 lane, so we should generate a
59+
# qiita MMF for each of the lanes. We know how to split the prep
60+
# template based on the run_prefix column
61+
output_fps = []
62+
for prefix, df in qiime_map.groupby('run_prefix'):
63+
df = df[cols]
64+
# Sorting for consistency between serializations (NOTE(review): no sort call is visible in this hunk - confirm)
65+
out_fp = path_builder("%s_MMF.txt" % prefix)
66+
output_fps.append(out_fp)
67+
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
68+
else:
69+
# The study only has one lane, just write the MMF
70+
df = qiime_map[cols]
71+
out_fp = path_builder("prep_%d_MMF.txt" % prep_template.id)
72+
output_fps = [out_fp]
7073
df.to_csv(out_fp, index_label="#SampleID", sep='\t')
7174

7275
return output_fps

qiita_ware/test/test_processing_pipeline.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ def test_get_qiime_minimal_mapping_multiple(self):
7676
'ebi_submission_accession': None,
7777
'EMP_status': 'EMP',
7878
'str_column': 'Value for sample 1',
79-
'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
80-
'barcodesequence': 'GTCCGCAAGTTA',
79+
'primer': 'GTGCCAGCMGCCGCGGTAA',
80+
'barcode': 'GTCCGCAAGTTA',
8181
'run_prefix': "s_G1_L001_sequences",
8282
'platform': 'ILLUMINA',
8383
'library_construction_protocol': 'AAA',
@@ -87,8 +87,8 @@ def test_get_qiime_minimal_mapping_multiple(self):
8787
'ebi_submission_accession': None,
8888
'EMP_status': 'EMP',
8989
'str_column': 'Value for sample 2',
90-
'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
91-
'barcodesequence': 'CGTAGAGCTCTC',
90+
'primer': 'GTGCCAGCMGCCGCGGTAA',
91+
'barcode': 'CGTAGAGCTCTC',
9292
'run_prefix': "s_G1_L001_sequences",
9393
'platform': 'ILLUMINA',
9494
'library_construction_protocol': 'AAA',
@@ -98,8 +98,8 @@ def test_get_qiime_minimal_mapping_multiple(self):
9898
'ebi_submission_accession': None,
9999
'EMP_status': 'EMP',
100100
'str_column': 'Value for sample 3',
101-
'linkerprimersequence': 'GTGCCAGCMGCCGCGGTAA',
102-
'barcodesequence': 'CCTCTGAGAGCT',
101+
'primer': 'GTGCCAGCMGCCGCGGTAA',
102+
'barcode': 'CCTCTGAGAGCT',
103103
'run_prefix': "s_G1_L002_sequences",
104104
'platform': 'ILLUMINA',
105105
'library_construction_protocol': 'AAA',
@@ -217,6 +217,7 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix(self):
217217
raw_data = RawData(3)
218218
params = Preprocessed454Params(1)
219219
prep_template = PrepTemplate(1)
220+
prep_template.generate_files()
220221

221222
obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
222223
raw_data, prep_template, params)
@@ -245,21 +246,21 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix_match(self):
245246
new_fp_id = get_count('qiita.filepath') + 1
246247
conn_handler = SQLConnectionHandler()
247248
sql = ("""
248-
INSERT INTO qiita.filepath (filepath_id, filepath,
249-
filepath_type_id, checksum, checksum_algorithm_id,
250-
data_directory_id) VALUES (%s, '1_new.sff', 17, 852952723, 1,
251-
5);
252-
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
253-
(3, %s);
249+
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum,
250+
checksum_algorithm_id, data_directory_id)
251+
VALUES ('1_new.sff', 17, 852952723, 1, 5);
252+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
253+
VALUES (3, %s);
254254
UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
255-
UPDATE qiita.prep_1 SET run_prefix='new' WHERE
256-
sample_id = '1.SKB8.640193';
255+
UPDATE qiita.prep_1 SET run_prefix='new'
256+
WHERE sample_id = '1.SKB8.640193';
257257
""")
258-
conn_handler.execute(sql, (new_fp_id, new_fp_id))
258+
conn_handler.execute(sql, (new_fp_id,))
259259

260260
raw_data = RawData(3)
261261
params = Preprocessed454Params(1)
262262
prep_template = PrepTemplate(1)
263+
prep_template.generate_files()
263264

264265
obs_cmd, obs_output_dir = _get_preprocess_fasta_cmd(
265266
raw_data, prep_template, params)
@@ -291,28 +292,27 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_1(self):
291292
fp_count = get_count('qiita.filepath')
292293
conn_handler = SQLConnectionHandler()
293294
sql = ("""
294-
INSERT INTO qiita.filepath (filepath_id, filepath,
295-
filepath_type_id, checksum, checksum_algorithm_id,
296-
data_directory_id) VALUES (%s, '1_new.sff', 17, 852952723, 1,
297-
5);
298-
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
299-
(3, %s);
300-
INSERT INTO qiita.filepath (filepath_id, filepath,
301-
filepath_type_id, checksum, checksum_algorithm_id,
302-
data_directory_id) VALUES (%s, '1_error.sff', 17, 852952723,
303-
1, 5);
304-
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id) VALUES
305-
(3, %s);
295+
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum,
296+
checksum_algorithm_id, data_directory_id)
297+
VALUES ('1_new.sff', 17, 852952723, 1, 5);
298+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
299+
VALUES (3, %s);
300+
INSERT INTO qiita.filepath (filepath, filepath_type_id, checksum,
301+
checksum_algorithm_id, data_directory_id)
302+
VALUES ('1_error.sff', 17, 852952723, 1, 5);
303+
INSERT INTO qiita.raw_filepath (raw_data_id , filepath_id)
304+
VALUES (3, %s);
306305
UPDATE qiita.prep_1 SET run_prefix='preprocess_test';
307306
UPDATE qiita.prep_1 SET run_prefix='new' WHERE
308307
sample_id = '1.SKB8.640193';
309308
""")
310309
conn_handler.execute(
311-
sql, (fp_count + 1, fp_count + 1, fp_count + 2, fp_count + 2))
310+
sql, (fp_count + 1, fp_count + 2))
312311

313312
raw_data = RawData(3)
314313
params = Preprocessed454Params(1)
315314
prep_template = PrepTemplate(1)
315+
prep_template.generate_files()
316316

317317
with self.assertRaises(ValueError):
318318
_get_preprocess_fasta_cmd(raw_data, prep_template, params)
@@ -332,6 +332,7 @@ def test_get_preprocess_fasta_cmd_sff_run_prefix_match_error_2(self):
332332
raw_data = RawData(3)
333333
params = Preprocessed454Params(1)
334334
prep_template = PrepTemplate(1)
335+
prep_template.generate_files()
335336

336337
with self.assertRaises(ValueError):
337338
_get_preprocess_fasta_cmd(raw_data, prep_template, params)

0 commit comments

Comments
 (0)