Skip to content

Commit a9155b5

Browse files
antgonza authored and ElDeveloper committed
adds EBI submission of shotgun samples (#2303)
* adds EBI submission of shotgun samples
* flake8 fixes
* fix error
* fixing comments after meeting
* addressing @josenavas comments
1 parent 96a174f commit a9155b5

File tree

5 files changed

+240
-48
lines changed

5 files changed

+240
-48
lines changed

qiita_db/metadata_template/constants.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,7 @@
3535
PREP_TEMPLATE_COLUMNS = {
3636
# The following columns are required by EBI for submission
3737
'EBI': Restriction(
38-
columns={'primer': str,
39-
'center_name': str,
38+
columns={'center_name': str,
4039
'platform': str,
4140
'instrument_model': str,
4241
'library_construction_protocol': str,

qiita_db/support_files/patches/60.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-- Sep 20, 2017
2+
-- Allowing per_sample_FASTQ to be submitted to EBI
3+
4+
UPDATE qiita.artifact_type SET can_be_submitted_to_ebi = true WHERE artifact_type='per_sample_FASTQ';

qiita_pet/handlers/api_proxy/tests/test_prep_template.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ def test_prep_template_post_req(self):
391391
'\tDemultiplexing disabled.: barcode, primer;',
392392
('\tEBI submission disabled: center_name, '
393393
'experiment_design_description, instrument_model, '
394-
'library_construction_protocol, platform, primer.'),
394+
'library_construction_protocol, platform.'),
395395
('See the Templates tutorial for a description of these '
396396
'fields.')],
397397
'file': 'update.txt',

qiita_ware/ebi.py

Lines changed: 98 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# -----------------------------------------------------------------------------
88

99
from os.path import basename, join, isdir, isfile, exists
10+
from shutil import copyfile
1011
from os import makedirs, remove, listdir
1112
from datetime import date, timedelta
1213
from urllib import quote
@@ -26,6 +27,9 @@
2627
from qiita_db.ontology import Ontology
2728
from qiita_db.util import convert_to_id, get_mountpoint, open_file
2829
from qiita_db.artifact import Artifact
30+
from qiita_db.metadata_template.constants import (
31+
TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS_TARGET_GENE)
32+
from qiita_db.processing_job import _system_call as system_call
2933

3034

3135
def clean_whitespace(text):
@@ -171,10 +175,15 @@ def __init__(self, artifact_id, action):
171175
self.publications = self.study.publications
172176

173177
# getting the restrictions
174-
st_missing = self.sample_template.check_restrictions(
175-
[self.sample_template.columns_restrictions['EBI']])
176-
pt_missing = self.prep_template.check_restrictions(
177-
[self.prep_template.columns_restrictions['EBI']])
178+
st_restrictions = [self.sample_template.columns_restrictions['EBI']]
179+
pt_restrictions = [self.prep_template.columns_restrictions['EBI']]
180+
if self.artifact.data_type in TARGET_GENE_DATA_TYPES:
181+
# adding restrictions on primer and barcode as these are
182+
# conditionally required for target gene
183+
pt_restrictions.append(
184+
PREP_TEMPLATE_COLUMNS_TARGET_GENE['demultiplex'])
185+
st_missing = self.sample_template.check_restrictions(st_restrictions)
186+
pt_missing = self.prep_template.check_restrictions(pt_restrictions)
178187
# testing if there are any missing columns
179188
if st_missing:
180189
error_msgs.append("Missing column in the sample template: %s" %
@@ -907,6 +916,80 @@ def data_retriever(key, trans_dict):
907916
return (study_accession, sample_accessions, biosample_accessions,
908917
experiment_accessions, run_accessions)
909918

919+
def _generate_demultiplexed_fastq_per_sample_FASTQ(self):
920+
"""Modularity helper"""
921+
ar = self.artifact
922+
fps = [(basename(fp), fp) for _, fp, fpt in ar.filepaths
923+
if fpt == 'raw_forward_seqs']
924+
fps.sort(key=lambda x: x[1])
925+
if 'run_prefix' in self.prep_template.categories():
926+
rps = [(k, v) for k, v in viewitems(
927+
self.prep_template.get_category('run_prefix'))]
928+
else:
929+
rps = [(v, v.split('.', 1)[1]) for v in self.prep_template.keys()]
930+
rps.sort(key=lambda x: x[1])
931+
demux_samples = set()
932+
for sn, rp in rps:
933+
for i, (bn, fp) in enumerate(fps):
934+
if bn.startswith(rp):
935+
demux_samples.add(sn)
936+
new_fp = self.sample_demux_fps[sn]
937+
if fp.endswith('.gz'):
938+
copyfile(fp, new_fp)
939+
else:
940+
cmd = "gzip -c %s > %s" % (fp, new_fp)
941+
stdout, stderr, rv = system_call(cmd)
942+
if rv != 0:
943+
error_msg = (
944+
"Error:\nStd output:%s\nStd error:%s"
945+
% (stdout, stderr))
946+
raise EBISubmissionError(error_msg)
947+
del fps[i]
948+
break
949+
if fps:
950+
error_msg = (
951+
'Discrepancy between filepaths and sample names. Extra'
952+
' filepaths: %s' % ', '.join([fp[0] for fp in fps]))
953+
LogEntry.create('Runtime', error_msg)
954+
raise EBISubmissionError(error_msg)
955+
956+
return demux_samples, \
957+
set(self.samples.keys()).difference(set(demux_samples))
958+
959+
def _generate_demultiplexed_fastq_demux(self, mtime):
960+
"""Modularity helper"""
961+
# An artifact will hold only one file of type
962+
# `preprocessed_demux`. Thus, we only use the first one
963+
# (the only one present)
964+
ar = self.artifact
965+
demux = [path for _, path, ftype in ar.filepaths
966+
if ftype == 'preprocessed_demux'][0]
967+
968+
demux_samples = set()
969+
with open_file(demux) as demux_fh:
970+
if not isinstance(demux_fh, File):
971+
error_msg = (
972+
"'%s' doesn't look like a demux file" % demux)
973+
LogEntry.create('Runtime', error_msg)
974+
raise EBISubmissionError(error_msg)
975+
for s, i in to_per_sample_ascii(demux_fh,
976+
self.prep_template.keys()):
977+
sample_fp = self.sample_demux_fps[s]
978+
wrote_sequences = False
979+
with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
980+
for record in i:
981+
fh.write(record)
982+
wrote_sequences = True
983+
984+
if wrote_sequences:
985+
demux_samples.add(s)
986+
else:
987+
del(self.samples[s])
988+
del(self.samples_prep[s])
989+
del(self.sample_demux_fps[s])
990+
remove(sample_fp)
991+
return demux_samples
992+
910993
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
911994
"""Generates demultiplexed fastq
912995
@@ -942,39 +1025,16 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
9421025
- The demux file couldn't be read
9431026
- All samples are removed
9441027
"""
945-
ar = self.artifact
946-
9471028
dir_not_exists = not isdir(self.full_ebi_dir)
1029+
missing_samples = []
9481030
if dir_not_exists or rewrite_fastq:
9491031
makedirs(self.full_ebi_dir)
9501032

951-
# An artifact will hold only one file of type `preprocessed_demux`
952-
# Thus, we only use the first one (the only one present)
953-
demux = [path for _, path, ftype in ar.filepaths
954-
if ftype == 'preprocessed_demux'][0]
955-
956-
demux_samples = set()
957-
with open_file(demux) as demux_fh:
958-
if not isinstance(demux_fh, File):
959-
error_msg = "'%s' doesn't look like a demux file" % demux
960-
LogEntry.create('Runtime', error_msg)
961-
raise EBISubmissionError(error_msg)
962-
for s, i in to_per_sample_ascii(demux_fh,
963-
self.prep_template.keys()):
964-
sample_fp = self.sample_demux_fps[s]
965-
wrote_sequences = False
966-
with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
967-
for record in i:
968-
fh.write(record)
969-
wrote_sequences = True
970-
971-
if wrote_sequences:
972-
demux_samples.add(s)
973-
else:
974-
del(self.samples[s])
975-
del(self.samples_prep[s])
976-
del(self.sample_demux_fps[s])
977-
remove(sample_fp)
1033+
if self.artifact.artifact_type == 'per_sample_FASTQ':
1034+
demux_samples, missing_samples = \
1035+
self._generate_demultiplexed_fastq_per_sample_FASTQ()
1036+
else:
1037+
demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
9781038
else:
9791039
demux_samples = set()
9801040
extension = '.fastq.gz'
@@ -984,8 +1044,10 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
9841044
if isfile(fpath) and f.endswith(extension):
9851045
demux_samples.add(f[:-extension_len])
9861046

987-
missing_samples = set(self.samples.keys()).difference(
988-
set(demux_samples))
1047+
missing_samples = set(
1048+
self.samples.keys()).difference(demux_samples)
1049+
1050+
if missing_samples:
9891051
for ms in missing_samples:
9901052
del(self.samples[ms])
9911053
del(self.samples_prep[ms])
@@ -997,4 +1059,5 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
9971059
"do not match.")
9981060
LogEntry.create('Runtime', error_msg)
9991061
raise EBISubmissionError(error_msg)
1062+
10001063
return demux_samples

0 commit comments

Comments
 (0)