Skip to content

adds EBI submission of shotgun samples #2303

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions qiita_db/metadata_template/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@
PREP_TEMPLATE_COLUMNS = {
# The following columns are required by EBI for submission
'EBI': Restriction(
columns={'primer': str,
'center_name': str,
columns={'center_name': str,
'platform': str,
'instrument_model': str,
'library_construction_protocol': str,
Expand Down
4 changes: 4 additions & 0 deletions qiita_db/support_files/patches/60.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Sep 20, 2017
-- Allowing per_sample_FASTQ to be submitted to EBI
--
-- Sets the can_be_submitted_to_ebi flag to true on the qiita.artifact_type
-- row whose artifact_type is 'per_sample_FASTQ'.

UPDATE qiita.artifact_type SET can_be_submitted_to_ebi = true WHERE artifact_type='per_sample_FASTQ';
2 changes: 1 addition & 1 deletion qiita_pet/handlers/api_proxy/tests/test_prep_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ def test_prep_template_post_req(self):
'\tDemultiplexing disabled.: barcode, primer;',
('\tEBI submission disabled: center_name, '
'experiment_design_description, instrument_model, '
'library_construction_protocol, platform, primer.'),
'library_construction_protocol, platform.'),
('See the Templates tutorial for a description of these '
'fields.')],
'file': 'update.txt',
Expand Down
133 changes: 98 additions & 35 deletions qiita_ware/ebi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# -----------------------------------------------------------------------------

from os.path import basename, join, isdir, isfile, exists
from shutil import copyfile
from os import makedirs, remove, listdir
from datetime import date, timedelta
from urllib import quote
Expand All @@ -26,6 +27,9 @@
from qiita_db.ontology import Ontology
from qiita_db.util import convert_to_id, get_mountpoint, open_file
from qiita_db.artifact import Artifact
from qiita_db.metadata_template.constants import (
TARGET_GENE_DATA_TYPES, PREP_TEMPLATE_COLUMNS_TARGET_GENE)
from qiita_db.processing_job import _system_call as system_call


def clean_whitespace(text):
Expand Down Expand Up @@ -171,10 +175,15 @@ def __init__(self, artifact_id, action):
self.publications = self.study.publications

# getting the restrictions
st_missing = self.sample_template.check_restrictions(
[self.sample_template.columns_restrictions['EBI']])
pt_missing = self.prep_template.check_restrictions(
[self.prep_template.columns_restrictions['EBI']])
st_restrictions = [self.sample_template.columns_restrictions['EBI']]
pt_restrictions = [self.prep_template.columns_restrictions['EBI']]
if self.artifact.data_type in TARGET_GENE_DATA_TYPES:
# adding restrictions on primer and barcode as these are
# conditionally required for target gene
pt_restrictions.append(
PREP_TEMPLATE_COLUMNS_TARGET_GENE['demultiplex'])
st_missing = self.sample_template.check_restrictions(st_restrictions)
pt_missing = self.prep_template.check_restrictions(pt_restrictions)
# testing if there are any missing columns
if st_missing:
error_msgs.append("Missing column in the sample template: %s" %
Expand Down Expand Up @@ -907,6 +916,80 @@ def data_retriever(key, trans_dict):
return (study_accession, sample_accessions, biosample_accessions,
experiment_accessions, run_accessions)

def _generate_demultiplexed_fastq_per_sample_FASTQ(self):
"""Modularity helper"""
ar = self.artifact
fps = [(basename(fp), fp) for _, fp, fpt in ar.filepaths
if fpt == 'raw_forward_seqs']
fps.sort(key=lambda x: x[1])
if 'run_prefix' in self.prep_template.categories():
rps = [(k, v) for k, v in viewitems(
self.prep_template.get_category('run_prefix'))]
else:
rps = [(v, v.split('.', 1)[1]) for v in self.prep_template.keys()]
rps.sort(key=lambda x: x[1])
demux_samples = set()
for sn, rp in rps:
for i, (bn, fp) in enumerate(fps):
if bn.startswith(rp):
demux_samples.add(sn)
new_fp = self.sample_demux_fps[sn]
if fp.endswith('.gz'):
copyfile(fp, new_fp)
else:
cmd = "gzip -c %s > %s" % (fp, new_fp)
stdout, stderr, rv = system_call(cmd)
if rv != 0:
error_msg = (
"Error:\nStd output:%s\nStd error:%s"
% (stdout, stderr))
raise EBISubmissionError(error_msg)
del fps[i]
break
if fps:
error_msg = (
'Discrepancy between filepaths and sample names. Extra'
' filepaths: %s' % ', '.join([fp[0] for fp in fps]))
LogEntry.create('Runtime', error_msg)
raise EBISubmissionError(error_msg)

return demux_samples, \
set(self.samples.keys()).difference(set(demux_samples))

def _generate_demultiplexed_fastq_demux(self, mtime):
    """Writes one gzipped per-sample file out of the artifact's demux file

    The records that ``to_per_sample_ascii`` yields for each sample of the
    prep template are written to ``self.sample_demux_fps[sample]``. Samples
    that yield no records are dropped from ``self.samples``,
    ``self.samples_prep`` and ``self.sample_demux_fps``, and their (empty)
    output file is deleted.

    Parameters
    ----------
    mtime
        Forwarded as the ``mtime`` argument of ``GzipFile``

    Returns
    -------
    set
        The sample names for which at least one record was written

    Raises
    ------
    EBISubmissionError
        If the opened demux file is not a ``File`` instance
    """
    # Only the first `preprocessed_demux` filepath is used; an artifact is
    # expected to hold exactly one
    demux_fps = [fp for _, fp, fp_type in self.artifact.filepaths
                 if fp_type == 'preprocessed_demux']
    demux = demux_fps[0]

    samples_written = set()
    with open_file(demux) as demux_fh:
        if not isinstance(demux_fh, File):
            error_msg = "'%s' doesn't look like a demux file" % demux
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        per_sample = to_per_sample_ascii(
            demux_fh, self.prep_template.keys())
        for sample, records in per_sample:
            out_fp = self.sample_demux_fps[sample]
            is_empty = True
            with GzipFile(out_fp, mode='w', mtime=mtime) as out_fh:
                for rec in records:
                    out_fh.write(rec)
                    is_empty = False

            if not is_empty:
                samples_written.add(sample)
                continue

            # nothing was written: forget the sample and its empty file
            del self.samples[sample]
            del self.samples_prep[sample]
            del self.sample_demux_fps[sample]
            remove(out_fp)

    return samples_written

def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
"""Generates demultiplexed fastq
Expand Down Expand Up @@ -942,39 +1025,16 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
- The demux file couldn't be read
- All samples are removed
"""
ar = self.artifact

dir_not_exists = not isdir(self.full_ebi_dir)
missing_samples = []
if dir_not_exists or rewrite_fastq:
makedirs(self.full_ebi_dir)

# An artifact will hold only one file of type `preprocessed_demux`
# Thus, we only use the first one (the only one present)
demux = [path for _, path, ftype in ar.filepaths
if ftype == 'preprocessed_demux'][0]

demux_samples = set()
with open_file(demux) as demux_fh:
if not isinstance(demux_fh, File):
error_msg = "'%s' doesn't look like a demux file" % demux
LogEntry.create('Runtime', error_msg)
raise EBISubmissionError(error_msg)
for s, i in to_per_sample_ascii(demux_fh,
self.prep_template.keys()):
sample_fp = self.sample_demux_fps[s]
wrote_sequences = False
with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
for record in i:
fh.write(record)
wrote_sequences = True

if wrote_sequences:
demux_samples.add(s)
else:
del(self.samples[s])
del(self.samples_prep[s])
del(self.sample_demux_fps[s])
remove(sample_fp)
if self.artifact.artifact_type == 'per_sample_FASTQ':
demux_samples, missing_samples = \
self._generate_demultiplexed_fastq_per_sample_FASTQ()
else:
demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
else:
demux_samples = set()
extension = '.fastq.gz'
Expand All @@ -984,8 +1044,10 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
if isfile(fpath) and f.endswith(extension):
demux_samples.add(f[:-extension_len])

missing_samples = set(self.samples.keys()).difference(
set(demux_samples))
missing_samples = set(
self.samples.keys()).difference(demux_samples)

if missing_samples:
for ms in missing_samples:
del(self.samples[ms])
del(self.samples_prep[ms])
Expand All @@ -997,4 +1059,5 @@ def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
"do not match.")
LogEntry.create('Runtime', error_msg)
raise EBISubmissionError(error_msg)

return demux_samples
Loading