minor fixes 0525 #133

Open
wants to merge 11 commits into main

1 change: 1 addition & 0 deletions .gitignore
@@ -133,3 +133,4 @@ tests/data/sample-sheets/*/*/*.csv

# test output
tests/data/output_dir/
tests/data/211021_A00000_0000_SAMPLE/
12 changes: 11 additions & 1 deletion src/qp_klp/Assays.py
@@ -588,8 +588,18 @@ def update_prep_templates(self):

for study_id in self.prep_file_paths:
for prep_fp in self.prep_file_paths[study_id]:
metadata = Assay._parse_prep_file(prep_fp)
afact_name, is_repl = self._generate_artifact_name(prep_fp)

# the preps are created by seqpro within the GenPrepFileJob;
# thus, the "best" place to overwrite the values of the
# metadata are here
if hasattr(self, 'overwrite_prep_with_original') and \
self.overwrite_prep_with_original:
sid = basename(prep_fp).split('.')[1]
afact_name = sid
prep_fp = f'{self.pipeline.output_path}/original-prep.csv'

metadata = Assay._parse_prep_file(prep_fp)
data = {'prep_info': dumps(metadata),
'study': study_id,
'job-id': self.master_qiita_job_id,
16 changes: 13 additions & 3 deletions src/qp_klp/StandardMetagenomicWorkflow.py
@@ -30,6 +30,10 @@ def __init__(self, **kwargs):
# specific children requiring specific parameters.
self.qclient = self.kwargs['qclient']

self.overwrite_prep_with_original = False
if 'overwrite_prep_with_original' in self.kwargs:
self.overwrite_prep_with_original = \
self.kwargs['overwrite_prep_with_original']
self.pipeline = Pipeline(self.kwargs['config_fp'],
self.kwargs['run_identifier'],
self.kwargs['uif_path'],
@@ -93,6 +97,7 @@ def __init__(self, **kwargs):
'please generate one.')
df_summary = pd.read_html(html_summary)[0]
pt.set_index('sample_name', inplace=True)
pt.to_csv(f'{out_dir}/original-prep.csv', sep='\t')

project_name = f'qiita-{pid}-{aid}_{sid}'
# PrepNuQCWorkflow piggy backs on StandardMetagenomicWorkflow and
@@ -141,6 +146,11 @@ def __init__(self, **kwargs):
ValueError(f'The run_prefix {rp} from {k} has {_d.shape[0]} '
'matches with files')

if 'index' not in vals:
vals['index'] = ''
if 'index2' not in vals:
vals['index2'] = ''

sample = {
'sample sheet Sample_ID': rp.replace('.', '_'),
'Sample': k.replace('_', '.'),
@@ -153,10 +163,9 @@
'Project Plate': '',
'Project Name': project_name,
'Well': '',
'# Reads': int(_d.reads.sum()),
'# Reads': f'{_d.reads.sum()}',
'Lane': '1'}
data.append(sample)

sheet = make_sample_sheet(
metadata, pd.DataFrame(data), 'NovaSeqXPlus', [1])

@@ -196,6 +205,7 @@ def __init__(self, **kwargs):
# set 'update_qiita' to False to avoid updating Qiita DB
# and copying files into uploads dir. Useful for testing.
'update_qiita': True,
'is_restart': True}
'is_restart': True,
'overwrite_prep_with_original': True}

super().__init__(**kwargs)
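
For readers following the new flag across files, a condensed paraphrase of the three hunks above (not the verbatim diff):

    # 1) PrepNuQCWorkflow (this hunk) opts in via its kwargs:
    kwargs = {'is_restart': True, 'overwrite_prep_with_original': True}

    # 2) StandardMetagenomicWorkflow.__init__ stores the flag
    #    (False if absent):
    overwrite_prep_with_original = kwargs.get(
        'overwrite_prep_with_original', False)

    # 3) Assays.update_prep_templates swaps in the prep that was written
    #    to {out_dir}/original-prep.csv before re-parsing the metadata:
    if overwrite_prep_with_original:
        prep_fp = 'output_dir/original-prep.csv'  # self.pipeline.output_path
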
1 change: 1 addition & 0 deletions src/qp_klp/WorkflowFactory.py
@@ -19,6 +19,7 @@ class WorkflowFactory():
ST_TO_IN_MAP = {PROTOCOL_NAME_ILLUMINA: ['standard_metag',
'standard_metat',
'absquant_metag',
'abs_quant_metag',

Given that this module already depends on metapool, I would really, really like it if, instead of redefining strings for the same sheet-type names that are (canonically) defined in metapool.sample_sheet, we just used the public values defined there:

https://github.com/biocore/kl-metapool/blob/7bb932f1cb8a69e49c087710e14de046eaef0079/metapool/sample_sheet.py#L43-L47

Also, there still is no absquant_metat (and there are no plans that there ever will be). Using (only) the sheet types from the module where we actually define them would ensure that we never have this sort of nonsense entry in the future. (A sketch of what I mean follows this hunk.)

'absquant_metat'],
PROTOCOL_NAME_TELLSEQ: ['tellseq_metag',
'tellseq_absquant']}
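A minimal sketch of the suggestion above, assuming metapool.sample_sheet exposes the sheet-type names as module-level constants (the constant names below are hypothetical; see the linked lines for the real ones):

    # hypothetical constant names -- use whatever metapool.sample_sheet
    # actually defines at the linked lines
    from metapool.sample_sheet import (STANDARD_METAG_SHEET_TYPE,
                                       STANDARD_METAT_SHEET_TYPE,
                                       ABS_QUANT_METAG_SHEET_TYPE)

    ST_TO_IN_MAP = {PROTOCOL_NAME_ILLUMINA: [STANDARD_METAG_SHEET_TYPE,
                                             STANDARD_METAT_SHEET_TYPE,
                                             ABS_QUANT_METAG_SHEET_TYPE]}
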
53 changes: 21 additions & 32 deletions src/sequence_processing_pipeline/Pipeline.py
@@ -28,7 +28,7 @@
class InstrumentUtils():
types = {'A': 'NovaSeq 6000', 'D': 'HiSeq 2500', 'FS': 'iSeq',
'K': 'HiSeq 4000', 'LH': 'NovaSeq X Plus', 'M': 'MiSeq',
'MN': 'MiniSeq',
'MN': 'MiniSeq', 'SH': 'MiSeq i100',

I am worried about this one, not for this code, but for kl-metapool, which AFAICT doesn't know about MiSeq i100 (as opposed to just regular MiSeq). What do we know about whether it needs its i5 barcode revcomped or not, etc? Is kl-metapool going to need modifications to handle this sequencer?

# SN – RapidRun which is HiSeq 2500
'SN': 'RapidRun'}
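
For context on the comment above, the practical effect of the new entry (illustrative only; the prefix extraction below is an assumption about how InstrumentUtils resolves instrument ids, not the actual implementation):

    # the new entry, with a hypothetical prefix-based lookup
    types = {'SH': 'MiSeq i100'}
    instrument_id = 'SH00252'  # from tests/data/RunInfo_Good3.xml
    prefix = ''.join(c for c in instrument_id if c.isalpha())  # -> 'SH'
    print(types[prefix])  # -> 'MiSeq i100'
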

@@ -911,8 +911,8 @@ def is_sample_sheet(sample_sheet_path):

return False

def _generate_dummy_sample_sheet(self, first_read, last_read,
indexed_reads, dummy_sample_id):
def _generate_dummy_sample_sheet(self, index_cycles, non_index_cycles,
Member Author:

FWIW, the code expected (as this is how old RunInfo.xml files are formed) that the reads would be ordered read1, index1, index2, read2; however, this is not the case in the new versions. Thus, I changed the method to actually be aware of which reads are index reads and which are not.


I am 100% in favor of anything that makes the nature of the inputs more explicit! However, I don't really understand what the inputs should be here. Is it possible to add a docstring for this function, maybe with an example input for each param? (A possible sketch follows this hunk.)

len_index, dummy_sample_id):
# create object and initialize header
sheet = AmpliconSampleSheet()
sheet.Header['IEMFileVersion'] = '4'
Expand All @@ -924,13 +924,13 @@ def _generate_dummy_sample_sheet(self, first_read, last_read,
sheet.Header['Chemistry'] = 'Amplicon'

# generate override_cycles string
tmp = [f"N{x['NumCycles']}" for x in indexed_reads]
tmp = [f"N{index_cycles}" for i in range(len_index)]
tmp = ';'.join(tmp)
override_cycles = f"Y{first_read};{tmp};Y{last_read}"
override_cycles = f"Y{non_index_cycles};{tmp};Y{non_index_cycles}"

# set Reads and Settings according to input values
# we'll get this from the code on the server
sheet.Reads = [first_read, last_read]
sheet.Reads = [non_index_cycles, non_index_cycles]
sheet.Settings['OverrideCycles'] = override_cycles
sheet.Settings['MaskShortReads'] = '1'
sheet.Settings['CreateFastqForIndexReads'] = '1'
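
Re the docstring request above, a possible sketch (illustrative only; example values taken from the RunInfo files under tests/data):

    def _generate_dummy_sample_sheet(self, index_cycles, non_index_cycles,
                                     len_index, dummy_sample_id):
        """Build a minimal amplicon sample sheet from RunInfo.xml geometry.

        Parameters
        ----------
        index_cycles : int
            NumCycles of an index read, e.g. 8 (RunInfo_Good1.xml) or
            12 (RunInfo_Good3.xml).
        non_index_cycles : int
            NumCycles of a non-index read, e.g. 151.
        len_index : int
            Number of index reads in the run; 1 or 2.
        dummy_sample_id : str
            e.g. basename(run_dir) + '_SMPL1'.

        With index_cycles=12, non_index_cycles=151 and len_index=1, the
        resulting OverrideCycles is 'Y151;N12;Y151'.
        """
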
@@ -973,32 +973,21 @@ def generate_dummy_sample_sheet(self, run_dir, output_fp):
else:
raise ValueError("run_dir %s not found." % run_dir)

# assumptions are first and last reads are non-indexed and there
# are always two. Between them there is either 1 or 2 indexed
# reads. If this is not true, raise an Error.

if len(reads) < 3 or len(reads) > 4:
# there must be a first and last read w/a minimum of one read
# in the middle and maximum two in the middle.
raise ValueError("RunInfo.xml contains abnormal reads.")

first_read = reads.pop(0)
last_read = reads.pop()

if (first_read['IsIndexedRead'] is True or
last_read['IsIndexedRead'] is True):
# the assumptions are: if we have 3 reads we should only have 1
# index; and if we have 4 reads, 2 should be index

+1!

index_reads = [r for r in reads if r['IsIndexedRead']]
non_index_reads = [r for r in reads if not r['IsIndexedRead']]
len_index_reads = len(index_reads)
len_non_index_reads = len(non_index_reads)
if len_non_index_reads != 2 or (len_index_reads != 1 and
len_index_reads != 2):
raise ValueError("RunInfo.xml contains abnormal reads.")

# confirm the interior read(s) are indexed ones.
for read in reads:
if read['IsIndexedRead'] is False:
raise ValueError("RunInfo.xml contains abnormal reads.")

dummy_sample_id = basename(run_dir) + '_SMPL1'

sheet = self._generate_dummy_sample_sheet(first_read['NumCycles'],
last_read['NumCycles'],
reads, dummy_sample_id)
sheet = self._generate_dummy_sample_sheet(
index_reads[0]['NumCycles'],
non_index_reads[0]['NumCycles'],
len_index_reads, dummy_sample_id)

with open(output_fp, 'w') as f:
sheet.write(f, 1)
@@ -1009,17 +998,17 @@ def process_reads(reads):
# the contents of each Read element are highly regular.
# for now, process w/out installing xml2dict or other
# library into Qiita env.
found = findall('<Read (.+?) />', reads)
found = findall('<Read (.+?)/>', reads)

results = []
for item in found:
attributes = item.split(' ')
attributes = item.strip().split(' ')
d = {}
for attribute in attributes:
k, v = attribute.split('=')
if k in ['NumCycles', 'Number']:
v = int(v.strip('"'))
elif k in ['IsIndexedRead']:
elif k in ['IsIndexedRead', 'IsReverseComplement']:
v = v.strip('"')
v = False if v == 'N' else True
else:
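For reference, the reason the pattern was relaxed: newer RunInfo.xml files (see tests/data/RunInfo_Good3.xml below) omit the space before '/>'. An illustrative check, not part of the diff:

    from re import findall

    old_style = '<Read Number="1" NumCycles="151" IsIndexedRead="N" />'
    new_style = ('<Read Number="1" NumCycles="12" IsIndexedRead="Y" '
                 'IsReverseComplement="N"/>')

    findall('<Read (.+?) />', new_style)  # [] -- old pattern misses it
    findall('<Read (.+?)/>', old_style)   # matches, with a trailing space,
                                          # which the added strip() removes
    findall('<Read (.+?)/>', new_style)   # matches; strip() is a no-op here
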
10 changes: 6 additions & 4 deletions src/sequence_processing_pipeline/util.py
@@ -1,4 +1,5 @@
import re
from os.path import basename


PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_')
@@ -68,10 +69,11 @@ def iter_paired_files(files):
# using find(), r1_prefix and r2_prefix will be the following:
# r1_prefix will be: LS_8_22_2014
# r2_prefix will be: LS_8_22_2014_R1_SRE_S3_L007
r1_prefix = r1_fp[:r1_fp.rfind(r1_exp)]
r2_prefix = r2_fp[:r2_fp.rfind(r2_exp)]

if r1_prefix != r2_prefix:
# NOTE: we need to use basename to be sure that we are not
# comparing the folder name
r1_prefix = basename(r1_fp[:r1_fp.rfind(r1_exp)])
r2_prefix = basename(r2_fp[:r2_fp.rfind(r2_exp)])
if not r1_prefix or not r2_prefix or r1_prefix != r2_prefix:
raise ValueError(f"Mismatch prefixes:\n{r1_prefix}\n"
f"{r2_prefix}")

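Illustrative behavior of the stricter check (assuming iter_paired_files yields (R1, R2) tuples, as exercised in tests/test_util.py):

    from sequence_processing_pipeline.util import iter_paired_files

    # basename prefixes match ('s1' == 's1') -> yields one (R1, R2) pair
    list(iter_paired_files(['/foo/bar/s1_R1_001.fastq.gz',
                            '/foo/bar/s1_R2_001.fastq.gz']))

    # basename prefixes differ ('a' vs 'ab') -> ValueError: Mismatch prefixes
    list(iter_paired_files(['/foo/bar/a_R1_001.fastq.gz',
                            '/foo/bar/ab_R2_001.fastq.gz']))
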
26 changes: 26 additions & 0 deletions tests/data/RunInfo_Good3.xml
@@ -0,0 +1,26 @@
<?xml version="1.0"?>

Is this one from a NovaSeqXPlus?

<RunInfo Version="6">
<Run Id="20250418_SH00252_0069_ASC2107005-SC3" Number="69">
<Flowcell>BWR98012-2217</Flowcell>
<Instrument>SH00252</Instrument>
<Date>2025-04-18T20:11:37Z</Date>
<Reads>
<Read Number="1" NumCycles="12" IsIndexedRead="Y" IsReverseComplement="N"/>
<Read Number="2" NumCycles="151" IsIndexedRead="N" IsReverseComplement="N"/>
<Read Number="3" NumCycles="151" IsIndexedRead="N" IsReverseComplement="N"/>
</Reads>
<FlowcellLayout LaneCount="1" SurfaceCount="1" SwathCount="9" TileCount="10">
<TileSet TileNamingConvention="FourDigit">
<Tiles>
<Tile>1_1101</Tile>
</Tiles>
</TileSet>
</FlowcellLayout>
<ImageDimensions Width="858" Height="512"/>
<ImageChannels>
<Name>green</Name>
<Name>blue</Name>
</ImageChannels>
</Run>
</RunInfo>

11 changes: 8 additions & 3 deletions tests/test_Pipeline.py
@@ -2330,20 +2330,25 @@ def test_process_run_info_file(self):
Pipeline.AMPLICON_PTYPE)

obs = pipeline.process_run_info_file('tests/data/RunInfo_Good1.xml')

exp = [{'NumCycles': 151, 'Number': 1, 'IsIndexedRead': False},
{'NumCycles': 8, 'Number': 2, 'IsIndexedRead': True},
{'NumCycles': 8, 'Number': 3, 'IsIndexedRead': True},
{'NumCycles': 151, 'Number': 4, 'IsIndexedRead': False}]

self.assertEqual(obs, exp)

obs = pipeline.process_run_info_file('tests/data/RunInfo_Good2.xml')

exp = [{'NumCycles': 151, 'Number': 1, 'IsIndexedRead': False},
{'NumCycles': 8, 'Number': 2, 'IsIndexedRead': True},
{'NumCycles': 151, 'Number': 3, 'IsIndexedRead': False}]
self.assertEqual(obs, exp)

obs = pipeline.process_run_info_file('tests/data/RunInfo_Good3.xml')
exp = [{'IsIndexedRead': True, 'IsReverseComplement': False,
'NumCycles': 12, 'Number': 1},
{'IsIndexedRead': False, 'IsReverseComplement': False,
'NumCycles': 151, 'Number': 2},
{'IsIndexedRead': False, 'IsReverseComplement': False,
'NumCycles': 151, 'Number': 3}]
self.assertEqual(obs, exp)

# a ValueError should be raised when a file that is obviously not
4 changes: 4 additions & 0 deletions tests/test_util.py
@@ -40,6 +40,10 @@ def test_iter_paired_files_mismatch_prefix(self):
with self.assertRaisesRegex(ValueError, "Mismatch prefixes"):
list(iter_paired_files(files))

files = ['/foo/bar/a_R1_001.fastq.gz', '/foo/bar/ab_R2_001.fastq.gz']
with self.assertRaisesRegex(ValueError, "Mismatch prefixes"):
list(iter_paired_files(files))


if __name__ == '__main__':
unittest.main()