From e671b31c4d5ffa24760ed44cb96514638cce3c3f Mon Sep 17 00:00:00 2001 From: ryandkuster Date: Fri, 22 Jul 2022 15:36:35 -0400 Subject: [PATCH] added palindrome check, updated readme --- README.md | 34 +++++++++--------- readsynth.py | 80 ++++++++++++++++++++++++++++-------------- test/test_readsynth.py | 7 +++- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 268af3a..561ca3c 100644 --- a/README.md +++ b/README.md @@ -61,13 +61,13 @@ The above example takes 'abundances.csv' as a genome abundance file **g** with a ## example 16S library creation ``` -readsynth.py -g abundances.csv -m1 /CCTACGGGNGGCWGCAG /CTGCWGCCNCCCGTAGG -m2 /GACTACHVGGGTATCTAANCC /GGNTTAGATACCCBDGTAGTC -n 1_000_000 -free -l 150 -o /output_directory +readsynth.py -g abundances.csv -m1 /CCTACGGGNGGCWGCAG -m2 /GACTACHVGGGTATCTAANCC -n 1_000_000 -lp -l 150 -o /output_directory ``` -The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward V3-V4 primer along with its reverse complement. Further, this library avoids applying a gaussian size selection step and utilizes the **free** argument to go distribution-free. +The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward and reverse V3-V4 primers. Further, this library avoids applying a gaussian size selection step and utilizes the **lp** argument to avoid size selection and include reads up to a defined "low-pass" bp length. ## example isolength library creation ``` -readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -o /output_directory +readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -lp 50 -o /output_directory ``` The above example uses a single, isolength (type IIB) restriction enzyme to produce fragments, and by its nature, requires no size distribution. @@ -76,24 +76,26 @@ The above example uses a single, isolength (type IIB) restriction enzyme to prod -h, --help show this help message and exit -g G path to file genome -o O path to store output - -m1 M1 [M1 ...] space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG) - -m2 M2 [M2 ...] space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG) - -iso ISO optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/) - -l L desired read length of final simulated reads - -test test mode: create newline-separated file of RE digested sequences only + -m1 M1 [M1 ...] space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG) + -m2 M2 [M2 ...] space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC) + -iso ISO optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/) + -l L, -l1 L desired R1 read length of final simulated reads -n N total read number -u U mean (in bp) of read lengths after size selection -sd SD standard deviation (in bp) of read lengths after size selection -x X fragment length where fragment distribution intersects size distribution -d D json dictionary of fragment length:count for all expected bp fragments range - -free distribution-free mode: bypass size selection process - -c C percent probability of per-site cut; use '1' for complete digestion of fragments (fragments will not contain internal RE sites) - -a1 A1 file containing tab/space-separated adapters and barcode that attach 5' to read - -a2 A2 file containing tab/space-separated adapters and barcode that attach 3' to read - -a1s A1S manually provide bp length of adapter a1 before SBS begins - -a2s A2S manually provide bp length of adapter a1 before SBS begins - -q1 Q1 file containing newline-separated R1 Q scores >= length -l - -q2 Q2 file containing newline-separated R2 Q scores >= length -l + -lp LP low-pass mode: defines maximum expected fragment size, distribution free + -c C optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites) + -a1 A1 optional: file containing tab/space-separated adapters and barcode that attach 5' to read + -a2 A2 optional: file containing tab/space-separated adapters and barcode that attach 3' to read + -a1s A1S optional: manually provide bp length of adapter a1 before SBS begins + -a2s A2S optional: manually provide bp length of adapter a1 before SBS begins + -q1 Q1 optional: file containing newline-separated R1 Q scores >= length -l + -q2 Q2 optional: file containing newline-separated R2 Q scores >= length -l + -e E optional: filler base to use if full adapter contaminaton occurs + -l2 L2 optional: desired R2 read length of final simulated reads + -test test mode: skip writing simulated fastq files ``` # software overview diff --git a/readsynth.py b/readsynth.py index 1a81f5b..d4a46ac 100755 --- a/readsynth.py +++ b/readsynth.py @@ -31,30 +31,24 @@ def parse_user_input(): help='path to store output') parser.add_argument('-m1', type=str, required='-iso' not in sys.argv, nargs='+', - help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)') + help='space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG)') parser.add_argument('-m2', type=str, required='-iso' not in sys.argv, nargs='+', - help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)') + help='space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC)') parser.add_argument('-iso', type=str, required=False, - help='optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)') + help='optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)') - parser.add_argument('-l1', type=int, required=True, + parser.add_argument('-l', '-l1', type=int, required=True, help='desired R1 read length of final simulated reads') - parser.add_argument('-l2', type=int, required=True, - help='desired R2 read length of final simulated reads') - - parser.add_argument('-test', dest='test', action='store_true', - help='test mode: create newline-separated file of RE digested sequences only') - parser.add_argument('-n', type=int, required=True, help='total read number') - parser.add_argument('-u', type=int, required='-d' not in sys.argv, + parser.add_argument('-u', type=int, required='-sd' in sys.argv, help='mean (in bp) of read lengths after size selection') - parser.add_argument('-sd', type=int, required='-d' not in sys.argv, + parser.add_argument('-sd', type=int, required='-u' in sys.argv, help='standard deviation (in bp) of read lengths after size selection') parser.add_argument('-x', type=int, required=False, @@ -63,33 +57,39 @@ def parse_user_input(): parser.add_argument('-d', type=str, required=False, help='json dictionary of fragment length:count for all expected bp fragments range') - parser.add_argument('-free', dest='free', action='store_true', - help='distribution-free mode: bypass size selection process') + parser.add_argument('-lp', type=int, required=False, + help='low-pass mode: defines maximum expected fragment size, distribution free') parser.add_argument('-c', type=float, required=False, - help='percent probability of per-site cut; use \'1\' for complete digestion of fragments (fragments will not contain internal RE sites)') + help='optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites)') parser.add_argument('-a1', type=str, required=False, - help='file containing tab/space-separated adapters and barcode that attach 5\' to read') + help='optional: file containing tab/space-separated adapters and barcode that attach 5\' to read') parser.add_argument('-a2', type=str, required=False, - help='file containing tab/space-separated adapters and barcode that attach 3\' to read') + help='optional: file containing tab/space-separated adapters and barcode that attach 3\' to read') parser.add_argument('-a1s', type=int, required=False, - help='manually provide bp length of adapter a1 before SBS begins') + help='optional: manually provide bp length of adapter a1 before SBS begins') parser.add_argument('-a2s', type=int, required=False, - help='manually provide bp length of adapter a1 before SBS begins') + help='optional: manually provide bp length of adapter a1 before SBS begins') parser.add_argument('-q1', type=str, required=False, - help='file containing newline-separated R1 Q scores >= length -l') + help='optional: file containing newline-separated R1 Q scores >= length -l') parser.add_argument('-q2', type=str, required=False, - help='file containing newline-separated R2 Q scores >= length -l') + help='optional: file containing newline-separated R2 Q scores >= length -l') parser.add_argument('-e', type=str, required=False, help='optional: filler base to use if full adapter contaminaton occurs') + parser.add_argument('-l2', type=int, required=False, + help='optional: desired R2 read length of final simulated reads') + + parser.add_argument('-test', dest='test', action='store_true', + help='test mode: skip writing simulated fastq files') + args = parser.parse_args() return args @@ -119,6 +119,22 @@ def check_for_enzymes_iso(args, re_dt): return m1 +def check_for_palindrome(arg_m): + tmp_ls = [] + for motif in arg_m: + cut_pos = motif.index('/') + orig = motif.replace('/', '') + revc_orig = reverse_comp(orig) + if orig == revc_orig: + pass + else: + revc_orig = revc_orig[:cut_pos] + '/' + revc_orig[cut_pos:] + tmp_ls.append(revc_orig) + arg_m += tmp_ls + + return arg_m + + def iupac_motifs(arg_m): ''' given a list of RE cut motifs, return a dictionary of regex @@ -548,7 +564,7 @@ def save_individual_hist(prob_file, args): plt.close() except ValueError: - print(f'too few bins to produce histogram for {prob_file}') + #print(f'too few bins to produce histogram for {prob_file}') #LOG return @@ -662,6 +678,8 @@ def simulate_error(command, sim_in, error_out): args.motif_dt = {} re_dt = open_enzyme_file(args) args.m1, args.m2 = check_for_enzymes(args, re_dt) + args.m1 = check_for_palindrome(args.m1) + args.m2 = check_for_palindrome(args.m2) args.motif_dt1 = iupac_motifs(args.m1) args.motif_dt.update(args.motif_dt1) args.motif_dt2 = iupac_motifs(args.m2) @@ -671,10 +689,15 @@ def simulate_error(command, sim_in, error_out): if args.d: args.max = check_custom_distribution(args) - else: + elif args.lp: + args.max = args.lp + elif args.u and args.sd: args.max = args.u + (6*args.sd) - if not args.x: - args.x = args.u + else: + sys.exit('must define -d, -lp, or -u/-sd') + + if not args.x: + args.x = args.u if args.a1 and args.a2: args.a1 = get_adapters(args.a1) @@ -690,6 +713,11 @@ def simulate_error(command, sim_in, error_out): args.a1, args.a2 = create_adapters_iso(args) args.a1s, args.a2s = 62, 58 + args.l1 = args.l + + if not args.l2: + args.l2 = args.l1 + if args.q1 or args.q2: if not args.q1 or not args.q2: sys.exit('arguments q1 and q2 required') @@ -725,7 +753,7 @@ def simulate_error(command, sim_in, error_out): and perform simulation of pooled size selection ''' print('\n2. simulating size selection\n') - if args.iso or args.free: + if args.iso or args.lp: fragment_comps = \ total_freqs.groupby('length')['sum_prob'].apply(list).to_dict() fragment_comps = {k: sum(v) for k, v in fragment_comps.items()} diff --git a/test/test_readsynth.py b/test/test_readsynth.py index e448689..74ef975 100644 --- a/test/test_readsynth.py +++ b/test/test_readsynth.py @@ -87,6 +87,9 @@ def __init__(self): class TestInitFunctions(unittest.TestCase): def test_parse_user_input_1(self): + ''' + missing sd creates exit state + ''' testargs = ['', '-g', 'abundances.csv', '-l1', '150', @@ -100,8 +103,10 @@ def test_parse_user_input_1(self): with patch.object(sys, 'argv', testargs): try: args = rs.parse_user_input() + success = True except SystemExit: - assert(True) + success = False + self.assertEqual(success, False) def test_parse_user_input_2(self): testargs = ['',