From e671b31c4d5ffa24760ed44cb96514638cce3c3f Mon Sep 17 00:00:00 2001
From: ryandkuster <ryan.kuster@gmail.com>
Date: Fri, 22 Jul 2022 15:36:35 -0400
Subject: [PATCH] added palindrome check, updated readme

---
 README.md              | 34 +++++++++---------
 readsynth.py           | 80 ++++++++++++++++++++++++++++--------------
 test/test_readsynth.py |  7 +++-
 3 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 268af3a..561ca3c 100644
--- a/README.md
+++ b/README.md
@@ -61,13 +61,13 @@ The above example takes 'abundances.csv' as a genome abundance file **g** with a
 
 ## example 16S library creation
 ```
-readsynth.py -g abundances.csv   -m1 /CCTACGGGNGGCWGCAG /CTGCWGCCNCCCGTAGG -m2 /GACTACHVGGGTATCTAANCC /GGNTTAGATACCCBDGTAGTC -n 1_000_000 -free -l 150 -o /output_directory
+readsynth.py -g abundances.csv -m1 /CCTACGGGNGGCWGCAG -m2 /GACTACHVGGGTATCTAANCC -n 1_000_000 -lp -l 150 -o /output_directory
 ```
-The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward V3-V4 primer along with its reverse complement. Further, this library avoids applying a gaussian size selection step and utilizes the **free** argument to go distribution-free.
+The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward and reverse V3-V4 primers. Further, this library avoids applying a gaussian size selection step and utilizes the **lp** argument to avoid size selection and include reads up to a defined "low-pass" bp length.
 
 ## example isolength library creation
 ```
-readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -o /output_directory
+readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -lp 50 -o /output_directory
 ```
 The above example uses a single, isolength (type IIB) restriction enzyme to produce fragments, and by its nature, requires no size distribution.
 
@@ -76,24 +76,26 @@ The above example uses a single, isolength (type IIB) restriction enzyme to prod
   -h, --help       show this help message and exit
   -g G             path to file genome
   -o O             path to store output
-  -m1 M1 [M1 ...]  space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)
-  -m2 M2 [M2 ...]  space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)
-  -iso ISO         optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)
-  -l L             desired read length of final simulated reads
-  -test            test mode: create newline-separated file of RE digested sequences only
+  -m1 M1 [M1 ...]  space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG)
+  -m2 M2 [M2 ...]  space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC)
+  -iso ISO         optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)
+  -l L, -l1 L      desired R1 read length of final simulated reads
   -n N             total read number
   -u U             mean (in bp) of read lengths after size selection
   -sd SD           standard deviation (in bp) of read lengths after size selection
   -x X             fragment length where fragment distribution intersects size distribution
   -d D             json dictionary of fragment length:count for all expected bp fragments range
-  -free            distribution-free mode: bypass size selection process
-  -c C             percent probability of per-site cut; use '1' for complete digestion of fragments (fragments will not contain internal RE sites)
-  -a1 A1           file containing tab/space-separated adapters and barcode that attach 5' to read
-  -a2 A2           file containing tab/space-separated adapters and barcode that attach 3' to read
-  -a1s A1S         manually provide bp length of adapter a1 before SBS begins
-  -a2s A2S         manually provide bp length of adapter a1 before SBS begins
-  -q1 Q1           file containing newline-separated R1 Q scores >= length -l
-  -q2 Q2           file containing newline-separated R2 Q scores >= length -l
+  -lp LP           low-pass mode: defines maximum expected fragment size, distribution free
+  -c C             optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites)
+  -a1 A1           optional: file containing tab/space-separated adapters and barcode that attach 5' to read
+  -a2 A2           optional: file containing tab/space-separated adapters and barcode that attach 3' to read
+  -a1s A1S         optional: manually provide bp length of adapter a1 before SBS begins
+  -a2s A2S         optional: manually provide bp length of adapter a1 before SBS begins
+  -q1 Q1           optional: file containing newline-separated R1 Q scores >= length -l
+  -q2 Q2           optional: file containing newline-separated R2 Q scores >= length -l
+  -e E             optional: filler base to use if full adapter contaminaton occurs
+  -l2 L2           optional: desired R2 read length of final simulated reads
+  -test            test mode: skip writing simulated fastq files
 ```
 
 # software overview
diff --git a/readsynth.py b/readsynth.py
index 1a81f5b..d4a46ac 100755
--- a/readsynth.py
+++ b/readsynth.py
@@ -31,30 +31,24 @@ def parse_user_input():
                         help='path to store output')
 
     parser.add_argument('-m1', type=str, required='-iso' not in sys.argv, nargs='+',
-                        help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)')
+                        help='space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG)')
 
     parser.add_argument('-m2', type=str, required='-iso' not in sys.argv, nargs='+',
-                        help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)')
+                        help='space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC)')
 
     parser.add_argument('-iso', type=str, required=False,
-                        help='optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)')
+                        help='optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)')
 
-    parser.add_argument('-l1', type=int, required=True,
+    parser.add_argument('-l', '-l1', type=int, required=True,
                         help='desired R1 read length of final simulated reads')
 
-    parser.add_argument('-l2', type=int, required=True,
-                        help='desired R2 read length of final simulated reads')
-
-    parser.add_argument('-test', dest='test', action='store_true',
-                        help='test mode: create newline-separated file of RE digested sequences only')
-
     parser.add_argument('-n', type=int, required=True,
                         help='total read number')
 
-    parser.add_argument('-u', type=int, required='-d' not in sys.argv,
+    parser.add_argument('-u', type=int, required='-sd' in sys.argv,
                         help='mean (in bp) of read lengths after size selection')
 
-    parser.add_argument('-sd', type=int, required='-d' not in sys.argv,
+    parser.add_argument('-sd', type=int, required='-u' in sys.argv,
                         help='standard deviation (in bp) of read lengths after size selection')
 
     parser.add_argument('-x', type=int, required=False,
@@ -63,33 +57,39 @@ def parse_user_input():
     parser.add_argument('-d', type=str, required=False,
                         help='json dictionary of fragment length:count for all expected bp fragments range')
 
-    parser.add_argument('-free', dest='free', action='store_true',
-                        help='distribution-free mode: bypass size selection process')
+    parser.add_argument('-lp', type=int, required=False,
+                        help='low-pass mode: defines maximum expected fragment size, distribution free')
 
     parser.add_argument('-c', type=float, required=False,
-                        help='percent probability of per-site cut; use \'1\' for complete digestion of fragments (fragments will not contain internal RE sites)')
+                        help='optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites)')
 
     parser.add_argument('-a1', type=str, required=False,
-                        help='file containing tab/space-separated adapters and barcode that attach 5\' to read')
+                        help='optional: file containing tab/space-separated adapters and barcode that attach 5\' to read')
 
     parser.add_argument('-a2', type=str, required=False,
-                        help='file containing tab/space-separated adapters and barcode that attach 3\' to read')
+                        help='optional: file containing tab/space-separated adapters and barcode that attach 3\' to read')
 
     parser.add_argument('-a1s', type=int, required=False,
-                        help='manually provide bp length of adapter a1 before SBS begins')
+                        help='optional: manually provide bp length of adapter a1 before SBS begins')
 
     parser.add_argument('-a2s', type=int, required=False,
-                        help='manually provide bp length of adapter a1 before SBS begins')
+                        help='optional: manually provide bp length of adapter a1 before SBS begins')
 
     parser.add_argument('-q1', type=str, required=False,
-                        help='file containing newline-separated R1 Q scores >= length -l')
+                        help='optional: file containing newline-separated R1 Q scores >= length -l')
 
     parser.add_argument('-q2', type=str, required=False,
-                        help='file containing newline-separated R2 Q scores >= length -l')
+                        help='optional: file containing newline-separated R2 Q scores >= length -l')
 
     parser.add_argument('-e', type=str, required=False,
                         help='optional: filler base to use if full adapter contaminaton occurs')
 
+    parser.add_argument('-l2', type=int, required=False,
+                        help='optional: desired R2 read length of final simulated reads')
+
+    parser.add_argument('-test', dest='test', action='store_true',
+                        help='test mode: skip writing simulated fastq files')
+
     args = parser.parse_args()
 
     return args
@@ -119,6 +119,22 @@ def check_for_enzymes_iso(args, re_dt):
     return m1
 
 
+def check_for_palindrome(arg_m):
+    tmp_ls = []
+    for motif in arg_m:
+        cut_pos = motif.index('/')
+        orig = motif.replace('/', '')
+        revc_orig = reverse_comp(orig)
+        if orig == revc_orig:
+            pass
+        else:
+            revc_orig = revc_orig[:cut_pos] + '/' + revc_orig[cut_pos:]
+            tmp_ls.append(revc_orig)
+    arg_m += tmp_ls
+
+    return arg_m
+
+
 def iupac_motifs(arg_m):
     '''
     given a list of RE cut motifs, return a dictionary of regex
@@ -548,7 +564,7 @@ def save_individual_hist(prob_file, args):
         plt.close()
 
     except ValueError:
-        print(f'too few bins to produce histogram for {prob_file}')
+        #print(f'too few bins to produce histogram for {prob_file}') #LOG
         return
 
 
@@ -662,6 +678,8 @@ def simulate_error(command, sim_in, error_out):
         args.motif_dt = {}
         re_dt = open_enzyme_file(args)
         args.m1, args.m2 = check_for_enzymes(args, re_dt)
+        args.m1 = check_for_palindrome(args.m1)
+        args.m2 = check_for_palindrome(args.m2)
         args.motif_dt1 = iupac_motifs(args.m1)
         args.motif_dt.update(args.motif_dt1)
         args.motif_dt2 = iupac_motifs(args.m2)
@@ -671,10 +689,15 @@ def simulate_error(command, sim_in, error_out):
 
     if args.d:
         args.max = check_custom_distribution(args)
-    else:
+    elif args.lp:
+        args.max = args.lp
+    elif args.u and args.sd:
         args.max = args.u + (6*args.sd)
-        if not args.x:
-            args.x = args.u
+    else:
+        sys.exit('must define -d, -lp, or -u/-sd')
+
+    if not args.x:
+        args.x = args.u
 
     if args.a1 and args.a2:
         args.a1 = get_adapters(args.a1)
@@ -690,6 +713,11 @@ def simulate_error(command, sim_in, error_out):
         args.a1, args.a2 = create_adapters_iso(args)
         args.a1s, args.a2s = 62, 58
 
+    args.l1 = args.l
+
+    if not args.l2:
+        args.l2 = args.l1
+
     if args.q1 or args.q2:
         if not args.q1 or not args.q2:
             sys.exit('arguments q1 and q2 required')
@@ -725,7 +753,7 @@ def simulate_error(command, sim_in, error_out):
     and perform simulation of pooled size selection
     '''
     print('\n2. simulating size selection\n')
-    if args.iso or args.free:
+    if args.iso or args.lp:
         fragment_comps = \
             total_freqs.groupby('length')['sum_prob'].apply(list).to_dict()
         fragment_comps = {k: sum(v) for k, v in fragment_comps.items()}
diff --git a/test/test_readsynth.py b/test/test_readsynth.py
index e448689..74ef975 100644
--- a/test/test_readsynth.py
+++ b/test/test_readsynth.py
@@ -87,6 +87,9 @@ def __init__(self):
 class TestInitFunctions(unittest.TestCase):
 
     def test_parse_user_input_1(self):
+        '''
+        missing sd creates exit state
+        '''
         testargs = ['',
                     '-g', 'abundances.csv',
                     '-l1', '150',
@@ -100,8 +103,10 @@ def test_parse_user_input_1(self):
         with patch.object(sys, 'argv', testargs):
             try:
                 args = rs.parse_user_input()
+                success = True
             except SystemExit:
-                assert(True)
+                success = False
+        self.assertEqual(success, False)
 
     def test_parse_user_input_2(self):
         testargs = ['',