Skip to content

Commit

Permalink
Merge branch 'add_palindrome_test'
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandkuster committed Jul 22, 2022
2 parents 5a522ca + e671b31 commit 452784c
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 43 deletions.
34 changes: 18 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,13 @@ The above example takes 'abundances.csv' as a genome abundance file **g** with a

## example 16S library creation
```
readsynth.py -g abundances.csv -m1 /CCTACGGGNGGCWGCAG /CTGCWGCCNCCCGTAGG -m2 /GACTACHVGGGTATCTAANCC /GGNTTAGATACCCBDGTAGTC -n 1_000_000 -free -l 150 -o /output_directory
readsynth.py -g abundances.csv -m1 /CCTACGGGNGGCWGCAG -m2 /GACTACHVGGGTATCTAANCC -n 1_000_000 -lp -l 150 -o /output_directory
```
The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward V3-V4 primer along with its reverse complement. Further, this library avoids applying a gaussian size selection step and utilizes the **free** argument to go distribution-free.
The above example differs from the ddRADseq library creation in that in place of a single, palindromic restriction motif for -m1, we now provide the forward and reverse V3-V4 primers. Further, this library avoids applying a gaussian size selection step and utilizes the **lp** argument to avoid size selection and include reads up to a defined "low-pass" bp length.

## example isolength library creation
```
readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -o /output_directory
readsynth.py -g abundances.csv -iso BcgI -n 1_000_000 -l 150 -lp 50 -o /output_directory
```
The above example uses a single, isolength (type IIB) restriction enzyme to produce fragments, and by its nature, requires no size distribution.

Expand All @@ -76,24 +76,26 @@ The above example uses a single, isolength (type IIB) restriction enzyme to prod
-h, --help show this help message and exit
-g G path to file genome
-o O path to store output
-m1 M1 [M1 ...] space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)
-m2 M2 [M2 ...] space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)
-iso ISO optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)
-l L desired read length of final simulated reads
-test test mode: create newline-separated file of RE digested sequences only
-m1 M1 [M1 ...] space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG)
-m2 M2 [M2 ...] space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC)
-iso ISO optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)
-l L, -l1 L desired R1 read length of final simulated reads
-n N total read number
-u U mean (in bp) of read lengths after size selection
-sd SD standard deviation (in bp) of read lengths after size selection
-x X fragment length where fragment distribution intersects size distribution
-d D json dictionary of fragment length:count for all expected bp fragments range
-free distribution-free mode: bypass size selection process
-c C percent probability of per-site cut; use '1' for complete digestion of fragments (fragments will not contain internal RE sites)
-a1 A1 file containing tab/space-separated adapters and barcode that attach 5' to read
-a2 A2 file containing tab/space-separated adapters and barcode that attach 3' to read
-a1s A1S manually provide bp length of adapter a1 before SBS begins
-a2s A2S manually provide bp length of adapter a1 before SBS begins
-q1 Q1 file containing newline-separated R1 Q scores >= length -l
-q2 Q2 file containing newline-separated R2 Q scores >= length -l
-lp LP low-pass mode: defines maximum expected fragment size, distribution free
-c C optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites)
-a1 A1 optional: file containing tab/space-separated adapters and barcode that attach 5' to read
-a2 A2 optional: file containing tab/space-separated adapters and barcode that attach 3' to read
-a1s A1S optional: manually provide bp length of adapter a1 before SBS begins
-a2s A2S optional: manually provide bp length of adapter a1 before SBS begins
-q1 Q1 optional: file containing newline-separated R1 Q scores >= length -l
-q2 Q2 optional: file containing newline-separated R2 Q scores >= length -l
-e E optional: filler base to use if full adapter contaminaton occurs
-l2 L2 optional: desired R2 read length of final simulated reads
-test test mode: skip writing simulated fastq files
```

# software overview
Expand Down
80 changes: 54 additions & 26 deletions readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,30 +31,24 @@ def parse_user_input():
help='path to store output')

parser.add_argument('-m1', type=str, required='-iso' not in sys.argv, nargs='+',
help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)')
help='space separated list of search motifs (e.g., RE motifs AluI or AG/CT, or 16S primer /CCTACGGGNGGCWGCAG)')

parser.add_argument('-m2', type=str, required='-iso' not in sys.argv, nargs='+',
help='space separated list of RE motifs (e.g., AluI or AG/CT, HindIII or A/AGCTT, SmlI or C/TYRAG)')
help='space separated list of search motifs (e.g., RE motifs SmlI or C/TYRAG, or 16S primer /GACTACHVGGGTATCTAANCC)')

parser.add_argument('-iso', type=str, required=False,
help='optional type IIB RE motif (e.g., NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)')
help='optional type IIB RE motif (e.g., BcgI or NN/NNNNNNNNNNCGANNNNNNTGCNNNNNNNNNNNN/)')

parser.add_argument('-l1', type=int, required=True,
parser.add_argument('-l', '-l1', type=int, required=True,
help='desired R1 read length of final simulated reads')

parser.add_argument('-l2', type=int, required=True,
help='desired R2 read length of final simulated reads')

parser.add_argument('-test', dest='test', action='store_true',
help='test mode: create newline-separated file of RE digested sequences only')

parser.add_argument('-n', type=int, required=True,
help='total read number')

parser.add_argument('-u', type=int, required='-d' not in sys.argv,
parser.add_argument('-u', type=int, required='-sd' in sys.argv,
help='mean (in bp) of read lengths after size selection')

parser.add_argument('-sd', type=int, required='-d' not in sys.argv,
parser.add_argument('-sd', type=int, required='-u' in sys.argv,
help='standard deviation (in bp) of read lengths after size selection')

parser.add_argument('-x', type=int, required=False,
Expand All @@ -63,33 +57,39 @@ def parse_user_input():
parser.add_argument('-d', type=str, required=False,
help='json dictionary of fragment length:count for all expected bp fragments range')

parser.add_argument('-free', dest='free', action='store_true',
help='distribution-free mode: bypass size selection process')
parser.add_argument('-lp', type=int, required=False,
help='low-pass mode: defines maximum expected fragment size, distribution free')

parser.add_argument('-c', type=float, required=False,
help='percent probability of per-site cut; use \'1\' for complete digestion of fragments (fragments will not contain internal RE sites)')
help='optional: percent probability of per-site cut; default 1 for complete digestion of fragments (fragments will not contain internal RE sites)')

parser.add_argument('-a1', type=str, required=False,
help='file containing tab/space-separated adapters and barcode that attach 5\' to read')
help='optional: file containing tab/space-separated adapters and barcode that attach 5\' to read')

parser.add_argument('-a2', type=str, required=False,
help='file containing tab/space-separated adapters and barcode that attach 3\' to read')
help='optional: file containing tab/space-separated adapters and barcode that attach 3\' to read')

parser.add_argument('-a1s', type=int, required=False,
help='manually provide bp length of adapter a1 before SBS begins')
help='optional: manually provide bp length of adapter a1 before SBS begins')

parser.add_argument('-a2s', type=int, required=False,
help='manually provide bp length of adapter a1 before SBS begins')
help='optional: manually provide bp length of adapter a1 before SBS begins')

parser.add_argument('-q1', type=str, required=False,
help='file containing newline-separated R1 Q scores >= length -l')
help='optional: file containing newline-separated R1 Q scores >= length -l')

parser.add_argument('-q2', type=str, required=False,
help='file containing newline-separated R2 Q scores >= length -l')
help='optional: file containing newline-separated R2 Q scores >= length -l')

parser.add_argument('-e', type=str, required=False,
help='optional: filler base to use if full adapter contaminaton occurs')

parser.add_argument('-l2', type=int, required=False,
help='optional: desired R2 read length of final simulated reads')

parser.add_argument('-test', dest='test', action='store_true',
help='test mode: skip writing simulated fastq files')

args = parser.parse_args()

return args
Expand Down Expand Up @@ -119,6 +119,22 @@ def check_for_enzymes_iso(args, re_dt):
return m1


def check_for_palindrome(arg_m):
tmp_ls = []
for motif in arg_m:
cut_pos = motif.index('/')
orig = motif.replace('/', '')
revc_orig = reverse_comp(orig)
if orig == revc_orig:
pass
else:
revc_orig = revc_orig[:cut_pos] + '/' + revc_orig[cut_pos:]
tmp_ls.append(revc_orig)
arg_m += tmp_ls

return arg_m


def iupac_motifs(arg_m):
'''
given a list of RE cut motifs, return a dictionary of regex
Expand Down Expand Up @@ -548,7 +564,7 @@ def save_individual_hist(prob_file, args):
plt.close()

except ValueError:
print(f'too few bins to produce histogram for {prob_file}')
#print(f'too few bins to produce histogram for {prob_file}') #LOG
return


Expand Down Expand Up @@ -662,6 +678,8 @@ def simulate_error(command, sim_in, error_out):
args.motif_dt = {}
re_dt = open_enzyme_file(args)
args.m1, args.m2 = check_for_enzymes(args, re_dt)
args.m1 = check_for_palindrome(args.m1)
args.m2 = check_for_palindrome(args.m2)
args.motif_dt1 = iupac_motifs(args.m1)
args.motif_dt.update(args.motif_dt1)
args.motif_dt2 = iupac_motifs(args.m2)
Expand All @@ -671,10 +689,15 @@ def simulate_error(command, sim_in, error_out):

if args.d:
args.max = check_custom_distribution(args)
else:
elif args.lp:
args.max = args.lp
elif args.u and args.sd:
args.max = args.u + (6*args.sd)
if not args.x:
args.x = args.u
else:
sys.exit('must define -d, -lp, or -u/-sd')

if not args.x:
args.x = args.u

if args.a1 and args.a2:
args.a1 = get_adapters(args.a1)
Expand All @@ -690,6 +713,11 @@ def simulate_error(command, sim_in, error_out):
args.a1, args.a2 = create_adapters_iso(args)
args.a1s, args.a2s = 62, 58

args.l1 = args.l

if not args.l2:
args.l2 = args.l1

if args.q1 or args.q2:
if not args.q1 or not args.q2:
sys.exit('arguments q1 and q2 required')
Expand Down Expand Up @@ -725,7 +753,7 @@ def simulate_error(command, sim_in, error_out):
and perform simulation of pooled size selection
'''
print('\n2. simulating size selection\n')
if args.iso or args.free:
if args.iso or args.lp:
fragment_comps = \
total_freqs.groupby('length')['sum_prob'].apply(list).to_dict()
fragment_comps = {k: sum(v) for k, v in fragment_comps.items()}
Expand Down
7 changes: 6 additions & 1 deletion test/test_readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def __init__(self):
class TestInitFunctions(unittest.TestCase):

def test_parse_user_input_1(self):
'''
missing sd creates exit state
'''
testargs = ['',
'-g', 'abundances.csv',
'-l1', '150',
Expand All @@ -100,8 +103,10 @@ def test_parse_user_input_1(self):
with patch.object(sys, 'argv', testargs):
try:
args = rs.parse_user_input()
success = True
except SystemExit:
assert(True)
success = False
self.assertEqual(success, False)

def test_parse_user_input_2(self):
testargs = ['',
Expand Down

0 comments on commit 452784c

Please sign in to comment.