-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubpath_to_DNA1.py
executable file
·185 lines (171 loc) · 9.11 KB
/
subpath_to_DNA1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from comparison_sol import str_path
#from comparison_sol import path_str
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import SeqUtils
from Bio.SeqRecord import SeqRecord
import sys
import argparse
import pandas as pd
import random
# This script is used to convert subpaths and path solutions into DNA sequences. It relies on Biopython.
# This function is from comparison_sol.py.
# It converts paths into strings.
def path_str(x: list):
to_list = [str(i) for i in x]
to_string = ",".join(to_list)
return to_string
# This function extracts either the DNA length of each LU or the DNA sequence if SEQ=True. The input is a fasta file which contains LU DNA sequences.
def extract_LU_len(LU_fasta="IXR_BACnewseq.loxpreg.fa", SEQ=False):
LU_ID_len_seq = {}
# Open the reference fasta file
with open(LU_fasta) as handle:
for record in SeqIO.parse(handle, "fasta"):
# Extract the LU integer from the LU_ID. Usually the LU_ID is chrname_start_end_LU. Therefore we need the part after the last "_". This can be access by splitting the LU_ID into a list and by taking the last element [-1].
LU_ID = int(record.id.split("_")[-1])
#print(record.id, LU_ID)
LU_seq = record.seq
if SEQ:
LU_ID_len_seq[LU_ID] = LU_seq
else:
# Only save the LU length
LU_ID_len_seq[LU_ID] = len(LU_seq)
return LU_ID_len_seq
def introduce_mismatches(DNA_seq, percentage_mismatches=0.05):
DNA_seq_mismatches = DNA_seq[:]
bases = ["A", "C", "G", "T"]
num_mismatches = round(len(DNA_seq)*percentage_mismatches)
for _ in range(num_mismatches):
# Find a random position
pos_mis = random.randrange(1, len(DNA_seq))
# Replace the base in that position with a random bases
new_base = random.choice(bases)
DNA_seq_mismatches = DNA_seq_mismatches[:pos_mis] + new_base + DNA_seq_mismatches[pos_mis+1:]
return DNA_seq_mismatches
# This function takes as input a subpath or path and outputs a DNA sequence. It also save this DNA sequence in a new fasta file.
# If the input is a list of paths, it will save each path in a different fasta file.
def path_to_DNA_separate(path=[1,2,3], LU_fasta="IXR_BACnewseq.loxpreg.fa", filename="A_test"):
if isinstance(path, str):
# For example: "1,2,-3" => [1, 2, -3]
path = str_path(path)
# Extract the LU sequences. LU_ID_seq = {LU_ID: LU_seq}
LU_ID_seq = extract_LU_len(LU_fasta=LU_fasta, SEQ=True)
if isinstance(path[0], list): # the input is a list of subpaths
for i in range(len(path)):
path_to_DNA_separate(path=path[i], LU_fasta="IXR_BACnewseq.loxpreg.fa", filename=filename + "_" + str(i))
return None
else:
# Convert the path into a DNA sequence
path_DNA = Seq("")
for LU in path:
if abs(LU) not in LU_ID_seq.keys(): # Note this assumes that all LUs in LU_ID_seq.keys() are positive
print("ERROR! The LU", abs(LU), "in the path is not present in the fasta file", LU_fasta)
print("Check your path or your fasta file!")
#continue
return None
if LU >= 0:
path_DNA = path_DNA + LU_ID_seq[LU]
else:
# The LU is negative. Therefore it is in the reverse orientation.
path_DNA = path_DNA + LU_ID_seq[abs(LU)].reverse_complement()
# Save the path DNA into a fasta file
if isinstance(path[0], int): # There is only one sequence in "subpaths"
path = [path]
path_DNA = [path_DNA]
records = []
for i in range(len(path)):
path_string = path_str(path[i])
Description = "path=" + path_string + ", path_DNA_size=" + str(len(path_DNA[i]))
records.append(SeqRecord(path_DNA[i], "path_" + path_string, description=Description))
SeqIO.write(records, filename + ".sol.fa", "fasta")
return path_DNA
# Use this function to convert paths or subpaths into DNA sequences in fasta files.
# If the input is a list of paths, it will save each path in same fasta file in different lines
def path_to_DNA(path=[1,2,3], LU_fasta="IXR_BACnewseq.loxpreg.fa", filename="A_test", ID="path", mismatches=False, percentage_mismatches=0.05, random_cut_ends=False):
if isinstance(path, str):
# For example: "1,2,-3" => [1, 2, -3]
path = str_path(path)
# Extract the LU sequences. LU_ID_seq = {LU_ID: LU_seq}
LU_ID_seq = extract_LU_len(LU_fasta=LU_fasta, SEQ=True)
if isinstance(path[0], list): # The input is a list of subpaths
# Convert the paths into DNA sequences
path_DNA = []
for i in range(len(path)):
path_DNA_temp = Seq("")
for LU in path[i]:
if abs(LU) not in LU_ID_seq.keys(): # Note this assumes that all LUs in LU_ID_seq.keys() are positive
print("ERROR! The LU", abs(LU), "in the path is not present in the fasta file", LU_fasta)
print("Check your path or your fasta file!")
# continue
return None
if LU >= 0:
path_DNA_temp = path_DNA_temp + LU_ID_seq[LU]
else:
# The LU is negative. Therefore it is in the reverse orientation.
path_DNA_temp = path_DNA_temp + LU_ID_seq[abs(LU)].reverse_complement()
if mismatches:
path_DNA_temp = introduce_mismatches(path_DNA_temp, percentage_mismatches=percentage_mismatches)
if random_cut_ends:
# Cut first end. Find the position where to cut
pos_cut1 = random.randrange(1, len(LU_ID_seq[abs(path[i][0])]))
# Cut second end
pos_cut2 = random.randrange(1, len(LU_ID_seq[abs(path[i][-1])]))
pos_cut2 = len(path_DNA_temp) - pos_cut2
# Make the cuts
#print(pos_cut1, pos_cut2)
path_DNA_temp = path_DNA_temp[pos_cut1:pos_cut2]
# Add the DNA sequence to the list of sequences
path_DNA.append(path_DNA_temp)
else:
# Convert the path into a DNA sequence
path_DNA = Seq("")
for LU in path:
if abs(LU) not in LU_ID_seq.keys(): # Note this assumes that all LUs in LU_ID_seq.keys() are positive
print("ERROR! The LU", abs(LU), "in the path is not present in the fasta file", LU_fasta)
print("Check your path or your fasta file!")
#continue
return None
if LU >= 0:
path_DNA = path_DNA + LU_ID_seq[LU]
else:
# The LU is negative. Therefore it is in the reverse orientation.
path_DNA = path_DNA + LU_ID_seq[abs(LU)].reverse_complement()
if mismatches:
path_DNA = introduce_mismatches(path_DNA, percentage_mismatches=percentage_mismatches)
if random_cut_ends:
# Cut first end. Find the position where to cut
pos_cut1 = random.randrange(1, len(LU_ID_seq[abs(path[0])]))
# Cut second end
pos_cut2 = random.randrange(1, len(LU_ID_seq[abs(path[-1])]))
pos_cut2 = len(path_DNA) - pos_cut2
# Make the cuts
#print(pos_cut1, pos_cut2)
path_DNA = path_DNA[pos_cut1:pos_cut2]
# Save the path DNA into a fasta file
if isinstance(path[0], int): # There is only one sequence in "subpaths"
path = [path]
path_DNA = [path_DNA]
records = []
for i in range(len(path)):
path_string = path_str(path[i])
Description = "path=" + path_string + ", path_DNA_size=" + str(len(path_DNA[i]))
records.append(SeqRecord(path_DNA[i], ID + "_" + str(i+1), description=Description)) # + "_" + path_string
SeqIO.write(records, filename + ".sol.fa", "fasta")
#return path_DNA
return None
if __name__ == '__main__':
# Test the code
LU_fasta = "IXR_BACnewseq.loxpreg.fa"
filename = "A_test"
path = [1, 2, 3, -4]
path = [[3, -4], [4,-5,-6], [7,-8,-9]]
#print(extract_LU_len(LU_fasta=LU_fasta))
#print(introduce_mismatches("ACACACACACACACAC", percentage_mismatches=0.05))
#path_to_DNA(path=path, LU_fasta=LU_fasta, filename=filename, ID="path", mismatches=True, percentage_mismatches=0.05, random_cut_ends=True)
parser = argparse.ArgumentParser(description="Convert the path or subpath into DNA sequences.")
parser.add_argument("-path", "--path", type=str, metavar="", required=True, help="The path/s that you want to convert in DNA sequences")
parser.add_argument("-LU", "--LU_fasta", type=str, metavar="", required=True, help="The fasta file that contain the sequences of the LUs")
parser.add_argument("-name", "--filename", type=str, metavar="", required=False, default="A_test", help="The name of the output file")
parser.add_argument("-ID", "--seq_ID", type=str, metavar="", required=False, default="path", help="The ID of the sequence inside the fasta file")
args = parser.parse_args()
path_to_DNA(path=args.path, LU_fasta=args.LU_fasta, filename=args.filename, ID=args.seq_ID)