3636def prepare_seq (seqs : dict , output_file_name : str ):
3737 """
3838 Prepare DNA sequences for alignment using Clustal Omega.
39-
39+
4040 Converts dictionary of sequences to FASTA format and runs Clustal Omega
4141 to create a multiple sequence alignment file.
42-
42+
4343 Args:
4444 seqs: Dictionary with sequence names as keys and sequences as values
4545 output_file_name: Base name for output alignment file
46-
46+
4747 Raises:
4848 SystemExit: If Clustal Omega execution fails
4949 """
@@ -68,13 +68,13 @@ def prepare_seq(seqs: dict, output_file_name: str):
6868def prepare_formatted_seq (aln_file_name : str ) -> str :
6969 """
7070 Format aligned sequences in triplet notation.
71-
72- Reads a Clustal alignment file and reformats it to add spaces
71+
72+ Reads a Clustal alignment file and reformats it to add spaces
7373 after every three nucleotides for better readability.
74-
74+
7575 Args:
7676 aln_file_name: Path to the Clustal alignment file
77-
77+
7878 Returns:
7979 Formatted alignment string with triplet notation
8080 """
@@ -114,10 +114,10 @@ def prepare_formatted_seq(aln_file_name: str) -> str:
114114def split_line (line : str ):
115115 """
116116 Split an alignment line into sequence name and actual sequence.
117-
117+
118118 Args:
119119 line: A line from the alignment file
120-
120+
121121 Returns:
122122 Tuple of (name, sequence) or None if parsing fails
123123 """
@@ -129,25 +129,25 @@ def split_line(line: str):
129129 return None
130130
131131
132- def mark_sequence_line (sequence_line : str , search_sequence : str , pattern_index : int = None ,
132+ def mark_sequence_line (sequence_line : str , search_sequence : str , pattern_index : int = None ,
133133 preliminary_start : int = None , ignore_spaces : bool = False ):
134134 """
135135 Find all occurrences of a pattern in a sequence line.
136-
136+
137137 This function implements the core sequence search algorithm with two modes:
138138 1. Exact matching - spaces are significant
139139 2. Spaced matching - spaces are ignored during comparison
140-
140+
141141 The function also handles partial matches that might continue on the next line
142142 by tracking preliminary match state.
143-
143+
144144 Args:
145145 sequence_line: The text line containing the sequence to search
146146 search_sequence: The pattern to search for
147147 pattern_index: Index in search_sequence for continuing a previous match
148148 preliminary_start: Starting position of a preliminary match from previous line
149149 ignore_spaces: Whether to ignore spaces when matching
150-
150+
151151 Returns:
152152 Tuple of (matches, match_count, preliminary_state, preliminary_completed)
153153 - matches: List of (start, end) positions of matches
@@ -160,10 +160,10 @@ def mark_sequence_line(sequence_line: str, search_sequence: str, pattern_index:
160160 len_search = len (search_sequence )
161161 len_seq = len (sequence_line )
162162 preliminary = () # Stores state for potential match continuation
163-
163+
164164 # Track if a preliminary match was completed
165165 # None = preliminary match in progress
166- # True = preliminary match successful
166+ # True = preliminary match successful
167167 # False = preliminary match failed or no preliminary match was attempted
168168 preliminary_completed = False if pattern_index is None else None
169169
@@ -189,30 +189,30 @@ def mark_sequence_line(sequence_line: str, search_sequence: str, pattern_index:
189189 pattern_i = 0 # Current position in pattern
190190 sequence_i = current_pos # Current position in sequence
191191 match_start_i = - 1 # Starting position of potential match
192-
192+
193193 # Continue a match from previous line if pattern_index is provided
194194 if pattern_index is not None and pattern_index != - 1 :
195195 pattern_i = pattern_index # Resume pattern matching from this position
196196 if preliminary_start is not None :
197197 match_start_i = preliminary_start # Use provided start position
198198 pattern_index = - 1 # Reset to avoid using this value again
199-
199+
200200 # Character-by-character comparison loop
201201 while pattern_i < len_search and sequence_i < len_seq :
202202 seq_char = sequence_line [sequence_i ]
203-
203+
204204 # Skip spaces in the sequence
205205 if seq_char == ' ' :
206206 sequence_i += 1
207207 # If we haven't started a match yet, update current position too
208208 if match_start_i == - 1 and pattern_i == 0 :
209209 current_pos = sequence_i
210210 continue
211-
211+
212212 # Mark the start of a potential match
213213 if match_start_i == - 1 :
214214 match_start_i = sequence_i
215-
215+
216216 # Character matches
217217 if seq_char == search_sequence [pattern_i ]:
218218 pattern_i += 1
@@ -223,14 +223,14 @@ def mark_sequence_line(sequence_line: str, search_sequence: str, pattern_index:
223223 if preliminary_completed is None :
224224 preliminary_completed = False
225225 break
226-
226+
227227 # Complete match found
228228 if pattern_i == len_search :
229229 match_end_i = sequence_i
230230 matches .append ((match_start_i , match_end_i ))
231231 matches_num += 1
232232 current_pos = match_end_i # Continue search after this match
233-
233+
234234 # If this was a preliminary match continuation, mark it complete
235235 if preliminary_completed is None :
236236 preliminary_completed = True
@@ -239,34 +239,34 @@ def mark_sequence_line(sequence_line: str, search_sequence: str, pattern_index:
239239 # Track partial match at end of line for potential continuation
240240 if pattern_i > 0 and sequence_i == len_seq :
241241 preliminary = (match_start_i , sequence_i , pattern_i )
242-
242+
243243 current_pos += 1 # Try next position
244-
244+
245245 # Store partial match data if at end of sequence
246246 if current_pos == len_seq and match_start_i != - 1 and pattern_i > 0 :
247247 preliminary = (match_start_i , sequence_i , pattern_i )
248-
248+
249249 # Return results
250250 if len (matches ) != 0 :
251251 return matches , matches_num , preliminary , preliminary_completed
252252 else :
253253 return None , 0 , preliminary , preliminary_completed if preliminary_completed is not None else False
254254
255255
256- def process_lines (lines : list , search_sequence : str , sequences_num : int ,
256+ def process_lines (lines : list , search_sequence : str , sequences_num : int ,
257257 ignore_spaces : bool = False ) -> tuple :
258258 """
259259 Process all lines of the alignment to find pattern matches.
260-
260+
261261 This function handles matches that continue across multiple lines by tracking
262262 preliminary (partial) matches for each sequence name.
263-
263+
264264 Args:
265265 lines: All lines from the alignment file
266266 search_sequence: Pattern to search for
267267 sequences_num: Number of sequences in the alignment
268268 ignore_spaces: Whether to ignore spaces during matching
269-
269+
270270 Returns:
271271 Tuple of (results, matches_num):
272272 - results: List of matches for each line (None or list of (start, end) tuples)
@@ -276,48 +276,48 @@ def process_lines(lines: list, search_sequence: str, sequences_num: int,
276276 preliminary = {}
277277 matches_num = 0
278278 results = []
279-
279+
280280 # Number of lines between consecutive occurrences of the same sequence
281281 # (includes sequences + blank/marker lines)
282282 lines_until_next_sequence = sequences_num + 2
283-
283+
284284 # Process each line of the alignment
285285 for i , line in enumerate (lines ):
286286 # Skip header lines (CLUSTAL format has 3 header lines)
287287 if i < 3 :
288288 results .append (None )
289289 continue
290-
290+
291291 # Extract sequence name and content
292292 name_sequence_line = split_line (line )
293293 if name_sequence_line :
294294 name , sequence_line = name_sequence_line
295-
295+
296296 # Check if we have a partial match from previous occurrence of this sequence
297297 if name in preliminary and preliminary .get (name ) != ():
298298 prev_start , _ , prev_pattern_i = preliminary .get (name )
299-
299+
300300 # Continue matching from where we left off
301301 # If this is the immediate next line, use preliminary_start
302302 matches_compound = mark_sequence_line (
303- sequence_line ,
304- search_sequence ,
305- pattern_index = prev_pattern_i ,
303+ sequence_line ,
304+ search_sequence ,
305+ pattern_index = prev_pattern_i ,
306306 preliminary_start = prev_start if i - lines_until_next_sequence < 0 else None ,
307307 ignore_spaces = ignore_spaces
308308 )
309309 else :
310310 # Start fresh match for this sequence
311311 matches_compound = mark_sequence_line (sequence_line , search_sequence , ignore_spaces = ignore_spaces )
312-
312+
313313 # Process match results
314314 if matches_compound is not None :
315315 matches , matches_count , preliminary_value , preliminary_completed = matches_compound
316-
316+
317317 # Handle case where a preliminary match was completed
318318 # This means we need to mark the end of the previous line too
319- if (preliminary_completed and
320- name in preliminary and
319+ if (preliminary_completed and
320+ name in preliminary and
321321 preliminary .get (name ) != ()):
322322 # Find previous line with this sequence
323323 prev_line_index = i - lines_until_next_sequence
@@ -330,10 +330,10 @@ def process_lines(lines: list, search_sequence: str, sequences_num: int,
330330 match_end = len (lines [prev_line_index ].split (' ' , 1 )[1 ]) if ' ' in lines [prev_line_index ] else 0
331331 results [prev_line_index ].append ((match_start , match_end ))
332332 matches_num += 1
333-
333+
334334 # Update preliminary match state for this sequence
335335 preliminary [name ] = preliminary_value
336-
336+
337337 # Add matches for current line
338338 if matches :
339339 matches_num += matches_count # Add the number of matches found on this line to the running total
@@ -344,7 +344,7 @@ def process_lines(lines: list, search_sequence: str, sequences_num: int,
344344 results .append (None )
345345 else :
346346 results .append (None )
347-
347+
348348 return results , matches_num
349349
350350
@@ -356,18 +356,18 @@ def save_matches_html(
356356):
357357 """
358358 Generate an HTML document with highlighted sequence matches.
359-
359+
360360 Creates an HTML file with the alignment where matched patterns
361361 are highlighted with yellow background.
362-
362+
363363 Args:
364364 file_path: Output HTML file path
365365 lines: Alignment lines to display
366366 matches_results: Match positions for each line
367367 page_title: HTML page title
368368 font_name: Font to use for sequence display
369369 max_width_percent: Maximum width of the sequence display area
370-
370+
371371 Returns:
372372 Boolean indicating success or failure
373373 """
@@ -469,10 +469,10 @@ def save_matches_word(
469469):
470470 """
471471 Generate a Word document with highlighted sequence matches.
472-
472+
473473 Creates a Word document with the alignment where matched patterns
474474 are highlighted with the specified color.
475-
475+
476476 Args:
477477 file_path: Output DOCX file path
478478 lines: Alignment lines to display
@@ -481,7 +481,7 @@ def save_matches_word(
481481 font_size: Font size in points
482482 margin_inches: Document margins in inches
483483 highlight_color: Color to use for highlighting matches
484-
484+
485485 Returns:
486486 Boolean indicating success or failure
487487 """
@@ -578,7 +578,7 @@ def save_matches_word(
578578def main ():
579579 """
580580 Main program execution flow.
581-
581+
582582 Steps:
583583 1. Load sequence data from JSON
584584 2. Generate or reuse sequence alignment
@@ -595,8 +595,8 @@ def main():
595595 under certain conditions; refer to the LICENSE file for details.
596596""" )
597597
598- output_path = os . path . join ( os . path . dirname ( __file__ ), "output" )
599- input_path = os . path . join ( os . path . dirname ( __file__ ), "input" )
598+ output_path = "output"
599+ input_path = "input"
600600
601601 os .makedirs (output_path , exist_ok = True )
602602 os .makedirs (input_path , exist_ok = True )
@@ -619,17 +619,16 @@ def main():
619619 if os .path .exists (path_hash ) and os .path .isfile (path_hash ):
620620 with open (path_hash , "r" , encoding = "utf-8" ) as file :
621621 last_hash = file .read ()
622-
622+
623623 # Calculate hash of current sequences
624624 new_hash = hashlib .md5 (str (sequences ).encode ("utf-8" ), usedforsecurity = False ).hexdigest ()
625-
625+
626626 # Generate alignment if hash changed or alignment file doesn't exist
627627 if last_hash != new_hash or not os .path .exists (os .path .join (output_path , "sequences.aln" )) or not os .path .isfile (os .path .join (output_path , "sequences.aln" )):
628628 with open (path_hash , "w" , encoding = "utf-8" ) as file :
629629 file .write (new_hash )
630- print ("Computing DNA sequence alignment..." , end = '' )
631630 prepare_seq (sequences , os .path .join (output_path , "sequences.aln" ))
632- print ("\r Computed DNA sequence alignment. \n " )
631+ print ("Computed DNA sequence alignment.\n " )
633632 else :
634633 print ("Reusing unchanged DNA alignment file.\n " )
635634
@@ -642,11 +641,11 @@ def main():
642641
643642 # Initialize match results list
644643 match_results = []
645-
644+
646645 # Get search pattern from user
647646 search_word = input ("Input DNA sequence to search for (e.g. \" ACC\" or \" \" to disable marking)\n > " ).strip ()
648647 print ()
649-
648+
650649 if len (search_word ) != 0 :
651650 # Determine search mode: exact or ignoring spaces
652651 skip_spaces = True if "space" in input ("Search mode (exact/spaced)\n > " ) else False
@@ -659,13 +658,13 @@ def main():
659658 print (f"{ match_num } matches found.\n " )
660659 else :
661660 print ("Entered empty search phrase.\n " )
662-
661+
663662 # Create output documents with highlighted matches
664663 if save_matches_html (html_output_filename , text_lines , match_results ):
665664 print (f"Saved HTML to '{ html_output_filename } '" )
666665 if save_matches_word (word_output_filename , text_lines , match_results ):
667666 print (f"Saved Word document to '{ word_output_filename } '" )
668-
667+
669668 print ("\n Exiting..." )
670669
671670
0 commit comments