Skip to content

Commit

Permalink
#84 MAJOR: Refactor start. Moved logic from TileLayout to DataSource
Browse files Browse the repository at this point in the history
  • Loading branch information
josiahseaman committed Oct 8, 2019
1 parent e9b6aee commit c4d4fbe
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 119 deletions.
13 changes: 12 additions & 1 deletion DDV/DDVUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from collections import defaultdict
from datetime import datetime

from DNASkittleUtils.Contigs import read_contigs
from DNASkittleUtils.Contigs import read_contigs, write_contigs_to_file
from PIL import ImageDraw


Expand Down Expand Up @@ -448,3 +448,14 @@ def viridis_palette():
palette[254] = (250, 230, 34)
palette[255] = (253, 231, 36)
return palette


def write_contigs_to_chunks_dir(project_dir, fasta_name, contigs):
    """Write every contig to its own FASTA file inside the project's chunks folder.

    The files land in ``<project_dir>/chunks/<fasta_name>/`` and are named after the
    contig's position in ``contigs``: ``0.fa``, ``1.fa``, ...

    :param project_dir: root output directory of the project
    :param fasta_name: name of the source FASTA, used as the sub-directory name
    :param contigs: iterable of contigs; each one is handed on its own to
        ``write_contigs_to_file``
    """
    chunks_dir = os.path.join(project_dir, 'chunks', fasta_name)
    try:
        # exist_ok=True already tolerates a directory that is present, so anything
        # raised here is a genuine filesystem problem.
        os.makedirs(chunks_dir, exist_ok=True)
    except OSError as e:
        # Remain best-effort, but stop swallowing KeyboardInterrupt / SystemExit and
        # tell the user why the writes below are likely to fail.
        print("Warning: could not create chunks directory", chunks_dir, "-", e)
    for i, contig in enumerate(contigs):
        filename = os.path.join(chunks_dir, '%i.fa' % i)
        write_contigs_to_file(filename, [contig], verbose=False)
123 changes: 123 additions & 0 deletions DDV/DataSource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from DNASkittleUtils.Contigs import Contig, write_contigs_to_file, read_contigs
import os
from DDVUtils import write_contigs_to_chunks_dir, filter_by_contigs
from DDV.DDVUtils import multi_line_height, copy_to_sources
from Layouts import LayoutFrame


class DataSource:
    """One FASTA input paired with the layout frame that positions it.

    Keeps the contigs read from the file, a handful of per-source display flags,
    and forwards coordinate queries to the wrapped layout frame (``self.coords``).
    """

    def __init__(self, fasta_name: str, sort_contigs: bool, extract_contigs: bool, layouframe: 'LayoutFrame'):
        self.coords = layouframe  # LayoutFrame
        self.contigs = []  # List[Contig]
        self.fasta_name = fasta_name  # str
        # Cached result of contig_struct(); filled in by clear_sequences()
        self.spacing_memory = ''
        self.layout_algorithm = "0"  # rastered tile layout
        self.protein_palette = False
        self.using_spectrum = False
        self.skip_small_titles = False
        self.sort_contigs = sort_contigs
        self.extract_contigs = extract_contigs

    def __getitem__(self, index):
        """Index straight into the wrapped layout frame."""
        return self.coords[index]

    @property
    def base_width(self):
        """Shorthand for the column width of the wrapped layout frame."""
        return self.coords.base_width

    @property
    def origin(self):
        """Origin of the wrapped layout frame."""
        return self.coords.origin

    def relative_position(self, progress):  # Alias for layout: Optimize?
        return self.coords.relative_position(progress)

    def position_on_screen(self, progress):  # Alias for layout: Optimize?
        return self.coords.position_on_screen(progress)

    def output_fasta(self, output_folder, fasta, no_webpage, extract_contigs,
                     append_fasta_sources=True):
        """Save the sequence next to the rendered output, then drop it from memory.

        Unless ``no_webpage`` is set, the contigs are written as per-contig chunks and
        the FASTA is placed in ``<output_folder>/sources``: re-written from
        ``self.contigs`` when the source was customized (extracted or sorted),
        copied verbatim otherwise.

        The ``extract_contigs`` and ``append_fasta_sources`` arguments are accepted
        for call compatibility but are not read here; the customization check uses
        ``self.extract_contigs`` and ``self.sort_contigs``.
        """
        bare_file = os.path.basename(fasta)

        # also make single file
        if not no_webpage:
            write_contigs_to_chunks_dir(output_folder, bare_file, self.contigs)
            fasta_destination = os.path.join(output_folder, 'sources', bare_file)
            if self.extract_contigs or self.sort_contigs:  # customized_fasta
                length_sum = sum(len(c.seq) for c in self.contigs)
                fasta_destination = '%s__%ibp.fa' % (os.path.splitext(fasta_destination)[0], length_sum)
                write_contigs_to_file(fasta_destination, self.contigs)  # shortened fasta
            else:
                copy_to_sources(output_folder, fasta)
            print("Sequence saved in:", fasta_destination)
        self.clear_sequences()

    def clear_sequences(self):
        """Cache the contig spacing description, then release the sequences."""
        self.spacing_memory = self.contig_struct()
        self.contigs = []

    def contig_struct(self):
        """Describe where each contig starts and ends, as a list of plain dicts.

        Once the sequences have been cleared, the value cached by
        ``clear_sequences()`` is returned instead. Only contigs at index 0 through
        1000 are described.
        """
        if not self.contigs and self.spacing_memory:
            return self.spacing_memory  # original value was already cached
        structs = []  # not called `json`, which would shadow the stdlib module name
        xy_seq_start = 0
        for index, contig in enumerate(self.contigs):
            if index > 1000:
                break  # I don't want to use a slice operator on the for loop because that will copy it
            xy_seq_start += contig.reset_padding + contig.title_padding
            xy_seq_end = xy_seq_start + len(contig.seq)
            structs.append(
                {"name": contig.name.replace("'", ""), "xy_seq_start": xy_seq_start, "xy_seq_end": xy_seq_end,
                 "title_padding": contig.title_padding, "tail_padding": contig.tail_padding,
                 "xy_title_start": xy_seq_start - contig.title_padding,
                 "nuc_title_start": contig.nuc_title_start, "nuc_seq_start": contig.nuc_seq_start})
            xy_seq_start += len(contig.seq) + contig.tail_padding
        return structs

    def read_contigs_and_calc_padding(self, input_file_path, extract_contigs=None):
        """Load the contigs of ``input_file_path`` into this data source.

        A file that cannot be decoded as text is loaded as a single contig of raw
        bytes and ``using_spectrum`` is switched on. The contigs are then passed
        through ``filter_by_contigs`` and the first one decides ``protein_palette``.
        With more than 10,000 contigs, small titles are skipped and, if sorting was
        not already requested, the contigs are sorted longest first.
        """
        self.extract_contigs = extract_contigs
        self.fasta_name = os.path.basename(input_file_path)
        try:
            self.contigs = read_contigs(input_file_path)  # TODO:, extract_contigs)
        except UnicodeDecodeError as e:
            print(e)
            print("Important: Non-standard characters detected. Switching to 256 colormap for bytes")
            self.using_spectrum = True
            # Context manager so the binary handle is closed right after reading
            with open(input_file_path, 'rb') as raw_file:
                self.contigs = [Contig(input_file_path, raw_file.read())]
        self.contigs = filter_by_contigs(self.contigs, extract_contigs)
        self.protein_palette = is_protein_sequence(self.contigs[0])

        # if len(self.levels) >= 5 and len(self.contigs[0].seq) > self.levels[4].chunk_size and multipart_file:
        # self.enable_fat_headers() # first contig is huge and there's more contigs coming
        if len(self.contigs) > 10000:
            print("Over 10,000 scaffolds detected! Titles for entries less than 10,000bp will not be drawn.")
            self.skip_small_titles = True
            # Important! Skipping isn't valid unless they're sorted
            if not self.sort_contigs:
                self.sort_contigs = True
                print("Scaffolds are being sorted by length.")
                # Best to bring the largest contigs to the forefront
                self.contigs.sort(key=lambda fragment: -len(fragment.seq))


class PaddedContig(Contig):
    """A Contig that additionally carries three padding counters, all starting at 0."""

    def __init__(self, name, seq):
        super().__init__(name, seq)
        # reset / title / tail padding are filled in later; nothing is padded yet
        self.reset_padding = self.title_padding = self.tail_padding = 0


def is_protein_sequence(contig):
    """Report whether the first 100 characters of ``contig.seq`` contain a peptide-only letter.

    The letters looked for are D, E, F, H, I, K, L, M, P, Q, R, S, V, W, X and Y
    (upper case only). The letters found are also printed.
    """
    peptides = {'D', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'P', 'Q', 'R', 'S', 'V', 'W', 'X', 'Y'}
    leading_letters = set(contig.seq[:100])
    matches = leading_letters & peptides
    print("Found matches", matches)
    return bool(matches)




9 changes: 4 additions & 5 deletions DDV/MultipleAlignmentLayout.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from PIL import Image, ImageDraw

import math
from DDV.TileLayout import hex_to_rgb, TileLayout, is_protein_sequence
from DDV.TileLayout import hex_to_rgb, TileLayout
from natsort import natsorted

from DDV.DDVUtils import make_output_directory
Expand Down Expand Up @@ -101,9 +101,8 @@ def process_all_alignments(self, input_fasta_folder, output_folder, output_file_
print("Initialized Image:", datetime.now() - start_time, "\n")
#TODO: sort all layouts with corresponding sequence?

for file_no, single_MSA in enumerate(self.fasta_sources):
for file_no, single_MSA in enumerate(self.each_layout):
self.i_layout = file_no
self.contigs = self.all_contents[single_MSA]
# self.read_contigs_and_calc_padding(single_MSA, None)
try: # These try catch statements ensure we get at least some output. These jobs can take hours
self.draw_nucleotides()
Expand All @@ -113,7 +112,7 @@ def process_all_alignments(self, input_fasta_folder, output_folder, output_file_
print('Encountered exception while drawing nucleotides:', '\n')
traceback.print_exc()
input_path = os.path.join(input_fasta_folder, single_MSA)
self.output_fasta(output_folder, input_path, False, None, False, append_fasta_sources=False)
single_MSA.output_fasta(output_folder, input_path, False, None, False, append_fasta_sources=False)
print("\nDrew Nucleotides:", datetime.now() - start_time)
self.output_image(output_folder, output_file_name, False)
print("Output Image in:", datetime.now() - start_time)
Expand Down Expand Up @@ -273,11 +272,11 @@ def draw_titles(self):
title_lines, title_width, upper_left, False, self.image)

def spread_large_MSA_source(self, fasta_path):
# TODO: update this with DataSource
individuals = read_contigs(fasta_path)
self.contigs = individuals
self.fasta_sources = [os.path.basename(fasta_path) + str(i) for i in range(len(individuals))]
self.all_contents = {source: [individuals[i]] for i, source in enumerate(self.fasta_sources)}
self.protein_palette = is_protein_sequence(self.contigs[0])

# Zero padding
for name, container in self.all_contents.items():
Expand Down
7 changes: 4 additions & 3 deletions DDV/ParallelGenomeLayout.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, n_genomes, low_contrast=False, base_width=100, column_widths=
if column_widths is None: # just copies the TileLayout levels several times
column_widths = [self.base_width] * n_genomes

self.each_layout = [] # one layout per data source assumed same order as self.fasta_sources
#each_layout = one layout per data source also tracks fasta_source names
# I found that less padding is better for keeping visual patterns coherent over clusters
# of columns. The white space has a disproportionate effect if you space it out too much.
p = 1 # padding_between_layouts
Expand Down Expand Up @@ -64,7 +64,8 @@ def process_file(self, output_folder, output_file_name, fasta_files,

try:
# Do inner work for each file
for index, filename in enumerate(fasta_files):
for index, source in enumerate(self.each_layout):
filename = source.fasta_name
self.changes_per_genome()
if index != 0:
self.read_contigs_and_calc_padding(filename, extract_contigs)
Expand All @@ -73,7 +74,7 @@ def process_file(self, output_folder, output_file_name, fasta_files,
self.draw_titles()
self.genome_processed += 1
print("Drew File:", filename, datetime.now() - start_time)
self.output_fasta(output_folder, filename, False, extract_contigs, self.sort_contigs)
source.output_fasta(output_folder, filename, False, extract_contigs)
except Exception as e:
print('Encountered exception while drawing nucleotides:', '\n')
traceback.print_exc()
Expand Down
Loading

0 comments on commit c4d4fbe

Please sign in to comment.