Skip to content

Commit

Permalink
Implements all changes for merging into master
Browse files Browse the repository at this point in the history
  • Loading branch information
samhorsfield96 committed Oct 18, 2024
1 parent 629d5d2 commit 5cb6416
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 80 deletions.
4 changes: 2 additions & 2 deletions src/ORF_scoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ std::vector<float> score_TIS (const std::vector<std::tuple<std::string, std::str
}

// ensure sequence is padded if too short
const size_t len_diff = 32 - encoded.size();
const int len_diff = 32 - encoded.size();
for (int pad = 0; pad < len_diff; pad++)
{
encoded.push_back(0);
Expand All @@ -104,7 +104,7 @@ std::vector<float> score_TIS (const std::vector<std::tuple<std::string, std::str

if (!pos_idx.empty())
{
// pad tensor to 32 bp if only single sequence
// pad tensor to 32 bp if only single sequence, scoring guaranteed to be on 32 length vector
if (pos_idx.size() == 1)
{
torch::Tensor zeroes = torch::zeros({32}, torch::kInt64);
Expand Down
6 changes: 1 addition & 5 deletions src/call_ORFs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,12 +323,8 @@ void generate_ORFs(const int& colour_ID,
std::string start_site_AA = translate(start_site_DNA);
site_hash = hasher{}(start_site_AA);

int num_kmers = start_site_AA.size() - aa_kmer;
// ensure if small start found, can still generate sequence
if (num_kmers <= 0)
{
num_kmers = 1;
}
int num_kmers = start_site_AA.size() > aa_kmer ? start_site_AA.size() - aa_kmer : 1;

site_coverage.resize(num_kmers);

Expand Down
109 changes: 36 additions & 73 deletions src/indexing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,35 @@ std::vector<std::pair<Kmer, bool>> get_neighbours (const T& neighbour_iterator)
return neighbour_vector;
}

// Registers the colours of a unitig against every amino-acid k-mer taken from
// a translated start site.
//
// start_site_AA       amino-acid sequence the k-mers are extracted from
// full_unitig_colour  per-colour bits; index i is tested for i in [0, nb_colours)
// start_freq_set      map from amino-acid k-mer to the set of colour indices
//                     recorded for it; updated in place
// aa_kmer             k-mer length, forwarded to get_kmers
// nb_colours          number of colour indices to inspect
void calc_start_freq (std::string& start_site_AA,
                      const boost::dynamic_bitset<>& full_unitig_colour,
                      tbb::concurrent_unordered_map<std::string, tbb::concurrent_unordered_set<int>>& start_freq_set,
                      const int& aa_kmer,
                      const size_t& nb_colours)
{
    // a start site that is not longer than aa_kmer still produces one k-mer
    int kmer_count = 1;
    if (start_site_AA.size() > aa_kmer)
    {
        kmer_count = start_site_AA.size() - aa_kmer;
    }

    // extract every k-mer once, up front, so the colour loop below only does insertions
    std::vector<std::string> kmer_list(kmer_count);
    for (int offset = 0; offset < kmer_count; ++offset)
    {
        kmer_list[offset] = get_kmers(start_site_AA, offset, aa_kmer);
    }

    // every colour whose bit is set gets recorded against every k-mer
    for (int colour = 0; colour < nb_colours; colour++)
    {
        if (!full_unitig_colour[colour])
        {
            continue;
        }

        for (const auto& aa_word : kmer_list)
        {
            start_freq_set[aa_word].insert(colour);
        }
    }
}

template <class T, class U, bool is_const>
void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
UnitigMap<DataAccessor<T>, DataStorage<U>, is_const> um,
Expand Down Expand Up @@ -290,43 +319,10 @@ void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
// pull out start codon positions
for (const auto& pos : found_indices)
{
//if (unitig.size() - pos >= kmer)
{
std::string start_site_DNA = unitig.substr(pos, kmer);
std::string start_site_AA = translate(start_site_DNA);

// remove any starts with stop codon
// if (start_site_AA.find('*') != std::string::npos)
// {
// continue;
// }

int num_kmers = start_site_AA.size() - aa_kmer;
// ensure if small start found, can still generate sequence
if (num_kmers <= 0)
{
num_kmers = 1;
}

std::vector<std::string> AA_kmers(num_kmers);
std::string start_site_DNA = unitig.substr(pos, kmer);
std::string start_site_AA = translate(start_site_DNA);

for (int kmer_index = 0; kmer_index < num_kmers; ++kmer_index)
{
AA_kmers[kmer_index] = get_kmers(start_site_AA, kmer_index, aa_kmer);
}

// add colours to start_freq_set
for (int i = 0; i < nb_colours; i++)
{
if ((bool)full_unitig_colour[i])
{
for (const auto& entry_aa : AA_kmers)
{
start_freq_set[entry_aa].insert(i);
}
}
}
}
calc_start_freq (start_site_AA, full_unitig_colour, start_freq_set, aa_kmer, nb_colours);
}
}

Expand All @@ -347,43 +343,10 @@ void analyse_unitigs_binary (ColoredCDBG<MyUnitigMap>& ccdbg,
// pull out start codon positions
for (const auto& pos : found_indices)
{
//if (unitig.size() - pos >= kmer)
{
std::string start_site_DNA = rev_unitig.substr(pos, kmer);
std::string start_site_AA = translate(start_site_DNA);

// remove any starts with stop codon
// if (start_site_AA.find('*') != std::string::npos)
// {
// continue;
// }

int num_kmers = start_site_AA.size() - aa_kmer;
// ensure if small start found, can still generate sequence
if (num_kmers <= 0)
{
num_kmers = 1;
}

std::vector<std::string> AA_kmers(num_kmers);
std::string start_site_DNA = rev_unitig.substr(pos, kmer);
std::string start_site_AA = translate(start_site_DNA);

for (int kmer_index = 0; kmer_index < num_kmers; ++kmer_index)
{
AA_kmers[kmer_index] = get_kmers(start_site_AA, kmer_index, aa_kmer);
}

// add colours to start_freq_set
for (int i = 0; i < nb_colours; i++)
{
if ((bool)full_unitig_colour[i])
{
for (const auto& entry_aa : AA_kmers)
{
start_freq_set[entry_aa].insert(i);
}
}
}
}
calc_start_freq (start_site_AA, full_unitig_colour, start_freq_set, aa_kmer, nb_colours);
}
}
}
Expand Down Expand Up @@ -605,4 +568,4 @@ NodeColourVector index_graph(std::vector<Kmer>& head_kmer_arr,

// return node_colour vector
return node_colour_vector;
}
}
1 change: 1 addition & 0 deletions src/translation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ std::string translate (const std::string& dna_seq)
{
const std::string codon = dna_seq.substr(i, 3);
aa_seq += codonMap_[codon];
// break if stop codon present
if (codonMap_[codon] == '*') {
break;
}
Expand Down

0 comments on commit 5cb6416

Please sign in to comment.