Commit
Resolved all conflicts by accepting incoming changes
samhorsfield96 committed Oct 18, 2024
2 parents bee0340 + 5cb6416 commit 0430b35
Showing 37 changed files with 1,406 additions and 1,147 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -19,7 +19,7 @@ include_directories(${Boost_INCLUDE_DIRS})
 
 # find pybind and create python module
 find_package(pybind11 REQUIRED)
-pybind11_add_module(ggCaller_cpp src/bindings.cpp src/call_ORFs.cpp src/graph.cpp src/indexing.cpp src/match_string.cpp src/traversal.cpp src/unitigDict.cpp src/gene_overlap.cpp src/ORF_connection.cpp src/ORF_clustering.cpp src/gene_refinding.cpp src/distances.cpp src/search_DBG.cpp src/edlib/edlib.cpp src/ORF_scoring.cpp src/gene_graph.cpp)
+pybind11_add_module(ggCaller_cpp src/bindings.cpp src/call_ORFs.cpp src/graph.cpp src/indexing.cpp src/match_string.cpp src/traversal.cpp src/unitigDict.cpp src/gene_overlap.cpp src/ORF_connection.cpp src/ORF_clustering.cpp src/gene_refinding.cpp src/distances.cpp src/search_DBG.cpp src/edlib/edlib.cpp src/ORF_scoring.cpp src/gene_graph.cpp src/translation.cpp)
 
 # check for conda environment
 IF( DEFINED ENV{CONDA_PREFIX} )
4 changes: 3 additions & 1 deletion docs/tutorial.rst
@@ -98,13 +98,15 @@ You will find the following files in the output directory ``ggc_Bentley_et_al_CP
 - ``aligned_gene_sequences``: directory of alignment files for each cluster in FASTA format
 - ``GFF``: directory of GFF files for each sample in GFF3 format
 - ``ggc_data``: intermediate datastructures written to disk, required for querying.
+- ``ORF_dir``: intermediate datastructures written to disk, containing gene predictions.
+- ``Path_dir``: intermediate datastructures written to disk, containing genome paths through the DBG.
 
 Querying the graph
 ------------------
 
 We can now query the graph. To do so, run::
 
-    ggcaller --query CPS_queries.fasta --graph Bentley_et_al_2006_CPS_sequences/input.gfa --colours Bentley_et_al_2006_CPS_sequences/input.color.bfg --data ggc_Bentley_et_al_CPS/ggc_data --out ggc_Bentley_et_al_CPS --threads 4
+    ggcaller --query CPS_queries.fasta --graph Bentley_et_al_2006_CPS_sequences/input.gfa --colours Bentley_et_al_2006_CPS_sequences/input.color.bfg --prev-run ggc_Bentley_et_al_CPS --out ggc_Bentley_et_al_CPS --threads 4
 
 Results will be saved in ``ggc_Bentley_et_al_CPS/matched_queries.fasta``.
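
For downstream processing, the header layout that ``search_graph`` writes (see ggCaller/graph_traversal.py below) can be parsed back out of ``matched_queries.fasta``. A minimal sketch with Biopython; the function name ``parse_matched_queries`` is illustrative and not part of ggCaller::

    from Bio import SeqIO

    def parse_matched_queries(fasta_path):
        """Yield (fasta_ID, ggcID, queries, annotation, sequence) per record."""
        for record in SeqIO.parse(fasta_path, "fasta"):
            # the annotation field is written last and may contain spaces,
            # so split on the literal field markers rather than whitespace
            head, _, annotation = record.description.partition(" annotation=")
            head, _, query_str = head.partition(" QUERY=")
            fasta_ID, _, ggc_ID = head.partition(" ggcID=")
            yield fasta_ID, ggc_ID, query_str.split(";"), annotation, str(record.seq)
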
3 changes: 1 addition & 2 deletions environment_linux.yml
@@ -2,7 +2,6 @@ name: ggc_env
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - python>=3.9
   - cxx-compiler
@@ -43,4 +42,4 @@ dependencies:
   - joblib
   - gffutils
   - cd-hit
-  - python-wget
\ No newline at end of file
+  - python-wget
1 change: 0 additions & 1 deletion environment_macOS.yml
@@ -2,7 +2,6 @@ name: ggc_env
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - python>=3.9
   - cxx-compiler
2 changes: 1 addition & 1 deletion ggCaller/__init__.py
@@ -2,4 +2,4 @@
 
 '''ggCaller: a gene caller for Bifrost graphs'''
 
-__version__ = '1.3.6'
+__version__ = '1.4.0'
69 changes: 45 additions & 24 deletions ggCaller/__main__.py
@@ -59,9 +59,9 @@ def get_options():
                     default=False,
                     help='Save graph objects for sequence querying. '
                          '[Default = False] ')
-    IO.add_argument('--data',
+    IO.add_argument('--prev-run',
                     default=None,
-                    help='Directory containing data from previous ggCaller run generated via "--save" ')
+                    help='Directory containing data from previous ggCaller run. Must have been run with "--save" ')
     IO.add_argument(
         "--all-seq-in-graph",
         dest="all_seq_in_graph",
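
A detail worth noting for the argument rename above: argparse converts dashes in long option names to underscores on the parsed namespace, so the flag is spelled ``--prev-run`` on the command line but read back as ``options.prev_run`` in ``main()`` below. A minimal, self-contained illustration::

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--prev-run', default=None,
                        help='Directory containing data from previous ggCaller run')
    # dashes become underscores on the namespace attribute
    options = parser.parse_args(['--prev-run', 'ggc_Bentley_et_al_CPS'])
    print(options.prev_run)  # -> ggc_Bentley_et_al_CPS
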
@@ -396,17 +396,25 @@ def main():
     # make sure trailing forward slash is present
     output_dir = os.path.join(options.out, "")
 
+    # create directory for reference path files
+    Path_dir = os.path.join(output_dir, "Path_dir")
+    if not os.path.exists(Path_dir):
+        os.mkdir(Path_dir)
+
+    # ensure trailing slash present
+    Path_dir = os.path.join(Path_dir, "")
+
     # if build graph specified, build graph and then call ORFs
     if (options.graph is not None) and (options.colours is not None) and (options.query is None):
         graph_tuple = graph.read(options.graph, options.colours, stop_codons_for, stop_codons_rev,
-                                 start_codons_for, start_codons_rev, options.threads, is_ref, ref_set)
+                                 start_codons_for, start_codons_rev, options.threads, is_ref, ref_set, Path_dir)
     # query unitigs in previous saved ggc graph
     elif (options.graph is not None) and (options.colours is not None) and (options.refs is None) and \
             (options.query is not None):
-        if options.data is None:
-            print("Please specify a ggc_data directory from a previous ggCaller run.")
+        if options.prev_run is None:
+            print("Please specify a directory from a previous ggCaller run.")
             sys.exit(1)
-        search_graph(graph, options.graph, options.colours, options.query, options.data, output_dir, options.query_id,
+        search_graph(graph, options.graph, options.colours, options.query, options.prev_run, output_dir, options.query_id,
                      options.threads)
         print("Finished.")
         sys.exit(0)
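
The ``os.path.join(path, "")`` idiom used for ``Path_dir`` above (and ``ORF_dir`` below) guarantees a trailing separator, so later plain string concatenation such as ``Path_dir + "paths.dat"`` resolves inside the directory. A small sketch; the names are illustrative::

    import os

    path_dir = os.path.join("ggc_out", "Path_dir")  # 'ggc_out/Path_dir' on POSIX
    path_dir = os.path.join(path_dir, "")           # 'ggc_out/Path_dir/'
    assert path_dir + "paths.dat" == os.path.join("ggc_out", "Path_dir", "paths.dat")
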
@@ -415,17 +423,17 @@ def main():
             options.reads is None) and (
             options.query is None):
         graph_tuple = graph.build(options.refs, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, True, options.no_write_graph, "NA", ref_set)
+                                  start_codons_rev, options.threads, True, options.no_write_graph, "NA", ref_set, Path_dir)
     # if reads file specified for building
     elif (options.graph is None) and (options.colours is None) and (options.refs is None) and (
             options.reads is not None) and (options.query is None):
         graph_tuple = graph.build(options.reads, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, False, options.no_write_graph, "NA", ref_set)
+                                  start_codons_rev, options.threads, False, options.no_write_graph, "NA", ref_set, Path_dir)
     # if both reads and refs file specified for building
     elif (options.graph is None) and (options.colours is None) and (options.refs is not None) and (
             options.reads is not None) and (options.query is None):
         graph_tuple = graph.build(options.refs, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, False, options.no_write_graph, options.reads, ref_set)
+                                  start_codons_rev, options.threads, False, options.no_write_graph, options.reads, ref_set, Path_dir)
     elif options.balrog_db is not None:
         db_dir = download_db(options.balrog_db)
         sys.exit(0)
@@ -493,6 +501,17 @@ def main():
     # Create temp_file for cluster_map
     cluster_file = os.path.join(temp_dir, "cluster_map.dat")
 
+    # create directory for ORF map
+    ORFMap_dir = os.path.join(output_dir, "ORF_dir")
+    if not os.path.exists(ORFMap_dir):
+        os.mkdir(ORFMap_dir)
+
+    # ensure trailing slash present
+    ORFMap_dir = os.path.join(ORFMap_dir, "")
+
+    # save the kmer_array to ORFMap_dir
+    graph.data_out(ORFMap_dir + "kmer_array.dat")
+
     # load gene models if required
     if not options.no_filter:
         print("Loading gene models...")
@@ -501,15 +520,19 @@ def main():
     else:
         ORF_model_file, TIS_model_file = "NA", "NA"
 
-    gene_tuple = graph.findGenes(options.repeat, overlap, options.max_path_length,
+    file_tuple = graph.findGenes(options.repeat, overlap, options.max_path_length,
                                  options.no_filter, stop_codons_for, start_codons_for, options.min_orf_length,
                                  options.max_ORF_overlap, input_colours, ORF_model_file,
                                  TIS_model_file, options.min_orf_score, options.min_path_score,
                                  options.max_orf_orf_distance, not options.no_clustering,
                                  options.identity_cutoff, options.len_diff_cutoff, options.threads, cluster_file,
-                                 options.score_tolerance)
+                                 options.score_tolerance, ORFMap_dir, Path_dir)
 
-    high_scoring_ORFs, high_scoring_ORF_edges = gene_tuple
+    ORF_file_paths, Edge_file_paths = file_tuple
 
+    # save the ORF file paths
+    with open(ORFMap_dir + "ORF_file_paths.dat", "wb") as o:
+        cPickle.dump(ORF_file_paths, o)
+
     # generate ORF clusters
     if not options.no_clustering:
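
The switch from ``high_scoring_ORFs`` to ``ORF_file_paths`` replaces a single in-memory map of every gene prediction with per-colour files that are reloaded on demand, so only one genome's ORFs need to be held in memory at a time. A sketch of the reload pattern, assuming a ggCaller environment where the ``ggCaller_cpp`` extension is importable and that ``read_ORF_file`` returns a dict-like map, as its use in this diff suggests::

    import _pickle as cPickle
    import ggCaller_cpp

    with open("ORF_dir/ORF_file_paths.dat", "rb") as f:
        ORF_file_paths = cPickle.load(f)  # {colour: path to that colour's ORF map}

    for colour, path in ORF_file_paths.items():
        # load a single colour's ORF map (ORF_ID -> ORF_info) on demand
        ORF_map = ggCaller_cpp.read_ORF_file(path)
        print(colour, len(ORF_map))
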
@@ -518,7 +541,7 @@ def main():
             total_arr = np.array([graph])
             array_shd, array_shd_tup = generate_shared_mem_array(total_arr, smm)
             with Pool(processes=options.threads) as pool:
-                run_panaroo(pool, array_shd_tup, high_scoring_ORFs, high_scoring_ORF_edges,
+                run_panaroo(pool, array_shd_tup, ORF_file_paths, Edge_file_paths,
                             cluster_file, overlap, input_colours, output_dir, temp_dir, options.verbose,
                             options.threads, options.length_outlier_support_proportion, options.identity_cutoff,
                             options.family_threshold, options.min_trailing_support, options.trailing_recursive,
@@ -527,10 +550,10 @@ def main():
                             options.no_write_idx, overlap + 1, options.repeat,
                             options.search_radius, options.refind_prop_match, options.annotate, options.evalue,
                             annotation_db, hmm_db, options.call_variants, options.ignore_pseduogenes,
-                            options.truncation_threshold, options.save, options.refind)
+                            options.truncation_threshold, options.save, options.refind, Path_dir)
 
     else:
-        print_ORF_calls(high_scoring_ORFs, os.path.join(output_dir, "gene_calls"),
+        print_ORF_calls(ORF_file_paths, os.path.join(output_dir, "gene_calls"),
                         input_colours, overlap, graph)
 
     if options.save:
@@ -542,18 +565,16 @@ def main():
         # make sure trailing forward slash is present
         objects_dir = os.path.join(objects_dir, "")
 
-        # serialise graph object and high scoring ORFs to future reading
-        graph[0].data_out(objects_dir + "ggc_graph.dat")
-        with open(objects_dir + "high_scoring_orfs.dat", "wb") as o:
-            cPickle.dump(high_scoring_ORFs, o)
-
         # create index of all high_scoring_ORFs node_IDs
         node_index = defaultdict(list)
-        for colour, gene_dict in high_scoring_ORFs.items():
-            for ORF_ID, ORF_info in gene_dict.items():
-                entry_ID = str(colour) + "_" + str(ORF_ID)
+        for colour_ID, file_path in ORF_file_paths.items():
+            ORF_map = ggCaller_cpp.read_ORF_file(file_path)
+
+            for ORF_ID, ORF_info in ORF_map.items():
+                delim = "_0_" if ORF_ID > 0 else "_refound_"
+                entry_ID = str(colour_ID) + delim + str(ORF_ID)
                 for node in ORF_info[0]:
-                    node_index[node].append(entry_ID)
+                    node_index[abs(node)].append(entry_ID)
 
         with open(objects_dir + "node_index.dat", "wb") as o:
             cPickle.dump(node_index, o)
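
The node index built above maps each absolute DBG node ID to entry IDs of the form ``<colour>_0_<ORF_ID>``, or ``<colour>_refound_<ORF_ID>`` for refound genes (marked by non-positive IDs); ``abs(node)`` folds both strands onto one key, and ``split("_")`` recovers the colour (first field) and ORF ID (last field) at query time. A toy round trip; the ORF data is invented for illustration, whereas in ggCaller it comes from ``ggCaller_cpp.read_ORF_file``::

    from collections import defaultdict

    # colour -> {ORF_ID: node path}; negative node IDs denote reverse strand
    toy_orfs = {0: {1: [3, -7]}, 1: {-2: [7]}}  # ORF_ID <= 0 marks a refound gene

    node_index = defaultdict(list)
    for colour, orf_map in toy_orfs.items():
        for orf_id, nodes in orf_map.items():
            delim = "_0_" if orf_id > 0 else "_refound_"
            entry_id = str(colour) + delim + str(orf_id)
            for node in nodes:
                node_index[abs(node)].append(entry_id)  # fold strand with abs()

    for entry in node_index[7]:
        fields = entry.split("_")
        print(int(fields[0]), int(fields[-1]))  # -> 0 1, then 1 -2
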
96 changes: 65 additions & 31 deletions ggCaller/graph_traversal.py
@@ -1,15 +1,20 @@
 import os, sys
 import _pickle as cPickle
 from Bio import SeqIO
+import networkx as nx
+from collections import defaultdict
 import ggCaller_cpp
 
-# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
 def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
     # check if objects_dir present, if not exit
-    if not os.path.exists(objects_dir):
-        print("Please specify a ggc_data directory")
-        sys.exit(1)
+
+    ggc_data_dir = os.path.join(objects_dir, "ggc_data")
+    objects_dir = os.path.join(objects_dir, "")
+
+    if not os.path.exists(ggc_data_dir):
+        print("Please specify a directory from a ggCaller run with '--save'")
+        sys.exit(1)
 
     # read in and parse sequences in queryfile
     id_vec = []
     query_vec = []
@@ -32,7 +37,7 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
             query_vec.append(line.strip())
 
     # read in graph object and high_scoring ORFs and node_index
-    graph.data_in(objects_dir + "ggc_graph.dat", graphfile, coloursfile, num_threads)
+    graph.data_in(os.path.join(objects_dir, "ORF_dir", "kmer_array.dat"), graphfile, coloursfile, num_threads)
 
     # query the sequences in the graph
     print("Querying unitigs in graph...")
@@ -43,39 +48,68 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
         os.path.splitext(os.path.basename(x))[0] for x in input_colours
     ]
 
-    with open(objects_dir + "high_scoring_orfs.dat", "rb") as input_file:
-        high_scoring_ORFs = cPickle.load(input_file)
-
-    with open(objects_dir + "node_index.dat", "rb") as input_file:
+    # load in gene to DBG node mappings
+    with open(os.path.join(ggc_data_dir, "node_index.dat"), "rb") as input_file:
         node_index = cPickle.load(input_file)
 
-    outfile = output_dir + "matched_queries.fasta"
-    print("Matching overlapping ORFs...")
+    # load in paths to ORF data
+    with open(os.path.join(objects_dir, "ORF_dir", "ORF_file_paths.dat"), "rb") as input_file:
+        ORF_file_paths = cPickle.load(input_file)
+
+    # try to load panaroo graph, not present if panaroo not run
+    G = None
+    ids_len_stop = None
+    node_to_node_map = {}
+    try:
+        G = nx.read_gml(os.path.join(objects_dir, "final_graph.gml"))
+        # remap mislabelled nodes
+        for node in G.nodes():
+            node_to_node_map[G.nodes[node]['CID']] = node
+
+        # load in gene to panaroo node mappings
+        with open(os.path.join(ggc_data_dir, "ORF_to_node_map.dat"), "rb") as input_file:
+            ids_len_stop = cPickle.load(input_file)
+    except:
+        pass
 
-    ORF_dict = {}
+    outfile = os.path.join(output_dir, "matched_queries.fasta")
+    print("Matching overlapping ORFs...")
+    ORF_dict = defaultdict(lambda: defaultdict(list))
     for i in range(len(query_nodes)):
         for node in query_nodes[i]:
-            for ORF_ID in node_index[node]:
-                if ORF_ID not in ORF_dict:
-                    ORF_dict[ORF_ID] = []
-                ORF_dict[ORF_ID].append(i)
+            for ORF in node_index[node]:
+                split_ID = ORF.split("_")
+                colour = int(split_ID[0])
+                ORF_ID = int(split_ID[-1])
+                ORF_dict[colour][ORF_ID].append(i)
 
     with open(outfile, "w") as f:
-        for ORF, aligned in ORF_dict.items():
-            split_ID = ORF.split("_")
-            colour = int(split_ID[0])
-            ORF_ID = int(split_ID[-1])
+        for colour, ORF_element in ORF_dict.items():
+            # load ORF info
+            ORF_map = ggCaller_cpp.read_ORF_file(ORF_file_paths[colour])
             fasta_ID = isolate_names[colour] + "_" + str(ORF_ID)
-            ORF_info = high_scoring_ORFs[colour][ORF_ID]
-            seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
-            if fasta_file:
-                id_set = set([id_vec[i] for i in aligned])
-                queries = ";".join([i for i in id_set])
-            else:
-                queries = ";".join([query_vec[i] for i in aligned])
-
-            # add annotation
-            f.write(
-                ">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + ORF_info[-1] + "\n" + seq + "\n")
+
+            # iterate over each ORF that has match
+            for ORF_ID, aligned in ORF_element.items():
+                ORF_info = ORF_map[ORF_ID]
+                seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
+                if fasta_file:
+                    id_set = set([id_vec[i] for i in aligned])
+                    queries = ";".join([i for i in id_set])
+                else:
+                    queries = ";".join([query_vec[i] for i in aligned])
+
+                # get annotation for node
+                annotation = "NA"
+                if G is not None:
+                    delim = "_0_" if ORF_ID >= 0 else "_refound_"
+                    pan_ORF_id = str(colour) + delim + str(ORF_ID)
+                    # if node not in G then has been removed so pass
+                    node = ids_len_stop[pan_ORF_id][2]
+                    annotation = G.nodes[node_to_node_map[node]]["description"]
+
+                # add annotation
+                f.write(
+                    ">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + annotation + "\n" + seq + "\n")
 
     return
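
``search_graph`` now pulls per-gene annotation from the Panaroo graph: each node of ``final_graph.gml`` is located via its ``CID`` attribute, and ``ids_len_stop`` maps a pan-genome gene ID to its node (index 2 of the stored tuple; the other fields are assumptions from the variable name). A toy stand-in built directly with networkx rather than read from disk; all attribute values are invented::

    import networkx as nx

    G = nx.Graph()
    G.add_node("0", CID="group_1", description="hypothetical protein")
    G.add_node("1", CID="group_2", description="capsular polysaccharide synthesis")

    # GML node labels may not match cluster IDs, so build CID -> label once
    node_to_node_map = {G.nodes[n]["CID"]: n for n in G.nodes()}

    # ids_len_stop-style lookup: pan-genome gene ID -> (..., ..., node CID)
    ids_len_stop = {"0_0_1": (303, True, "group_2")}

    node = ids_len_stop["0_0_1"][2]
    print(G.nodes[node_to_node_map[node]]["description"])
    # -> capsular polysaccharide synthesis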