Commit
Resolved all conflicts by accepting incoming changes
samhorsfield96 committed Oct 18, 2024
2 parents bee0340 + 5cb6416 commit 0430b35
Showing 37 changed files with 1,406 additions and 1,147 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -19,7 +19,7 @@ include_directories(${Boost_INCLUDE_DIRS})
 
 # find pybind and create python module
 find_package(pybind11 REQUIRED)
-pybind11_add_module(ggCaller_cpp src/bindings.cpp src/call_ORFs.cpp src/graph.cpp src/indexing.cpp src/match_string.cpp src/traversal.cpp src/unitigDict.cpp src/gene_overlap.cpp src/ORF_connection.cpp src/ORF_clustering.cpp src/gene_refinding.cpp src/distances.cpp src/search_DBG.cpp src/edlib/edlib.cpp src/ORF_scoring.cpp src/gene_graph.cpp)
+pybind11_add_module(ggCaller_cpp src/bindings.cpp src/call_ORFs.cpp src/graph.cpp src/indexing.cpp src/match_string.cpp src/traversal.cpp src/unitigDict.cpp src/gene_overlap.cpp src/ORF_connection.cpp src/ORF_clustering.cpp src/gene_refinding.cpp src/distances.cpp src/search_DBG.cpp src/edlib/edlib.cpp src/ORF_scoring.cpp src/gene_graph.cpp src/translation.cpp)
 
 # check for conda environment
 IF( DEFINED ENV{CONDA_PREFIX} )
4 changes: 3 additions & 1 deletion docs/tutorial.rst
@@ -98,13 +98,15 @@ You will find the following files in the output directory ``ggc_Bentley_et_al_CP
 - ``aligned_gene_sequences``: directory of alignment files for each cluster in FASTA format
 - ``GFF``: directory of GFF files for each sample in GFF3 format
 - ``ggc_data``: intermediate datastructures written to disk, required for querying.
+- ``ORF_dir``: intermediate datastructures written to disk, containing gene predictions.
+- ``Path_dir``: intermediate datastructures written to disk, containing genome paths through the DBG.
 
 Querying the graph
 ------------------
 
 We can now query the graph. To do so, run::
 
-    ggcaller --query CPS_queries.fasta --graph Bentley_et_al_2006_CPS_sequences/input.gfa --colours Bentley_et_al_2006_CPS_sequences/input.color.bfg --data ggc_Bentley_et_al_CPS/ggc_data --out ggc_Bentley_et_al_CPS --threads 4
+    ggcaller --query CPS_queries.fasta --graph Bentley_et_al_2006_CPS_sequences/input.gfa --colours Bentley_et_al_2006_CPS_sequences/input.color.bfg --prev-run ggc_Bentley_et_al_CPS --out ggc_Bentley_et_al_CPS --threads 4
 
 Results will be saved in ``ggc_Bentley_et_al_CPS/matched_queries.fasta``.
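
For downstream processing, the header layout that ``search_graph`` writes (see ggCaller/graph_traversal.py below) can be parsed back out of ``matched_queries.fasta``. A minimal sketch with Biopython; the function name ``parse_matched_queries`` is illustrative and not part of ggCaller::

    from Bio import SeqIO

    def parse_matched_queries(fasta_path):
        """Yield (fasta_ID, ggcID, queries, annotation, sequence) per record."""
        for record in SeqIO.parse(fasta_path, "fasta"):
            # the annotation field is written last and may contain spaces,
            # so split on the literal field markers rather than whitespace
            head, _, annotation = record.description.partition(" annotation=")
            head, _, query_str = head.partition(" QUERY=")
            fasta_ID, _, ggc_ID = head.partition(" ggcID=")
            yield fasta_ID, ggc_ID, query_str.split(";"), annotation, str(record.seq)
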
3 changes: 1 addition & 2 deletions environment_linux.yml
@@ -2,7 +2,6 @@ name: ggc_env
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - python>=3.9
   - cxx-compiler
@@ -43,4 +42,4 @@ dependencies:
   - joblib
   - gffutils
   - cd-hit
-  - python-wget
\ No newline at end of file
+  - python-wget
1 change: 0 additions & 1 deletion environment_macOS.yml
@@ -2,7 +2,6 @@ name: ggc_env
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - python>=3.9
   - cxx-compiler
2 changes: 1 addition & 1 deletion ggCaller/__init__.py
@@ -2,4 +2,4 @@
 
 '''ggCaller: a gene caller for Bifrost graphs'''
 
-__version__ = '1.3.6'
+__version__ = '1.4.0'
69 changes: 45 additions & 24 deletions ggCaller/__main__.py
@@ -59,9 +59,9 @@ def get_options():
                     default=False,
                     help='Save graph objects for sequence querying. '
                          '[Default = False] ')
-    IO.add_argument('--data',
+    IO.add_argument('--prev-run',
                     default=None,
-                    help='Directory containing data from previous ggCaller run generated via "--save" ')
+                    help='Directory containing data from previous ggCaller run. Must have been run with "--save" ')
     IO.add_argument(
         "--all-seq-in-graph",
         dest="all_seq_in_graph",
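
A detail worth noting for the argument rename above: argparse converts dashes in long option names to underscores on the parsed namespace, so the flag is spelled ``--prev-run`` on the command line but read back as ``options.prev_run`` in ``main()`` below. A minimal, self-contained illustration::

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--prev-run', default=None,
                        help='Directory containing data from previous ggCaller run')
    # dashes become underscores on the namespace attribute
    options = parser.parse_args(['--prev-run', 'ggc_Bentley_et_al_CPS'])
    print(options.prev_run)  # -> ggc_Bentley_et_al_CPS
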
@@ -396,17 +396,25 @@ def main():
     # make sure trailing forward slash is present
     output_dir = os.path.join(options.out, "")
 
+    # create directory for reference path files
+    Path_dir = os.path.join(output_dir, "Path_dir")
+    if not os.path.exists(Path_dir):
+        os.mkdir(Path_dir)
+
+    # ensure trailing slash present
+    Path_dir = os.path.join(Path_dir, "")
+
     # if build graph specified, build graph and then call ORFs
     if (options.graph is not None) and (options.colours is not None) and (options.query is None):
         graph_tuple = graph.read(options.graph, options.colours, stop_codons_for, stop_codons_rev,
-                                 start_codons_for, start_codons_rev, options.threads, is_ref, ref_set)
+                                 start_codons_for, start_codons_rev, options.threads, is_ref, ref_set, Path_dir)
     # query unitigs in previous saved ggc graph
     elif (options.graph is not None) and (options.colours is not None) and (options.refs is None) and \
             (options.query is not None):
-        if options.data is None:
-            print("Please specify a ggc_data directory from a previous ggCaller run.")
+        if options.prev_run is None:
+            print("Please specify a directory from a previous ggCaller run.")
             sys.exit(1)
-        search_graph(graph, options.graph, options.colours, options.query, options.data, output_dir, options.query_id,
+        search_graph(graph, options.graph, options.colours, options.query, options.prev_run, output_dir, options.query_id,
                      options.threads)
         print("Finished.")
         sys.exit(0)
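
The ``os.path.join(path, "")`` idiom used for ``Path_dir`` above (and ``ORF_dir`` below) guarantees a trailing separator, so later plain string concatenation such as ``Path_dir + "paths.dat"`` resolves inside the directory. A small sketch; the names are illustrative::

    import os

    path_dir = os.path.join("ggc_out", "Path_dir")  # 'ggc_out/Path_dir' on POSIX
    path_dir = os.path.join(path_dir, "")           # 'ggc_out/Path_dir/'
    assert path_dir + "paths.dat" == os.path.join("ggc_out", "Path_dir", "paths.dat")
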
@@ -415,17 +423,17 @@ def main():
             options.reads is None) and (
             options.query is None):
         graph_tuple = graph.build(options.refs, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, True, options.no_write_graph, "NA", ref_set)
+                                  start_codons_rev, options.threads, True, options.no_write_graph, "NA", ref_set, Path_dir)
     # if reads file specified for building
     elif (options.graph is None) and (options.colours is None) and (options.refs is None) and (
             options.reads is not None) and (options.query is None):
         graph_tuple = graph.build(options.reads, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, False, options.no_write_graph, "NA", ref_set)
+                                  start_codons_rev, options.threads, False, options.no_write_graph, "NA", ref_set, Path_dir)
     # if both reads and refs file specified for building
     elif (options.graph is None) and (options.colours is None) and (options.refs is not None) and (
             options.reads is not None) and (options.query is None):
         graph_tuple = graph.build(options.refs, options.kmer, stop_codons_for, stop_codons_rev, start_codons_for,
-                                  start_codons_rev, options.threads, False, options.no_write_graph, options.reads, ref_set)
+                                  start_codons_rev, options.threads, False, options.no_write_graph, options.reads, ref_set, Path_dir)
     elif options.balrog_db is not None:
         db_dir = download_db(options.balrog_db)
         sys.exit(0)
@@ -493,6 +501,17 @@ def main():
     # Create temp_file for cluster_map
     cluster_file = os.path.join(temp_dir, "cluster_map.dat")
 
+    # create directory for ORF map
+    ORFMap_dir = os.path.join(output_dir, "ORF_dir")
+    if not os.path.exists(ORFMap_dir):
+        os.mkdir(ORFMap_dir)
+
+    # ensure trailing slash present
+    ORFMap_dir = os.path.join(ORFMap_dir, "")
+
+    # save the kmer_array to ORFMap_dir
+    graph.data_out(ORFMap_dir + "kmer_array.dat")
+
     # load gene models if required
     if not options.no_filter:
         print("Loading gene models...")
@@ -501,15 +520,19 @@ def main():
     else:
         ORF_model_file, TIS_model_file = "NA", "NA"
 
-    gene_tuple = graph.findGenes(options.repeat, overlap, options.max_path_length,
+    file_tuple = graph.findGenes(options.repeat, overlap, options.max_path_length,
                                  options.no_filter, stop_codons_for, start_codons_for, options.min_orf_length,
                                  options.max_ORF_overlap, input_colours, ORF_model_file,
                                  TIS_model_file, options.min_orf_score, options.min_path_score,
                                  options.max_orf_orf_distance, not options.no_clustering,
                                  options.identity_cutoff, options.len_diff_cutoff, options.threads, cluster_file,
-                                 options.score_tolerance)
+                                 options.score_tolerance, ORFMap_dir, Path_dir)
 
-    high_scoring_ORFs, high_scoring_ORF_edges = gene_tuple
+    ORF_file_paths, Edge_file_paths = file_tuple
 
+    # save the ORF file paths
+    with open(ORFMap_dir + "ORF_file_paths.dat", "wb") as o:
+        cPickle.dump(ORF_file_paths, o)
+
     # generate ORF clusters
     if not options.no_clustering:
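
The switch from ``high_scoring_ORFs`` to ``ORF_file_paths`` replaces a single in-memory map of every gene prediction with per-colour files that are reloaded on demand, so only one genome's ORFs need to be held in memory at a time. A sketch of the reload pattern, assuming a ggCaller environment where the ``ggCaller_cpp`` extension is importable and that ``read_ORF_file`` returns a dict-like map, as its use in this diff suggests::

    import _pickle as cPickle
    import ggCaller_cpp

    with open("ORF_dir/ORF_file_paths.dat", "rb") as f:
        ORF_file_paths = cPickle.load(f)  # {colour: path to that colour's ORF map}

    for colour, path in ORF_file_paths.items():
        # load a single colour's ORF map (ORF_ID -> ORF_info) on demand
        ORF_map = ggCaller_cpp.read_ORF_file(path)
        print(colour, len(ORF_map))
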
@@ -518,7 +541,7 @@ def main():
             total_arr = np.array([graph])
             array_shd, array_shd_tup = generate_shared_mem_array(total_arr, smm)
             with Pool(processes=options.threads) as pool:
-                run_panaroo(pool, array_shd_tup, high_scoring_ORFs, high_scoring_ORF_edges,
+                run_panaroo(pool, array_shd_tup, ORF_file_paths, Edge_file_paths,
                             cluster_file, overlap, input_colours, output_dir, temp_dir, options.verbose,
                             options.threads, options.length_outlier_support_proportion, options.identity_cutoff,
                             options.family_threshold, options.min_trailing_support, options.trailing_recursive,
@@ -527,10 +550,10 @@ def main():
                             options.no_write_idx, overlap + 1, options.repeat,
                             options.search_radius, options.refind_prop_match, options.annotate, options.evalue,
                             annotation_db, hmm_db, options.call_variants, options.ignore_pseduogenes,
-                            options.truncation_threshold, options.save, options.refind)
+                            options.truncation_threshold, options.save, options.refind, Path_dir)
 
     else:
-        print_ORF_calls(high_scoring_ORFs, os.path.join(output_dir, "gene_calls"),
+        print_ORF_calls(ORF_file_paths, os.path.join(output_dir, "gene_calls"),
                         input_colours, overlap, graph)
 
     if options.save:
@@ -542,18 +565,16 @@ def main():
         # make sure trailing forward slash is present
         objects_dir = os.path.join(objects_dir, "")
 
-        # serialise graph object and high scoring ORFs to future reading
-        graph[0].data_out(objects_dir + "ggc_graph.dat")
-        with open(objects_dir + "high_scoring_orfs.dat", "wb") as o:
-            cPickle.dump(high_scoring_ORFs, o)
-
         # create index of all high_scoring_ORFs node_IDs
         node_index = defaultdict(list)
-        for colour, gene_dict in high_scoring_ORFs.items():
-            for ORF_ID, ORF_info in gene_dict.items():
-                entry_ID = str(colour) + "_" + str(ORF_ID)
+        for colour_ID, file_path in ORF_file_paths.items():
+            ORF_map = ggCaller_cpp.read_ORF_file(file_path)
+
+            for ORF_ID, ORF_info in ORF_map.items():
+                delim = "_0_" if ORF_ID > 0 else "_refound_"
+                entry_ID = str(colour_ID) + delim + str(ORF_ID)
                 for node in ORF_info[0]:
-                    node_index[node].append(entry_ID)
+                    node_index[abs(node)].append(entry_ID)
 
         with open(objects_dir + "node_index.dat", "wb") as o:
             cPickle.dump(node_index, o)
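
The node index built above maps each absolute DBG node ID to entry IDs of the form ``<colour>_0_<ORF_ID>``, or ``<colour>_refound_<ORF_ID>`` for refound genes (marked by non-positive IDs); ``abs(node)`` folds both strands onto one key, and ``split("_")`` recovers the colour (first field) and ORF ID (last field) at query time. A toy round trip; the ORF data is invented for illustration, whereas in ggCaller it comes from ``ggCaller_cpp.read_ORF_file``::

    from collections import defaultdict

    # colour -> {ORF_ID: node path}; negative node IDs denote reverse strand
    toy_orfs = {0: {1: [3, -7]}, 1: {-2: [7]}}  # ORF_ID <= 0 marks a refound gene

    node_index = defaultdict(list)
    for colour, orf_map in toy_orfs.items():
        for orf_id, nodes in orf_map.items():
            delim = "_0_" if orf_id > 0 else "_refound_"
            entry_id = str(colour) + delim + str(orf_id)
            for node in nodes:
                node_index[abs(node)].append(entry_id)  # fold strand with abs()

    for entry in node_index[7]:
        fields = entry.split("_")
        print(int(fields[0]), int(fields[-1]))  # -> 0 1, then 1 -2
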
96 changes: 65 additions & 31 deletions ggCaller/graph_traversal.py
@@ -1,15 +1,20 @@
 import os, sys
 import _pickle as cPickle
 from Bio import SeqIO
+import networkx as nx
+from collections import defaultdict
 import ggCaller_cpp
 
-# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
 def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
     # check if objects_dir present, if not exit
-    if not os.path.exists(objects_dir):
-        print("Please specify a ggc_data directory")
-        sys.exit(1)
+
+    ggc_data_dir = os.path.join(objects_dir, "ggc_data")
+    objects_dir = os.path.join(objects_dir, "")
+
+    if not os.path.exists(ggc_data_dir):
+        print("Please specify a directory from a ggCaller run with '--save'")
+        sys.exit(1)
 
     # read in and parse sequences in queryfile
     id_vec = []
     query_vec = []
@@ -32,7 +37,7 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
             query_vec.append(line.strip())
 
     # read in graph object and high_scoring ORFs and node_index
-    graph.data_in(objects_dir + "ggc_graph.dat", graphfile, coloursfile, num_threads)
+    graph.data_in(os.path.join(objects_dir, "ORF_dir", "kmer_array.dat"), graphfile, coloursfile, num_threads)
 
     # query the sequences in the graph
     print("Querying unitigs in graph...")
@@ -43,39 +48,68 @@ def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_d
         os.path.splitext(os.path.basename(x))[0] for x in input_colours
     ]
 
-    with open(objects_dir + "high_scoring_orfs.dat", "rb") as input_file:
-        high_scoring_ORFs = cPickle.load(input_file)
-
-    with open(objects_dir + "node_index.dat", "rb") as input_file:
+    # load in gene to DBG node mappings
+    with open(os.path.join(ggc_data_dir, "node_index.dat"), "rb") as input_file:
         node_index = cPickle.load(input_file)
 
-    outfile = output_dir + "matched_queries.fasta"
-    print("Matching overlapping ORFs...")
+    # load in paths to ORF data
+    with open(os.path.join(objects_dir, "ORF_dir", "ORF_file_paths.dat"), "rb") as input_file:
+        ORF_file_paths = cPickle.load(input_file)
+
+    # try to load panaroo graph, not present if panaroo not run
+    G = None
+    ids_len_stop = None
+    node_to_node_map = {}
+    try:
+        G = nx.read_gml(os.path.join(objects_dir, "final_graph.gml"))
+        # remap mislabelled nodes
+        for node in G.nodes():
+            node_to_node_map[G.nodes[node]['CID']] = node
+
+        # load in gene to panaroo node mappings
+        with open(os.path.join(ggc_data_dir, "ORF_to_node_map.dat"), "rb") as input_file:
+            ids_len_stop = cPickle.load(input_file)
+    except:
+        pass
 
-    ORF_dict = {}
+    outfile = os.path.join(output_dir, "matched_queries.fasta")
+    print("Matching overlapping ORFs...")
+    ORF_dict = defaultdict(lambda: defaultdict(list))
     for i in range(len(query_nodes)):
         for node in query_nodes[i]:
-            for ORF_ID in node_index[node]:
-                if ORF_ID not in ORF_dict:
-                    ORF_dict[ORF_ID] = []
-                ORF_dict[ORF_ID].append(i)
+            for ORF in node_index[node]:
+                split_ID = ORF.split("_")
+                colour = int(split_ID[0])
+                ORF_ID = int(split_ID[-1])
+                ORF_dict[colour][ORF_ID].append(i)
 
     with open(outfile, "w") as f:
-        for ORF, aligned in ORF_dict.items():
-            split_ID = ORF.split("_")
-            colour = int(split_ID[0])
-            ORF_ID = int(split_ID[-1])
+        for colour, ORF_element in ORF_dict.items():
+            # load ORF info
+            ORF_map = ggCaller_cpp.read_ORF_file(ORF_file_paths[colour])
             fasta_ID = isolate_names[colour] + "_" + str(ORF_ID)
-            ORF_info = high_scoring_ORFs[colour][ORF_ID]
-            seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
-            if fasta_file:
-                id_set = set([id_vec[i] for i in aligned])
-                queries = ";".join([i for i in id_set])
-            else:
-                queries = ";".join([query_vec[i] for i in aligned])
-
-            # add annotation
-            f.write(
-                ">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + ORF_info[-1] + "\n" + seq + "\n")
+
+            # iterate over each ORF that has match
+            for ORF_ID, aligned in ORF_element.items():
+                ORF_info = ORF_map[ORF_ID]
+                seq = graph.generate_sequence(ORF_info[0], ORF_info[1], kmer - 1)
+                if fasta_file:
+                    id_set = set([id_vec[i] for i in aligned])
+                    queries = ";".join([i for i in id_set])
+                else:
+                    queries = ";".join([query_vec[i] for i in aligned])
+
+                # get annotation for node
+                annotation = "NA"
+                if G is not None:
+                    delim = "_0_" if ORF_ID >= 0 else "_refound_"
+                    pan_ORF_id = str(colour) + delim + str(ORF_ID)
+                    # if node not in G then has been removed so pass
+                    node = ids_len_stop[pan_ORF_id][2]
+                    annotation = G.nodes[node_to_node_map[node]]["description"]
+
+                # add annotation
+                f.write(
+                    ">" + fasta_ID + " ggcID=" + ORF + " QUERY=" + queries + " annotation=" + annotation + "\n" + seq + "\n")
 
     return
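
``search_graph`` now pulls per-gene annotation from the Panaroo graph: each node of ``final_graph.gml`` is located via its ``CID`` attribute, and ``ids_len_stop`` maps a pan-genome gene ID to its node (index 2 of the stored tuple; the other fields are assumptions from the variable name). A toy stand-in built directly with networkx rather than read from disk; all attribute values are invented::

    import networkx as nx

    G = nx.Graph()
    G.add_node("0", CID="group_1", description="hypothetical protein")
    G.add_node("1", CID="group_2", description="capsular polysaccharide synthesis")

    # GML node labels may not match cluster IDs, so build CID -> label once
    node_to_node_map = {G.nodes[n]["CID"]: n for n in G.nodes()}

    # ids_len_stop-style lookup: pan-genome gene ID -> (..., ..., node CID)
    ids_len_stop = {"0_0_1": (303, True, "group_2")}

    node = ids_len_stop["0_0_1"][2]
    print(G.nodes[node_to_node_map[node]]["description"])
    # -> capsular polysaccharide synthesis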