Commit 649a5dc

MRG: read taxburst JSON for sankey & treemap (#87)
* WIP: read taxburst JSON for sankey & treemap
* add json loading to sankey
* note
* normalize; some debug
* fix multiple links stuff in tax_annotate output
* refactor
* add taxburst dep
* cleanup
* add taxburst examples to test workflow
* update taxburst
* comment; cleanup
* bump version
* attempt remaining part of manual merge
* try manual merge round 2
* fix Snakefile removal
1 parent 9ca2905 commit 649a5dc

File tree

5 files changed: +161 / -37 lines

README.md

Lines changed: 12 additions & 1 deletion

@@ -11,7 +11,9 @@ categories. It also includes support for sparse comparison output
 formats produced by the fast multithreaded `manysearch` and `pairwise`
 functions in the
 [branchwater plugin for sourmash](https://github.com/sourmash-bio/sourmash_plugin_branchwater).
-Finally, it includes a sankey/alluvial flow plot to visualize metagenomic profiling from the `sourmash gather` to `sourmash tax` workflow.
+Finally, it includes a sankey/alluvial flow plot and a treemap plot to
+visualize metagenomic profiling from the `sourmash gather` to
+`sourmash tax` workflow.
 
 ## Why does this plugin exist?
 

@@ -437,6 +439,10 @@ produces:
 
 By default, we will open an interactive `html` file. To output to a file, specify the file name with `-o` and use your desired filetype extension (.html, .png, .jpg, .jpeg, .pdf, or .svg). To specify the title, use `--title`.
 
+The `sankey` command also supports ingest of
+[taxburst's JSON format](https://taxburst.github.io/taxburst/command-line/#outputting-json-format),
+which allows `sankey` to be used with SingleM and Krona formats, among
+others.
 
 ### `tree` - plot Neighbor-Joining tree
 

@@ -482,6 +488,11 @@ produces:
 
 ![treemap visualization](examples/tax-mg.treemap.png)
 
+The `treemap` command also supports ingest of
+[taxburst's JSON format](https://taxburst.github.io/taxburst/command-line/#outputting-json-format),
+which allows `treemap` to be used with SingleM and Krona formats, among
+others.
+
 ### `presence_filter` - plot presence/abundance scatterplot of genomes detected by gather
 
 It is sometimes interesting to look at the distribution of size and abundance
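Both README additions point at taxburst's JSON format. The exact schema is defined by taxburst itself, but judging from how this commit's code accesses nodes (`name`, `rank`, `count`, `children`), a minimal input for the new `--taxburst-json` flags looks roughly like the sketch below. The tree contents are invented example data, not output from a real tool:

```python
import json

# A minimal taxburst-style tree: a list of top-level nodes, each carrying
# name/rank/count and a (possibly empty) list of children.
tree = [
    {"name": "Bacteria", "rank": "superkingdom", "count": 0.8,
     "children": [
         {"name": "Bacteroidota", "rank": "phylum", "count": 0.5, "children": []},
         {"name": "Pseudomonadota", "rank": "phylum", "count": 0.3, "children": []},
     ]},
    {"name": "unclassified", "rank": None, "count": 0.2, "children": []},
]

# round-trip through JSON, as the plugin would read it from a file
text = json.dumps(tree, indent=2)
top_nodes = json.loads(text)
print([n["name"] for n in top_nodes])  # ['Bacteria', 'unclassified']
```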

examples/Snakefile

Lines changed: 27 additions & 0 deletions
@@ -19,11 +19,20 @@ rule all:
         "weighted_venn.png",
         "tax-mg.sankey.png",
         "tax-annot.sankey.png",
+        "taxburst.sankey.html",
         "disttree10sketches.matrix.png",
         "disttree10sketches.pairwise.png",
         "tax-mg.treemap.png",
+        "taxburst.treemap.png",
         "presence_filter.png",
 
+rule tax:
+    input:
+        "tax-mg.sankey.png",
+        "tax-mg.treemap.png",
+        "taxburst.treemap.png",
+        "taxburst.sankey.html",
+
 rule make_10sketches:
     input:
         expand("sketches/{n}.sig.zip", n=sketches_10)

@@ -312,6 +321,15 @@ rule treemap_mgx_summary:
         sourmash scripts treemap {input} -o {output}
     """
 
+rule taxburst_treemap:
+    input:
+        "tax/SRR11125891.t0.lineages.json",
+    output:
+        "taxburst.treemap.png",
+    shell: """
+        sourmash scripts treemap --taxburst-json {input} -o {output}
+    """
+
 rule sankey_mgx_annotate:
     input:
         "tax/test.gather.with-lineages.csv"

@@ -322,6 +340,15 @@ rule sankey_mgx_annotate:
         sourmash scripts sankey --annotate-csv {input} -o {output}
     """
 
+rule sankey_taxburst:
+    input:
+        "tax/SRR11125891.t0.lineages.json",
+    output:
+        "taxburst.sankey.html",
+    shell: """
+        sourmash scripts sankey --taxburst-json {input} -o {output}
+    """
+
 rule tree_10sketches_compare_matrix:
     input:
         cmp="10sketches.cmp",

examples/tax/SRR11125891.t0.lineages.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -3,14 +3,14 @@ name = "sourmash_plugin_betterplot"
 description = "sourmash plugin for improved plotting/viz and cluster examination."
 readme = "README.md"
 requires-python = ">=3.11"
-version = "0.5.5"
+version = "0.5.6"
 
 # note: "legacy_cgi" is currently needed for ete3, but may need to be changed on next ete release, see: https://github.com/etetoolkit/ete/issues/780
 dependencies = ["sourmash>=4.9.4,<5", "sourmash_utils>=0.2",
                 "matplotlib", "numpy", "scipy", "scikit-learn",
                 "seaborn", "upsetplot", "matplotlib_venn", "pandas",
                 "plotly", "biopython", "ete3", "kaleido", "pyqt5",
-                "legacy_cgi", "squarify==0.4.4"]
+                "legacy_cgi", "squarify==0.4.4", "taxburst>=0.3.1"]
 
 [build-system]
 requires = ["setuptools>=61.0"]

src/sourmash_plugin_betterplot.py

Lines changed: 119 additions & 34 deletions
@@ -13,6 +13,7 @@
 from collections import defaultdict, Counter
 from itertools import chain, combinations
 import pickle
+import json
 
 import numpy
 import pylab

@@ -28,6 +29,7 @@
 import pandas as pd
 import plotly.graph_objects as go
 import squarify
+import taxburst
 
 # this turns off a warning in presence_filter, but results in an error in
 # upsetplot :sweat_smile:

@@ -38,7 +40,7 @@
 
 import sourmash
 from sourmash import sourmash_args
-from sourmash.logging import debug_literal, error, notify
+from sourmash.logging import debug_literal, error, notify, print_results
 from sourmash.plugins import CommandLinePlugin
 import sourmash_utils
 from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, add_scaled_arg)

@@ -211,6 +213,21 @@ def sample_d_to_idents(sample_d):
 
     return xx
 
+def load_taxburst_json(filename, *, normalize_counts=True):
+    """
+    Load in JSON format output by taxburst.
+
+    Optionally normalize counts to fractions that sum to 1.
+    """
+    with open(filename, 'rt') as fp:
+        top_nodes = json.load(fp)
+
+    if normalize_counts:
+        taxburst.tree_utils.normalize_tree_counts(top_nodes)
+
+    return top_nodes
+
+
 #
 # CLI plugin code
 #
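The new `load_taxburst_json` helper delegates normalization to `taxburst.tree_utils.normalize_tree_counts`. A minimal dependency-free sketch of what such normalization plausibly does — divide every count by the top-level total so counts become fractions summing to 1 — assuming taxburst-style nodes with `name`/`rank`/`count`/`children` keys. The `normalize_counts` helper here is illustrative, not taxburst's actual implementation:

```python
def normalize_counts(top_nodes):
    # Divide every node's count by the total of the top-level counts,
    # so that the top-level counts sum to 1 (fractions of the sample).
    total = sum(n["count"] for n in top_nodes)

    def walk(nodes):
        for n in nodes:
            n["count"] = n["count"] / total
            walk(n.get("children", []))

    walk(top_nodes)
    return top_nodes

# invented example data in taxburst-like shape
tree = [
    {"name": "Bacteria", "rank": "superkingdom", "count": 75,
     "children": [{"name": "Pseudomonadota", "rank": "phylum",
                   "count": 75, "children": []}]},
    {"name": "unclassified", "rank": None, "count": 25, "children": []},
]
normalize_counts(tree)
print(tree[0]["count"])  # 0.75
```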
@@ -1765,6 +1782,7 @@ def save_sankey_diagram(fig, output_file):
     else:
         fig.show()  # Show the plot if no output file is specified
 
+
 def load_lingroups(map_csv):
     """Return {full_lineage_string: human_name}."""
     lin2name = {}

@@ -1774,6 +1792,7 @@ def load_lingroups(map_csv):
     notify(f"loaded {len(lin2name)} lingroup names from '{map_csv}'")
     return lin2name
 
+
 def expand_with_ancestors_sum(rows, fraction_col):
     """Expand rows with all ancestor paths, summing children if ancestor missing."""
     lineage_fracs = {row["lineage"].strip(): float(row[fraction_col])

@@ -2038,6 +2057,47 @@ def process_csv_for_sankey(input_csv, csv_type, lingroup_map=None):
 
     return nodes, links, hover_texts
 
+
+def process_taxburst_for_sankey(input_file):
+    nodes = []                  # List of unique taxonomy nodes
+    node_map = {}               # Map taxonomic label to index
+    links = []                  # List of link connections with flow values
+    hover_texts = []            # Custom hover text for percentages
+    processed_lineages = set()  # Tracks added lineage links
+
+    top_nodes = load_taxburst_json(input_file)
+    all_nodes = taxburst.tree_utils.collect_all_nodes(top_nodes)
+
+    # Process each row in the dataset
+    for n, node in enumerate(all_nodes):
+        source_label = node["name"]
+
+        if source_label not in node_map:
+            node_map[source_label] = len(nodes)
+            nodes.append(source_label)
+
+        # Iterate through children
+        for child_node in node.get("children", []):
+            percent = float(child_node["count"]) * 100
+            target_label = child_node["name"]
+
+            # Assign indices to nodes
+            if target_label not in node_map:
+                node_map[target_label] = len(nodes)
+                nodes.append(target_label)
+
+            # Create a link between source and target
+            links.append({
+                "source": node_map[source_label],
+                "target": node_map[target_label],
+                "value": percent
+            })
+            hover_texts.append(f"{source_label} → {target_label}<br>{percent:.2f}%")
+    notify(f"loaded {n+1} nodes from '{input_file}'")
+
+    return nodes, links, hover_texts
+
+
 class Command_Sankey(CommandLinePlugin):
     command = 'sankey'
     description = """\
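The new `process_taxburst_for_sankey` builds plotly-style node/link lists from the taxburst tree. The same flattening can be sketched without the taxburst dependency — `tree_to_sankey` below is a hypothetical stand-in that recurses over the tree directly instead of calling `taxburst.tree_utils.collect_all_nodes`, and omits hover-text generation:

```python
def tree_to_sankey(top_nodes):
    # Flatten parent→child edges into sankey-style node/link lists:
    # each unique label gets an index; each edge carries the child's
    # normalized count, scaled to a percentage, as the link value.
    nodes, node_map, links = [], {}, []

    def index_of(label):
        if label not in node_map:
            node_map[label] = len(nodes)
            nodes.append(label)
        return node_map[label]

    def walk(tree_nodes):
        for node in tree_nodes:
            src = index_of(node["name"])
            for child in node.get("children", []):
                links.append({"source": src,
                              "target": index_of(child["name"]),
                              "value": float(child["count"]) * 100})
            walk(node.get("children", []))

    walk(top_nodes)
    return nodes, links

# invented example data, counts already normalized to fractions
tree = [{"name": "Bacteria", "count": 1.0,
         "children": [{"name": "Bacillota", "count": 0.6, "children": []},
                      {"name": "Bacteroidota", "count": 0.4, "children": []}]}]
nodes, links = tree_to_sankey(tree)
print(nodes)  # ['Bacteria', 'Bacillota', 'Bacteroidota']
```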
@@ -2056,6 +2116,7 @@ def __init__(self, subparser):
         group = subparser.add_mutually_exclusive_group(required=True)
         group.add_argument("--summary-csv", type=str, help="Path to csv_summary generated by running 'sourmash tax metagenome' on a sourmash gather csv")
         group.add_argument("--annotate-csv", type=str, help="Path to 'with-lineages' file generated by running 'sourmash tax annotate' on a sourmash gather csv")
+        group.add_argument('--taxburst-json', type=str, help="taxburst JSON output")
         subparser.add_argument("--lingroups", type=str, help="Path to 'lingroups' file (lineage to lingroup mapping) to enable lingroup labeling in the Sankey diagram. Not needed if `csv_summary` was generated with `--lingroups` file provided.")
 
         subparser.add_argument("-o", "--output", type=str, help="output file for alluvial flow diagram")

@@ -2064,25 +2125,31 @@ def __init__(self, subparser):
         subparser.epilog = "You must provide either --summary-csv or --annotate-csv, but not both."
 
     def main(self, args):
-        # Build info appropriately based on input file type
-        if args.summary_csv:
-            input_csv = args.summary_csv
-            csv_type = "csv_summary"
-            required_headers = ["f_weighted_at_rank", "lineage"]
+        if args.summary_csv or args.annotate_csv:
+            # Build info appropriately based on input file type
+            if args.summary_csv:
+                input_csv = args.summary_csv
+                csv_type = "csv_summary"
+                required_headers = ["f_weighted_at_rank", "lineage"]
+            else:
+                input_csv = args.annotate_csv
+                csv_type = "with-lineages"
+                required_headers = ["f_unique_weighted", "lineage"]
+
+            # Check if the required headers are present
+            with open(input_csv, 'r') as file:
+                reader = csv.DictReader(file)
+                if not all(header in reader.fieldnames for header in required_headers):
+                    raise ValueError(f"Expected headers {required_headers} not found. Is this a correct file for '{csv_type}' type?")
+
+            # process csv
+            nodes, links, hover_texts = process_csv_for_sankey(input_csv, csv_type, lingroup_map=args.lingroups)
+            base_title = os.path.basename(input_csv.rsplit(".csv")[0])
+        elif args.taxburst_json:
+            nodes, links, hover_texts = process_taxburst_for_sankey(args.taxburst_json)
+            base_title = os.path.basename(args.taxburst_json.rsplit(".json")[0])
         else:
-            input_csv = args.annotate_csv
-            csv_type = "with-lineages"
-            required_headers = ["f_unique_weighted", "lineage"]
-
-        # Check if the required headers are present
-        with open(input_csv, 'r') as file:
-            reader = csv.DictReader(file)
-            if not all(header in reader.fieldnames for header in required_headers):
-                raise ValueError(f"Expected headers {required_headers} not found. Is this a correct file for '{csv_type}' type?")
-
-        # process csv
-        nodes, links, hover_texts = process_csv_for_sankey(input_csv, csv_type, lingroup_map=args.lingroups)
-        base_title = os.path.basename(input_csv.rsplit(".csv")[0])
+            assert 0, "unhandled input format"
 
         # Create Sankey diagram
         fig = go.Figure(go.Sankey(
@@ -2283,14 +2350,17 @@ class Command_TreeMap(CommandLinePlugin):
 
     def __init__(self, subparser):
         super().__init__(subparser)
-        subparser.add_argument('csvfile', help='csv_summary output from tax metagenome')
+        subparser.add_argument('inputfile', help='input taxonomy - by default, csv_summary output from tax metagenome')
         subparser.add_argument('-o', '--output', required=True,
                                help='output figure to this file')
         subparser.add_argument('-r', '--rank', default='phylum',
                                help='display at this rank')
         subparser.add_argument('-n', '--num-to-display', type=int,
                                default=25,
                                help="display at most these many taxa; aggregate the remainder (default: 25; 0 to display all)")
+        subparser.add_argument('--taxburst-json',
+                               action='store_true',
+                               help='input format is JSON from taxburst')
 
 
     def main(self, args):

@@ -2303,24 +2373,39 @@ def plot_treemap(args):
     import itertools
     cmap = colormaps['viridis']
 
-    df = pd.read_csv(args.csvfile)
+    if not args.taxburst_json:
+        df = pd.read_csv(args.inputfile)
 
-    print(f"reading input file '{args.csvfile}'")
-    for colname in ('query_name', 'rank', 'f_weighted_at_rank', 'lineage'):
-        if colname not in df.columns:
-            print(f"input is missing column '{colname}'; is this a csv_summary file?")
-            sys.exit(-1)
+        print(f"reading input file '{args.inputfile}'")
+        for colname in ('query_name', 'rank', 'f_weighted_at_rank', 'lineage'):
+            if colname not in df.columns:
+                print(f"input is missing column '{colname}'; is this a csv_summary file?")
+                sys.exit(-1)
 
-    df = df.sort_values(by='f_weighted_at_rank')
+        df = df.sort_values(by='f_weighted_at_rank')
 
-    # select rank
-    df2 = df[df['rank'] == args.rank]
-    df2['name'] = df2['lineage'].apply(lambda x: x.split(';')[-1])
+        # select rank
+        df2 = df[df['rank'] == args.rank]
+        df2['name'] = df2['lineage'].apply(lambda x: x.split(';')[-1])
 
-    fractions = list(df2['f_weighted_at_rank'].tolist())
-    names = list(df2['name'].tolist())
-    fractions.reverse()
-    names.reverse()
+        fractions = list(df2['f_weighted_at_rank'].tolist())
+        names = list(df2['name'].tolist())
+        fractions.reverse()
+        names.reverse()
+    else:
+        assert args.taxburst_json
+        top_nodes = load_taxburst_json(args.inputfile)
+
+        all_nodes = taxburst.tree_utils.collect_all_nodes(top_nodes)
+        all_nodes = [ n for n in all_nodes if n["rank"] == args.rank ]
+        unclass = [ n for n in top_nodes if n["name"] == "unclassified" ]
+        if unclass:
+            assert len(unclass) == 1
+            all_nodes.append(unclass[0])
+
+        all_nodes.sort(key=lambda n: -n["count"])
+        fractions = [ n["count"] for n in all_nodes ]
+        names = [ n["name"] for n in all_nodes ]
 
     num = max(args.num_to_display, 0)  # non-negative
     num = min(args.num_to_display, len(names))  # list of names
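The clamping of `num_to_display` above feeds an aggregation step that falls outside this diff. As a purely hypothetical illustration of the documented behavior — "display at most these many taxa; aggregate the remainder" with 0 meaning "display all" — assuming `fractions` holds the normalized counts computed earlier:

```python
def top_n_with_other(names, fractions, num):
    # Keep the `num` largest entries and lump the rest into "other".
    # num <= 0 means "display all" (per the --num-to-display help text).
    pairs = sorted(zip(names, fractions), key=lambda p: -p[1])
    if num <= 0 or num >= len(pairs):
        return pairs
    kept = pairs[:num]
    remainder = sum(f for _, f in pairs[num:])
    return kept + [("other", remainder)]

# invented example data
names = ["Bacillota", "Bacteroidota", "Pseudomonadota", "Actinomycetota"]
fracs = [0.4, 0.3, 0.2, 0.1]
result = top_n_with_other(names, fracs, 2)
print([n for n, _ in result])  # ['Bacillota', 'Bacteroidota', 'other']
```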
