Graphify-Labs · endritmurati99 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/graphify/__main__.py b/graphify/__main__.py
@@ -171,17 +171,21 @@ def _version_tuple(version: str) -> tuple[int, ...]:
     Reads the leading digits of each dot-segment, so pre/post-release suffixes
     (``1.0.0rc1``) compare by their numeric core. A non-numeric or empty segment
     becomes 0, so a malformed stamp degrades to a conservative comparison rather
-    than raising.
+    than raising. PEP 440 local versions (``+local``) sort after their public
+    version, matching packaging.version for the install-warning direction check.
     """
+    base, plus, _local = str(version).partition("+")
     parts: list[int] = []
-    for segment in str(version).split("."):
+    for segment in base.split("."):
         digits = ""
         for ch in segment:
             if ch.isdigit():
                 digits += ch
             else:
                 break
         parts.append(int(digits) if digits else 0)
+    if plus:
+        parts.append(1)
     return tuple(parts)
 
 
@@ -802,12 +806,8 @@ def install(platform: str = "claude", *, project: bool = False, project_dir: Pat
     if platform == "opencode":
         _install_opencode_plugin(project_dir if project else Path("."))
 
-    # Refresh version stamps in all other previously-installed skill dirs so
-    # stale-version warnings don't fire for platforms not explicitly re-installed.
     if project:
         _print_project_git_add_hint([_project_scope_root(skill_dst, project_dir)])
-    else:
-        _refresh_all_version_stamps()
 
     print()
     print("Done. Open your AI coding assistant and type:")
@@ -822,6 +822,34 @@ def _print_install_usage() -> None:
     print(f"Platforms: {platforms}")
 
 
+def _print_cluster_usage(command: str) -> None:  # prints per-subcommand usage text to stdout
+    if command == "label":  # `label` has its own, shorter option list
+        print("Usage: graphify label <path> [options]")
+        print()
+        print("Options:")
+        print("  --missing-only         keep existing labels and only name missing/placeholder communities")
+        print("  --backend <name>       backend to use (default: auto-detect from API keys)")
+        print("  --model <name>         model to use for community naming")
+        print("  --max-concurrency N    parallel labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)")
+        print("  --batch-size N         communities per labeling LLM call (default 100)")
+        return  # do not fall through into the cluster-only text
+    # Any command value other than "label" gets the cluster-only usage below.
+    print("Usage: graphify cluster-only <path> [options]")
+    print()
+    print("Options:")
+    print("  --graph <path>          path to graph.json (default <path>/graphify-out/graph.json)")
+    print("  --no-viz                skip graph.html generation (useful for >5000 node graphs / CI)")
+    print("  --no-label              keep 'Community N' placeholders (skip LLM community naming)")
+    print("  --resolution N          Leiden/Louvain resolution (higher = more, smaller communities)")
+    print("  --exclude-hubs N        exclude top-N percentile degree hubs from partitioning")
+    print("  --backend <name>        backend to use for community naming (default: auto-detect)")
+    print("  --model <name>          model to use for community naming")
+    print("  --max-concurrency N     parallel community-labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)")
+    print("  --batch-size N          communities per labeling LLM call (default 100)")
+    print("  --min-community-size N  omit smaller communities from report detail (default 3)")
+    print("  --timing                print per-stage wall-clock timings")
+
+
 # The always-on instruction blocks are packaged markdown under graphify/always_on/,
 # generated by tools/skillgen and guarded by `skillgen --check`. Reading them at
 # load keeps the install-string / issue-#580 contract byte-for-byte while letting
@@ -2291,10 +2319,14 @@ def main() -> None:
         print("    --no-viz                skip graph.html generation (useful for >5000 node graphs / CI)")
         print("    --graph <path>          path to graph.json (default <path>/graphify-out/graph.json)")
         print("    --no-label              keep 'Community N' placeholders (skip LLM community naming)")
+        print("    --resolution=N          Leiden/Louvain resolution (higher = more, smaller communities)")
+        print("    --exclude-hubs=N        exclude top-N percentile degree hubs from partitioning")
         print("    --backend=<name>        backend to use for community naming (default: auto-detect)")
         print("    --model=<name>          model to use for community naming")
         print("    --max-concurrency=N     parallel community-labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)")
         print("    --batch-size=N          communities per labeling LLM call (default 100)")
+        print("    --min-community-size=N  omit smaller communities from report detail (default 3)")
+        print("    --timing                print per-stage wall-clock timings")
         print("  label <path>            (re)name communities with the configured LLM backend, regenerate report")
         print("    --missing-only         keep existing labels and only name missing/placeholder communities")
         print("    --backend=<name>        backend to use (default: auto-detect from API keys)")
@@ -2444,7 +2476,10 @@ def main() -> None:
     # "install"/"uninstall" which have their own per-subcommand help handlers.
     _FREE_TEXT_CMDS = {"query", "explain", "path", "save-result", "install", "uninstall"}
     if cmd not in _FREE_TEXT_CMDS and any(a in {"-h", "--help", "-?"} for a in sys.argv[2:]):
-        print(f"Run 'graphify --help' for full usage.")
+        if cmd in {"cluster-only", "label"}:
+            _print_cluster_usage(cmd)
+        else:
+            print(f"Run 'graphify --help' for full usage.")
         return
 
     if cmd == "install":
@@ -3093,6 +3128,7 @@ def main() -> None:
             )
             sys.exit(1)
         from graphify.serve import _score_nodes
+        from graphify.security import sanitize_label as _sl
         from networkx.readwrite import json_graph
         import networkx as _nx
 
@@ -3120,19 +3156,19 @@ def main() -> None:
         src_scored = _score_nodes(G, [t.lower() for t in source_label.split()])
         tgt_scored = _score_nodes(G, [t.lower() for t in target_label.split()])
         if not src_scored:
-            print(f"No node matching '{source_label}' found.", file=sys.stderr)
+            print(f"No node matching '{_sl(source_label)}' found.", file=sys.stderr)
             sys.exit(1)
         if not tgt_scored:
-            print(f"No node matching '{target_label}' found.", file=sys.stderr)
+            print(f"No node matching '{_sl(target_label)}' found.", file=sys.stderr)
             sys.exit(1)
         src_nid, tgt_nid = src_scored[0][1], tgt_scored[0][1]
         # Ambiguity guard: when both queries resolve to the same node, the
         # shortest path is trivially zero hops, which is almost never what the
         # caller wanted (see bug #828).
         if src_nid == tgt_nid:
             print(
-                f"'{source_label}' and '{target_label}' both resolved to the same "
-                f"node '{src_nid}'. Use a more specific label or the exact node ID.",
+                f"'{_sl(source_label)}' and '{_sl(target_label)}' both resolved to the same "
+                f"node '{_sl(src_nid)}'. Use a more specific label or the exact node ID.",
                 file=sys.stderr,
             )
             sys.exit(1)
@@ -3148,7 +3184,7 @@ def main() -> None:
         try:
             path_nodes = _nx.shortest_path(G.to_undirected(as_view=True), src_nid, tgt_nid)
         except (_nx.NetworkXNoPath, _nx.NodeNotFound):
-            print(f"No path found between '{source_label}' and '{target_label}'.")
+            print(f"No path found between '{_sl(source_label)}' and '{_sl(target_label)}'.")
             sys.exit(0)
         hops = len(path_nodes) - 1
         segments = []
@@ -3162,15 +3198,16 @@ def main() -> None:
             else:
                 edata = edge_data(G, v, u)
                 forward = False
-            rel = edata.get("relation", "")
-            conf = edata.get("confidence", "")
+            rel = _sl(str(edata.get("relation", "")))
+            conf = _sl(str(edata.get("confidence", "")))
             conf_str = f" [{conf}]" if conf else ""
             if i == 0:
-                segments.append(G.nodes[u].get("label", u))
+                segments.append(_sl(G.nodes[u].get("label", u)))
+            v_label = _sl(G.nodes[v].get("label", v))
             if forward:
-                segments.append(f"--{rel}{conf_str}--> {G.nodes[v].get('label', v)}")
+                segments.append(f"--{rel}{conf_str}--> {v_label}")
             else:
-                segments.append(f"<--{rel}{conf_str}-- {G.nodes[v].get('label', v)}")
+                segments.append(f"<--{rel}{conf_str}-- {v_label}")
         print(f"Shortest path ({hops} hops):\n  " + " ".join(segments))
         from graphify import querylog
         querylog.log_query(
@@ -3185,6 +3222,7 @@ def main() -> None:
             print('Usage: graphify explain "<node>" [--graph path]', file=sys.stderr)
             sys.exit(1)
         from graphify.serve import _find_node
+        from graphify.security import sanitize_label as _sl
         from networkx.readwrite import json_graph
 
         label = sys.argv[2]
@@ -3209,17 +3247,19 @@ def main() -> None:
             G = json_graph.node_link_graph(_raw)
         matches = _find_node(G, label)
         if not matches:
-            print(f"No node matching '{label}' found.")
+            print(f"No node matching '{_sl(label)}' found.")
             sys.exit(0)
         nid = matches[0]
         d = G.nodes[nid]
-        print(f"Node: {d.get('label', nid)}")
-        print(f"  ID:        {nid}")
-        print(
-            f"  Source:    {d.get('source_file', '')} {d.get('source_location', '')}".rstrip()
-        )
-        print(f"  Type:      {d.get('file_type', '')}")
-        print(f"  Community: {d.get('community_name') or d.get('community', '')}")
+        print(f"Node: {_sl(d.get('label', nid))}")
+        print(f"  ID:        {_sl(nid)}")
+        source_info = (
+            f"{_sl(str(d.get('source_file', '')))} "
+            f"{_sl(str(d.get('source_location', '')))}"
+        ).rstrip()
+        print(f"  Source:    {source_info}")
+        print(f"  Type:      {_sl(str(d.get('file_type', '')))}")
+        print(f"  Community: {_sl(str(d.get('community_name') or d.get('community', '')))}")
         # Work-memory overlay: a derived experiential hint from `graphify reflect`,
         # merged in display-only from the .graphify_learning.json sidecar next to
         # graph.json. No line when the node has no overlay entry.
@@ -3255,10 +3295,10 @@ def main() -> None:
             print(f"\nConnections ({len(connections)}):")
             connections.sort(key=lambda c: G.degree(c[1]), reverse=True)
             for direction, nb, edata in connections[:20]:
-                rel = edata.get("relation", "")
-                conf = edata.get("confidence", "")
+                rel = _sl(str(edata.get("relation", "")))
+                conf = _sl(str(edata.get("confidence", "")))
                 arrow = "-->" if direction == "out" else "<--"
-                print(f"  {arrow} {G.nodes[nb].get('label', nb)} [{rel}] [{conf}]")
+                print(f"  {arrow} {_sl(G.nodes[nb].get('label', nb))} [{rel}] [{conf}]")
             if len(connections) > 20:
                 print(f"  ... and {len(connections) - 20} more")
         from graphify import querylog

diff --git a/graphify/build.py b/graphify/build.py
@@ -103,6 +103,44 @@ def _normalize_hyperedge_members(he: object) -> None:
         he.pop(alias, None)
 
 
+def _remap_hyperedge_members(
+    hyperedges: list,
+    remap: dict,
+    *,
+    allow_normalized: bool = False,
+) -> None:
+    """Apply node-id remaps to each hyperedge's ``nodes`` list in place.
+
+    Edges are rewired through the dedup and graph-build remap tables; hyperedges
+    are graph metadata, so they need the same treatment or they retain dangling
+    member IDs. ``allow_normalized`` retries an unchanged str member via ``_normalize_id``.
+    """
+    if not remap:  # empty/None table: nothing to rewrite (members are not normalized either)
+        return
+    for he in hyperedges or []:  # tolerate hyperedges=None
+        _normalize_hyperedge_members(he)  # presumably folds alias keys into "nodes" — confirm in helper
+        if not isinstance(he, dict) or not isinstance(he.get("nodes"), list):
+            continue  # malformed hyperedge: leave it untouched
+        remapped: list = []  # rewritten members, in first-seen order
+        seen: set = set()  # hashable members already emitted for this hyperedge
+        for ref in he["nodes"]:
+            mapped = ref  # default when no replacement is found
+            try:
+                mapped = remap.get(ref, ref)
+            except TypeError:  # unhashable member cannot be looked up in a dict
+                pass  # keep the member unchanged
+            if mapped is ref and allow_normalized and isinstance(ref, str):  # first lookup left it unchanged
+                mapped = remap.get(_normalize_id(ref), ref)  # second chance under the normalized key
+            try:
+                if mapped in seen:  # two members collapsed onto the same id
+                    continue
+                seen.add(mapped)
+            except TypeError:  # unhashable member cannot be tracked in the set
+                pass  # so it is appended without de-duplication
+            remapped.append(mapped)
+        he["nodes"] = remapped  # replaces the list object on the hyperedge dict
+
+
 def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
     """Normalize path separators and relativize absolute paths.
 
@@ -519,6 +557,11 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
             alias = old_stem + suffix
             norm_to_id.setdefault(_normalize_id(alias), nid)
             norm_to_id.setdefault(alias, nid)
+    _remap_hyperedge_members(
+        extraction.get("hyperedges", []) or [],
+        norm_to_id,
+        allow_normalized=True,
+    )
     # Iterate edges in a deterministic order. The graph is undirected and stores
     # direction in _src/_tgt; when two edges collapse onto the same node pair the
     # last write wins, so an unstable iteration order flips _src/_tgt run-to-run
@@ -656,10 +699,12 @@ def build(
         combined["input_tokens"] += ext.get("input_tokens", 0)
         combined["output_tokens"] += ext.get("output_tokens", 0)
     if dedup and combined["nodes"]:
-        combined["nodes"], combined["edges"] = deduplicate_entities(
+        combined["nodes"], combined["edges"], remap = deduplicate_entities(
             combined["nodes"], combined["edges"], communities={},
             dedup_llm_backend=dedup_llm_backend,
+            return_remap=True,
         )
+        _remap_hyperedge_members(combined["hyperedges"], remap)
     return build_from_json(combined, directed=directed, root=root)
 
 

diff --git a/graphify/cache.py b/graphify/cache.py
@@ -83,15 +83,16 @@ def _body_content(content: bytes) -> bytes:
     return text[closer.start() + 3:].encode()
 
 
-# Stat-based index: maps absolute path → {size, mtime_ns, hash}.
-# Loaded once per process, flushed via atexit. Skips full file reads when
-# size+mtime_ns are unchanged — same trade-off as make(1).
-# Correctness risks: `touch` causes a harmless extra re-hash; same-size edits
-# within NFS second-resolution mtime have a 1-second window (same as make).
+# Stat-based index: maps root+absolute path → {size, mtime_ns, ctime_ns, hash}.
+# Loaded once per process, flushed via atexit. By default this is advisory only:
+# metadata is not a correctness proof on every filesystem, so file_hash still
+# reads the file. Set GRAPHIFY_TRUST_STAT_CACHE=1 to re-enable the legacy
+# metadata fastpath in controlled environments.
 # Use `graphify extract --force` to bypass when needed.
 _stat_index: dict[str, dict] = {}
 _stat_index_root: Path | None = None
 _stat_index_dirty: bool = False
+_stat_index_registered: bool = False
 
 
 def _stat_index_file(root: Path) -> Path:
@@ -101,10 +102,13 @@ def _stat_index_file(root: Path) -> Path:
 
 
 def _ensure_stat_index(root: Path) -> None:
-    global _stat_index, _stat_index_root, _stat_index_dirty
-    if _stat_index_root is not None:
+    global _stat_index, _stat_index_root, _stat_index_dirty, _stat_index_registered
+    resolved_root = Path(root).resolve()
+    if _stat_index_root == resolved_root:
         return
-    _stat_index_root = Path(root).resolve()
+    if _stat_index_root is not None:
+        _flush_stat_index()
+    _stat_index_root = resolved_root
     p = _stat_index_file(_stat_index_root)
     if p.exists():
         try:
@@ -113,7 +117,10 @@ def _ensure_stat_index(root: Path) -> None:
             _stat_index = {}
     else:
         _stat_index = {}
-    atexit.register(_flush_stat_index)
+    _stat_index_dirty = False
+    if not _stat_index_registered:
+        atexit.register(_flush_stat_index)
+        _stat_index_registered = True
 
 
 def _flush_stat_index() -> None:
@@ -153,12 +160,17 @@ def _normalize_path(path: Path) -> Path:
     return Path(os.path.normcase(s))
 
 
+def _stat_cache_key(path: Path, root: Path) -> str:
+    """Return the stat-index key: resolved root and resolved path joined by NUL; both matter because the hash includes relpath."""
+    return f"{Path(root).resolve()}\0{Path(path).resolve()}"  # same file under a different root gets its own entry
+
+
 def file_hash(path: Path, root: Path = Path(".")) -> str:
     """SHA256 of file contents + path relative to root.
 
-    Uses a stat-based fastpath (size + mtime_ns) to skip full reads when the
-    file hasn't changed. Falls through to full SHA256 on first encounter or
-    when stat changes. Index is flushed atomically at process exit.
+    Reads file contents for correctness. A stat index is still maintained for
+    callers that explicitly opt into the legacy metadata fastpath with
+    GRAPHIFY_TRUST_STAT_CACHE=1. Index is flushed atomically at process exit.
 
     Using a relative path (not absolute) makes cache entries portable across
     machines and checkout directories, so shared caches and CI work correctly.
@@ -174,14 +186,21 @@ def file_hash(path: Path, root: Path = Path(".")) -> str:
         raise IsADirectoryError(f"file_hash requires a file, got: {p}")
 
     _ensure_stat_index(root)
-    abs_key = str(p.resolve())
+    stat_key = _stat_cache_key(p, root)
     st: "os.stat_result | None" = None
     try:
         st = p.stat()
-        entry = _stat_index.get(abs_key)
-        if (entry
+        entry = _stat_index.get(stat_key)
+        trust_stat_cache = os.environ.get("GRAPHIFY_TRUST_STAT_CACHE", "").lower() in (
+            "1",
+            "true",
+            "yes",
+        )
+        if (trust_stat_cache
+                and entry
                 and entry.get("size") == st.st_size
-                and entry.get("mtime_ns") == st.st_mtime_ns):
+                and entry.get("mtime_ns") == st.st_mtime_ns
+                and entry.get("ctime_ns") == st.st_ctime_ns):
             return entry["hash"]
     except OSError:
         pass
@@ -199,7 +218,12 @@ def file_hash(path: Path, root: Path = Path(".")) -> str:
     digest = h.hexdigest()
 
     if st is not None:
-        _stat_index[abs_key] = {"size": st.st_size, "mtime_ns": st.st_mtime_ns, "hash": digest}
+        _stat_index[stat_key] = {
+            "size": st.st_size,
+            "mtime_ns": st.st_mtime_ns,
+            "ctime_ns": st.st_ctime_ns,
+            "hash": digest,
+        }
         _stat_index_dirty = True
 
     return digest