Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 68 additions & 28 deletions graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,17 +171,21 @@ def _version_tuple(version: str) -> tuple[int, ...]:
Reads the leading digits of each dot-segment, so pre/post-release suffixes
(``1.0.0rc1``) compare by their numeric core. A non-numeric or empty segment
becomes 0, so a malformed stamp degrades to a conservative comparison rather
than raising.
than raising. PEP 440 local versions (``+local``) sort after their public
version, matching packaging.version for the install-warning direction check.
"""
base, plus, _local = str(version).partition("+")
parts: list[int] = []
for segment in str(version).split("."):
for segment in base.split("."):
digits = ""
for ch in segment:
if ch.isdigit():
digits += ch
else:
break
parts.append(int(digits) if digits else 0)
if plus:
parts.append(1)
return tuple(parts)


Expand Down Expand Up @@ -802,12 +806,8 @@ def install(platform: str = "claude", *, project: bool = False, project_dir: Pat
if platform == "opencode":
_install_opencode_plugin(project_dir if project else Path("."))

# Refresh version stamps in all other previously-installed skill dirs so
# stale-version warnings don't fire for platforms not explicitly re-installed.
if project:
_print_project_git_add_hint([_project_scope_root(skill_dst, project_dir)])
else:
_refresh_all_version_stamps()

print()
print("Done. Open your AI coding assistant and type:")
Expand All @@ -822,6 +822,34 @@ def _print_install_usage() -> None:
print(f"Platforms: {platforms}")


def _print_cluster_usage(command: str) -> None:
    """Print the per-subcommand help text for ``label`` or ``cluster-only``.

    ``command == "label"`` selects the ``graphify label`` help; every other
    value prints the ``graphify cluster-only`` help.  The output is a usage
    line, a blank line, an ``Options:`` header and one line per option.
    """
    if command == "label":
        usage_line = "Usage: graphify label <path> [options]"
        option_lines = (
            " --missing-only keep existing labels and only name missing/placeholder communities",
            " --backend <name> backend to use (default: auto-detect from API keys)",
            " --model <name> model to use for community naming",
            " --max-concurrency N parallel labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)",
            " --batch-size N communities per labeling LLM call (default 100)",
        )
    else:
        usage_line = "Usage: graphify cluster-only <path> [options]"
        option_lines = (
            " --graph <path> path to graph.json (default <path>/graphify-out/graph.json)",
            " --no-viz skip graph.html generation (useful for >5000 node graphs / CI)",
            " --no-label keep 'Community N' placeholders (skip LLM community naming)",
            " --resolution N Leiden/Louvain resolution (higher = more, smaller communities)",
            " --exclude-hubs N exclude top-N percentile degree hubs from partitioning",
            " --backend <name> backend to use for community naming (default: auto-detect)",
            " --model <name> model to use for community naming",
            " --max-concurrency N parallel community-labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)",
            " --batch-size N communities per labeling LLM call (default 100)",
            " --min-community-size N omit smaller communities from report detail (default 3)",
            " --timing print per-stage wall-clock timings",
        )

    # The empty string reproduces the blank separator line between the usage
    # line and the options header.
    for text in (usage_line, "", "Options:", *option_lines):
        print(text)


# The always-on instruction blocks are packaged markdown under graphify/always_on/,
# generated by tools/skillgen and guarded by `skillgen --check`. Reading them at
# load keeps the install-string / issue-#580 contract byte-for-byte while letting
Expand Down Expand Up @@ -2291,10 +2319,14 @@ def main() -> None:
print(" --no-viz skip graph.html generation (useful for >5000 node graphs / CI)")
print(" --graph <path> path to graph.json (default <path>/graphify-out/graph.json)")
print(" --no-label keep 'Community N' placeholders (skip LLM community naming)")
print(" --resolution=N Leiden/Louvain resolution (higher = more, smaller communities)")
print(" --exclude-hubs=N exclude top-N percentile degree hubs from partitioning")
print(" --backend=<name> backend to use for community naming (default: auto-detect)")
print(" --model=<name> model to use for community naming")
print(" --max-concurrency=N parallel community-labeling LLM calls (default 4; forced to 1 for ollama/claude-cli)")
print(" --batch-size=N communities per labeling LLM call (default 100)")
print(" --min-community-size=N omit smaller communities from report detail (default 3)")
print(" --timing print per-stage wall-clock timings")
print(" label <path> (re)name communities with the configured LLM backend, regenerate report")
print(" --missing-only keep existing labels and only name missing/placeholder communities")
print(" --backend=<name> backend to use (default: auto-detect from API keys)")
Expand Down Expand Up @@ -2444,7 +2476,10 @@ def main() -> None:
# "install"/"uninstall" which have their own per-subcommand help handlers.
_FREE_TEXT_CMDS = {"query", "explain", "path", "save-result", "install", "uninstall"}
if cmd not in _FREE_TEXT_CMDS and any(a in {"-h", "--help", "-?"} for a in sys.argv[2:]):
print(f"Run 'graphify --help' for full usage.")
if cmd in {"cluster-only", "label"}:
_print_cluster_usage(cmd)
else:
print(f"Run 'graphify --help' for full usage.")
return

if cmd == "install":
Expand Down Expand Up @@ -3093,6 +3128,7 @@ def main() -> None:
)
sys.exit(1)
from graphify.serve import _score_nodes
from graphify.security import sanitize_label as _sl
from networkx.readwrite import json_graph
import networkx as _nx

Expand Down Expand Up @@ -3120,19 +3156,19 @@ def main() -> None:
src_scored = _score_nodes(G, [t.lower() for t in source_label.split()])
tgt_scored = _score_nodes(G, [t.lower() for t in target_label.split()])
if not src_scored:
print(f"No node matching '{source_label}' found.", file=sys.stderr)
print(f"No node matching '{_sl(source_label)}' found.", file=sys.stderr)
sys.exit(1)
if not tgt_scored:
print(f"No node matching '{target_label}' found.", file=sys.stderr)
print(f"No node matching '{_sl(target_label)}' found.", file=sys.stderr)
sys.exit(1)
src_nid, tgt_nid = src_scored[0][1], tgt_scored[0][1]
# Ambiguity guard: when both queries resolve to the same node, the
# shortest path is trivially zero hops, which is almost never what the
# caller wanted (see bug #828).
if src_nid == tgt_nid:
print(
f"'{source_label}' and '{target_label}' both resolved to the same "
f"node '{src_nid}'. Use a more specific label or the exact node ID.",
f"'{_sl(source_label)}' and '{_sl(target_label)}' both resolved to the same "
f"node '{_sl(src_nid)}'. Use a more specific label or the exact node ID.",
file=sys.stderr,
)
sys.exit(1)
Expand All @@ -3148,7 +3184,7 @@ def main() -> None:
try:
path_nodes = _nx.shortest_path(G.to_undirected(as_view=True), src_nid, tgt_nid)
except (_nx.NetworkXNoPath, _nx.NodeNotFound):
print(f"No path found between '{source_label}' and '{target_label}'.")
print(f"No path found between '{_sl(source_label)}' and '{_sl(target_label)}'.")
sys.exit(0)
hops = len(path_nodes) - 1
segments = []
Expand All @@ -3162,15 +3198,16 @@ def main() -> None:
else:
edata = edge_data(G, v, u)
forward = False
rel = edata.get("relation", "")
conf = edata.get("confidence", "")
rel = _sl(str(edata.get("relation", "")))
conf = _sl(str(edata.get("confidence", "")))
conf_str = f" [{conf}]" if conf else ""
if i == 0:
segments.append(G.nodes[u].get("label", u))
segments.append(_sl(G.nodes[u].get("label", u)))
v_label = _sl(G.nodes[v].get("label", v))
if forward:
segments.append(f"--{rel}{conf_str}--> {G.nodes[v].get('label', v)}")
segments.append(f"--{rel}{conf_str}--> {v_label}")
else:
segments.append(f"<--{rel}{conf_str}-- {G.nodes[v].get('label', v)}")
segments.append(f"<--{rel}{conf_str}-- {v_label}")
print(f"Shortest path ({hops} hops):\n " + " ".join(segments))
from graphify import querylog
querylog.log_query(
Expand All @@ -3185,6 +3222,7 @@ def main() -> None:
print('Usage: graphify explain "<node>" [--graph path]', file=sys.stderr)
sys.exit(1)
from graphify.serve import _find_node
from graphify.security import sanitize_label as _sl
from networkx.readwrite import json_graph

label = sys.argv[2]
Expand All @@ -3209,17 +3247,19 @@ def main() -> None:
G = json_graph.node_link_graph(_raw)
matches = _find_node(G, label)
if not matches:
print(f"No node matching '{label}' found.")
print(f"No node matching '{_sl(label)}' found.")
sys.exit(0)
nid = matches[0]
d = G.nodes[nid]
print(f"Node: {d.get('label', nid)}")
print(f" ID: {nid}")
print(
f" Source: {d.get('source_file', '')} {d.get('source_location', '')}".rstrip()
)
print(f" Type: {d.get('file_type', '')}")
print(f" Community: {d.get('community_name') or d.get('community', '')}")
print(f"Node: {_sl(d.get('label', nid))}")
print(f" ID: {_sl(nid)}")
source_info = (
f"{_sl(str(d.get('source_file', '')))} "
f"{_sl(str(d.get('source_location', '')))}"
).rstrip()
print(f" Source: {source_info}")
print(f" Type: {_sl(str(d.get('file_type', '')))}")
print(f" Community: {_sl(str(d.get('community_name') or d.get('community', '')))}")
# Work-memory overlay: a derived experiential hint from `graphify reflect`,
# merged in display-only from the .graphify_learning.json sidecar next to
# graph.json. No line when the node has no overlay entry.
Expand Down Expand Up @@ -3255,10 +3295,10 @@ def main() -> None:
print(f"\nConnections ({len(connections)}):")
connections.sort(key=lambda c: G.degree(c[1]), reverse=True)
for direction, nb, edata in connections[:20]:
rel = edata.get("relation", "")
conf = edata.get("confidence", "")
rel = _sl(str(edata.get("relation", "")))
conf = _sl(str(edata.get("confidence", "")))
arrow = "-->" if direction == "out" else "<--"
print(f" {arrow} {G.nodes[nb].get('label', nb)} [{rel}] [{conf}]")
print(f" {arrow} {_sl(G.nodes[nb].get('label', nb))} [{rel}] [{conf}]")
if len(connections) > 20:
print(f" ... and {len(connections) - 20} more")
from graphify import querylog
Expand Down
47 changes: 46 additions & 1 deletion graphify/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,44 @@ def _normalize_hyperedge_members(he: object) -> None:
he.pop(alias, None)


def _remap_hyperedge_members(
    hyperedges: list,
    remap: dict,
    *,
    allow_normalized: bool = False,
) -> None:
    """Apply node-id remaps to hyperedge member lists in place.

    Edges are rewired through the dedup and graph-build remap tables; hyperedges
    are graph metadata, so they need the same treatment or they retain dangling
    member IDs after the canonical node has won.

    Args:
        hyperedges: Hyperedge records. Each one is first passed through
            ``_normalize_hyperedge_members``; records that are not dicts with
            a list under ``"nodes"`` are then skipped. A falsy value is
            treated as an empty list.
        remap: Mapping from a member reference to its replacement node id.
            A falsy mapping makes the whole call a no-op (nothing is
            normalized either).
        allow_normalized: When true, a string member that has no exact entry
            in ``remap`` is looked up a second time under
            ``_normalize_id(member)``.

    Members that collapse onto an id already present in the same hyperedge are
    dropped (first occurrence wins). Unhashable members cannot be looked up or
    de-duplicated and are kept unchanged.
    """
    if not remap:
        return
    # Private sentinel: distinguishes "key absent" from "key maps to itself".
    # Comparing the looked-up value to the member with ``is`` would treat an
    # identity entry (same string object as key and value) as a miss and let
    # the normalized fallback override an exact match.
    missing = object()
    for he in hyperedges or []:
        _normalize_hyperedge_members(he)
        if not isinstance(he, dict) or not isinstance(he.get("nodes"), list):
            continue
        remapped: list = []
        seen: set = set()
        for ref in he["nodes"]:
            try:
                mapped = remap.get(ref, missing)
            except TypeError:
                # Unhashable member: it cannot be a key of the remap table.
                mapped = missing
            if mapped is missing and allow_normalized and isinstance(ref, str):
                mapped = remap.get(_normalize_id(ref), missing)
            if mapped is missing:
                mapped = ref
            try:
                if mapped in seen:
                    continue
                seen.add(mapped)
            except TypeError:
                # Unhashable result: keep it, but it cannot be de-duplicated.
                pass
            remapped.append(mapped)
        he["nodes"] = remapped


def _norm_source_file(p: str | None, root: str | None = None) -> str | None:
"""Normalize path separators and relativize absolute paths.

Expand Down Expand Up @@ -519,6 +557,11 @@ def build_from_json(extraction: dict, *, directed: bool = False, root: str | Pat
alias = old_stem + suffix
norm_to_id.setdefault(_normalize_id(alias), nid)
norm_to_id.setdefault(alias, nid)
_remap_hyperedge_members(
extraction.get("hyperedges", []) or [],
norm_to_id,
allow_normalized=True,
)
# Iterate edges in a deterministic order. The graph is undirected and stores
# direction in _src/_tgt; when two edges collapse onto the same node pair the
# last write wins, so an unstable iteration order flips _src/_tgt run-to-run
Expand Down Expand Up @@ -656,10 +699,12 @@ def build(
combined["input_tokens"] += ext.get("input_tokens", 0)
combined["output_tokens"] += ext.get("output_tokens", 0)
if dedup and combined["nodes"]:
combined["nodes"], combined["edges"] = deduplicate_entities(
combined["nodes"], combined["edges"], remap = deduplicate_entities(
combined["nodes"], combined["edges"], communities={},
dedup_llm_backend=dedup_llm_backend,
return_remap=True,
)
_remap_hyperedge_members(combined["hyperedges"], remap)
return build_from_json(combined, directed=directed, root=root)


Expand Down
58 changes: 41 additions & 17 deletions graphify/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,16 @@ def _body_content(content: bytes) -> bytes:
return text[closer.start() + 3:].encode()


# Stat-based index: maps absolute path → {size, mtime_ns, hash}.
# Loaded once per process, flushed via atexit. Skips full file reads when
# size+mtime_ns are unchanged — same trade-off as make(1).
# Correctness risks: `touch` causes a harmless extra re-hash; same-size edits
# within NFS second-resolution mtime have a 1-second window (same as make).
# Stat-based index: maps root+absolute path → {size, mtime_ns, ctime_ns, hash}.
# Loaded once per process, flushed via atexit. By default this is advisory only:
# metadata is not a correctness proof on every filesystem, so file_hash still
# reads the file. Set GRAPHIFY_TRUST_STAT_CACHE=1 to re-enable the legacy
# metadata fastpath in controlled environments.
# Use `graphify extract --force` to bypass when needed.
_stat_index: dict[str, dict] = {}
_stat_index_root: Path | None = None
_stat_index_dirty: bool = False
_stat_index_registered: bool = False


def _stat_index_file(root: Path) -> Path:
Expand All @@ -101,10 +102,13 @@ def _stat_index_file(root: Path) -> Path:


def _ensure_stat_index(root: Path) -> None:
global _stat_index, _stat_index_root, _stat_index_dirty
if _stat_index_root is not None:
global _stat_index, _stat_index_root, _stat_index_dirty, _stat_index_registered
resolved_root = Path(root).resolve()
if _stat_index_root == resolved_root:
return
_stat_index_root = Path(root).resolve()
if _stat_index_root is not None:
_flush_stat_index()
_stat_index_root = resolved_root
p = _stat_index_file(_stat_index_root)
if p.exists():
try:
Expand All @@ -113,7 +117,10 @@ def _ensure_stat_index(root: Path) -> None:
_stat_index = {}
else:
_stat_index = {}
atexit.register(_flush_stat_index)
_stat_index_dirty = False
if not _stat_index_registered:
atexit.register(_flush_stat_index)
_stat_index_registered = True


def _flush_stat_index() -> None:
Expand Down Expand Up @@ -153,12 +160,17 @@ def _normalize_path(path: Path) -> Path:
return Path(os.path.normcase(s))


def _stat_cache_key(path: Path, root: Path) -> str:
    """Return the stat-index key for *path* when hashed relative to *root*.

    The key joins the resolved root and the resolved path with a NUL
    character. Both parts are included because the hash includes the path
    relative to the root, so one file seen under two roots needs two entries.
    """
    resolved_root = Path(root).resolve()
    resolved_path = Path(path).resolve()
    return "\0".join((str(resolved_root), str(resolved_path)))


def file_hash(path: Path, root: Path = Path(".")) -> str:
"""SHA256 of file contents + path relative to root.

Uses a stat-based fastpath (size + mtime_ns) to skip full reads when the
file hasn't changed. Falls through to full SHA256 on first encounter or
when stat changes. Index is flushed atomically at process exit.
Reads file contents for correctness. A stat index is still maintained for
callers that explicitly opt into the legacy metadata fastpath with
GRAPHIFY_TRUST_STAT_CACHE=1. Index is flushed atomically at process exit.

Using a relative path (not absolute) makes cache entries portable across
machines and checkout directories, so shared caches and CI work correctly.
Expand All @@ -174,14 +186,21 @@ def file_hash(path: Path, root: Path = Path(".")) -> str:
raise IsADirectoryError(f"file_hash requires a file, got: {p}")

_ensure_stat_index(root)
abs_key = str(p.resolve())
stat_key = _stat_cache_key(p, root)
st: "os.stat_result | None" = None
try:
st = p.stat()
entry = _stat_index.get(abs_key)
if (entry
entry = _stat_index.get(stat_key)
trust_stat_cache = os.environ.get("GRAPHIFY_TRUST_STAT_CACHE", "").lower() in (
"1",
"true",
"yes",
)
if (trust_stat_cache
and entry
and entry.get("size") == st.st_size
and entry.get("mtime_ns") == st.st_mtime_ns):
and entry.get("mtime_ns") == st.st_mtime_ns
and entry.get("ctime_ns") == st.st_ctime_ns):
return entry["hash"]
except OSError:
pass
Expand All @@ -199,7 +218,12 @@ def file_hash(path: Path, root: Path = Path(".")) -> str:
digest = h.hexdigest()

if st is not None:
_stat_index[abs_key] = {"size": st.st_size, "mtime_ns": st.st_mtime_ns, "hash": digest}
_stat_index[stat_key] = {
"size": st.st_size,
"mtime_ns": st.st_mtime_ns,
"ctime_ns": st.st_ctime_ns,
"hash": digest,
}
_stat_index_dirty = True

return digest
Expand Down
Loading