Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Full release notes with details on each version: [GitHub Releases](https://githu

## Unreleased

- Feat: per-project `query` defaults for `--budget`/`--depth`, read from an optional `graphify-out/config.json` (#1654, thanks @Ns2384-star). Declare `{"query": {"default_budget": N, "default_depth": N}}` (flat `budget`/`depth` keys also accepted) to seed the query CLI's defaults before flag parsing, so a per-repo budget/depth becomes the norm without retyping it — and an explicit `--budget`/`--depth` flag still overrides. The `query` command also gains a `--depth` flag (it had none, forcing the old hardcoded depth 2), making the traversal depth tunable per invocation. A missing, unreadable, malformed, or ill-typed config silently degrades to the built-in defaults (budget 2000, depth 2) so a bad file never crashes a query.
- Fix: a malformed semantic chunk no longer crashes `extract` and discards every successful chunk (#1631, thanks @ssazy). When an LLM returned a well-formed object whose `edges` (or `nodes`/`hyperedges`) array carried a stray non-dict entry — a nested list where an edge object belongs — the AST+semantic merge and the semantic-cache write both called `.get()` per entry and raised `AttributeError: 'list' object has no attribute 'get'`. On a 34-chunk run where 33 succeeded, that meant no `graph.json` was written and the cache write failed too, so a re-run re-extracted everything. `_parse_llm_json` now sanitizes each fragment at the single parse chokepoint (keeping only dict entries and coercing a non-list value to `[]`), so the cache writer, the adaptive-retry merge, and the CLI merge are all protected in one place.
- Fix: an unresolved bare npm import no longer aliases onto an unrelated same-named local file (#1638, thanks @EveX1). `import colors from "tailwindcss/colors"` in a `.tsx` file emitted an `imports_from` edge to the bare id `colors`, and build.py's pre-migration alias index (which registers every local file's bare stem) then remapped it onto an unrelated `backend/utils/colors.py` — a confident (`EXTRACTED`) cross-language phantom edge, and one per `.tsx` file sharing the import. In a real monorepo eight unrelated `.tsx` files all landed on a single Python module. Common package subpaths (`colors`, `utils`, `types`, `config`, `client`) collide this way constantly. The external-import fallback now namespaces its target with the `ref` prefix (the same J-4 convention used for tsconfig `extends`/`$ref` externals), so it can never collapse to a local file/symbol id; the ref-namespaced target has no node, so build drops it as an external reference — the correct outcome for a third-party import.
- Fix: `graph.json` node/edge ordering is now stable run-to-run for document/semantic corpora (#1632, thanks @umeshpsatwe). With a parallel LLM backend, `extract_corpus_parallel` merged chunk results in completion order, so which network call happened to return first reordered the nodes and edges even when the model returned identical content — churning `graph.json` between otherwise-identical runs. Chunks are now merged in deterministic submission order after the pool drains (matching the serial path); the progress callback still fires in completion order so long local runs aren't silent. Note: the semantic content the LLM extracts is itself nondeterministic run-to-run — this fix removes the pipeline's own ordering churn, not the model's variance.
Expand Down
38 changes: 35 additions & 3 deletions graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# Defined once in graphify.paths so the security/callflow path guards honour the
# same override (#1423).
from graphify.paths import GRAPHIFY_OUT as _GRAPHIFY_OUT
from graphify.paths import query_config_defaults as _query_config_defaults


@functools.lru_cache(maxsize=None)
Expand Down Expand Up @@ -2305,7 +2306,10 @@ def main() -> None:
print(" --dfs use depth-first instead of breadth-first")
print(" --context C explicit edge-context filter (repeatable)")
print(" --budget N cap output at N tokens (default 2000)")
print(" --depth N traversal depth (default 2)")
print(" --graph <path> path to graph.json (default graphify-out/graph.json)")
print(" (defaults for --budget/--depth can be set per-project in")
print(" graphify-out/config.json: {\"query\": {\"default_budget\": N, \"default_depth\": N}}; CLI flags override)")
print(" affected \"X\" reverse traversal to find nodes impacted by X")
print(" --relation R edge relation to traverse in reverse (repeatable)")
print(" --depth N reverse traversal depth (default 2)")
Expand Down Expand Up @@ -2842,7 +2846,7 @@ def main() -> None:
sys.exit(1)
elif cmd == "query":
if len(sys.argv) < 3:
print("Usage: graphify query \"<question>\" [--dfs] [--context C] [--budget N] [--graph path]", file=sys.stderr)
print("Usage: graphify query \"<question>\" [--dfs] [--context C] [--budget N] [--depth N] [--graph path]", file=sys.stderr)
sys.exit(1)
from graphify.serve import _query_graph_text
from graphify.security import sanitize_label
Expand All @@ -2851,7 +2855,15 @@ def main() -> None:

question = sys.argv[2]
use_dfs = "--dfs" in sys.argv
# Built-in defaults, optionally seeded from graphify-out/config.json;
# CLI flags below still override the config (#1654).
budget = 2000
depth = 2
_cfg_defaults = _query_config_defaults()
if "budget" in _cfg_defaults:
budget = _cfg_defaults["budget"]
if "depth" in _cfg_defaults:
depth = _cfg_defaults["depth"]
graph_path = _default_graph_path()
context_filters: list[str] = []
args = sys.argv[3:]
Expand All @@ -2871,6 +2883,26 @@ def main() -> None:
print(f"error: --budget must be an integer", file=sys.stderr)
sys.exit(1)
i += 1
elif args[i] == "--depth" and i + 1 < len(args):
try:
depth = int(args[i + 1])
except ValueError:
print(f"error: --depth must be an integer", file=sys.stderr)
sys.exit(1)
if depth <= 0:
print("error: --depth must be a positive integer", file=sys.stderr)
sys.exit(1)
i += 2
elif args[i].startswith("--depth="):
try:
depth = int(args[i].split("=", 1)[1])
except ValueError:
print(f"error: --depth must be an integer", file=sys.stderr)
sys.exit(1)
if depth <= 0:
print("error: --depth must be a positive integer", file=sys.stderr)
sys.exit(1)
i += 1
elif args[i] == "--context" and i + 1 < len(args):
context_filters.append(args[i + 1])
i += 2
Expand Down Expand Up @@ -2922,7 +2954,7 @@ def main() -> None:
G,
question,
mode=_mode,
depth=2,
depth=depth,
token_budget=budget,
context_filters=context_filters,
)
Expand All @@ -2932,7 +2964,7 @@ def main() -> None:
corpus=str(gp),
result=_result,
mode=_mode,
depth=2,
depth=depth,
token_budget=budget,
duration_ms=(_time.perf_counter() - _t0) * 1000,
)
Expand Down
59 changes: 59 additions & 0 deletions graphify/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import json
import os
import re
from pathlib import Path, PurePosixPath
Expand Down Expand Up @@ -232,3 +233,61 @@ def default_graph_json() -> str:
the path is passed explicitly (#1423).
"""
return str(out_path("graph.json"))


def query_config_defaults(config_path: Path | None = None) -> dict[str, int]:
"""Per-project ``query`` defaults read from ``graphify-out/config.json``.

Returns any ``budget``/``depth`` overrides the sidecar declares, as a dict
that may contain either, both, or neither key. The values seed the CLI's
built-in defaults before flag parsing, so a CLI flag still wins (#1654).

The file may nest the settings under a ``"query"`` object (the documented
shape) or place them at the top level, and either the
``default_budget``/``default_depth`` or bare ``budget``/``depth`` spelling
is accepted::

{"query": {"default_budget": 4000, "default_depth": 3}}

A missing file, unreadable file, malformed JSON, wrong top-level type, or
non-positive/non-integer values all degrade to an empty dict so a bad
config never crashes a query. A whole-valued float (``4000.0``) is coerced
to ``int``; a fractional float (``4000.5``), bool, string, or null is
rejected. When both a nested and a flat value are present the nested
``query`` object wins.
"""
defaults: dict[str, int] = {}
target = config_path if config_path is not None else out_path("config.json")
try:
raw = json.loads(Path(target).read_text(encoding="utf-8"))
except (OSError, ValueError):
return defaults
if not isinstance(raw, dict):
return defaults
section = raw.get("query")
if not isinstance(section, dict):
section = {}

def _pick(*keys: str) -> int | None:
for source in (section, raw):
for key in keys:
value = source.get(key)
# bool is an int subclass; reject it up front so True/False can
# never read as 1/0.
if isinstance(value, bool):
continue
if isinstance(value, int) and value > 0:
return value
# Accept a whole-valued float (4000.0 -> 4000) from a hand-written
# config; reject a fractional one (4000.5) and non-positive values.
if isinstance(value, float) and value.is_integer() and value > 0:
return int(value)
return None

budget = _pick("default_budget", "budget")
if budget is not None:
defaults["budget"] = budget
depth = _pick("default_depth", "depth")
if depth is not None:
defaults["depth"] = depth
return defaults
80 changes: 80 additions & 0 deletions tests/test_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

from __future__ import annotations

import json

import pytest

from graphify.paths import (
_is_test_path,
disambiguate_ambiguous_candidates,
query_config_defaults,
)


Expand Down Expand Up @@ -97,3 +100,80 @@ def test_disambiguate_path_proximity_same_dir() -> None:
"pkg/a/caller.py",
)
assert winner == "near"


# --- query_config_defaults (per-project config.json, #1654) -----------------


def _write_config(tmp_path, data) -> None:
    """Serialize ``data`` as JSON into ``config.json`` inside ``tmp_path``."""
    config_file = tmp_path / "config.json"
    payload = json.dumps(data)
    config_file.write_text(payload, encoding="utf-8")


def test_query_config_defaults_nested_query_object(tmp_path) -> None:
    """The documented nested ``query`` shape yields both overrides."""
    config = {"query": {"default_budget": 4000, "default_depth": 3}}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {"budget": 4000, "depth": 3}


def test_query_config_defaults_flat_keys(tmp_path) -> None:
    """Bare top-level ``budget``/``depth`` keys are accepted too."""
    config = {"budget": 1234, "depth": 5}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {"budget": 1234, "depth": 5}


def test_query_config_defaults_partial(tmp_path) -> None:
    """Declaring only a depth returns only the ``depth`` key."""
    config = {"query": {"default_depth": 4}}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {"depth": 4}


def test_query_config_defaults_nested_wins_over_flat(tmp_path) -> None:
    """A nested ``query`` value takes precedence over a flat one."""
    config = {"query": {"default_budget": 4000}, "budget": 9999}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {"budget": 4000}


def test_query_config_defaults_missing_file(tmp_path) -> None:
    """A path that does not exist degrades to no overrides."""
    absent = tmp_path / "does-not-exist.json"
    loaded = query_config_defaults(absent)
    assert loaded == {}


def test_query_config_defaults_malformed_json(tmp_path) -> None:
    """Unparseable JSON degrades to no overrides."""
    config_file = tmp_path / "config.json"
    config_file.write_text("{not valid json", encoding="utf-8")
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {}


def test_query_config_defaults_rejects_bad_values(tmp_path) -> None:
    """Strings, bools, zero, and negatives never become overrides."""
    # Nested section carries a non-int and a negative; the flat keys carry a
    # bool and a zero -- none of the four is a usable value.
    nested_bad = {"default_budget": "lots", "default_depth": -1}
    config = {"query": nested_bad, "budget": True, "depth": 0}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {}


def test_query_config_defaults_non_dict_top_level(tmp_path) -> None:
    """Valid JSON whose top level is a list degrades to no overrides."""
    config_file = tmp_path / "config.json"
    payload = json.dumps([1, 2, 3])
    config_file.write_text(payload, encoding="utf-8")
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {}


def test_query_config_defaults_whole_valued_float_accepted(tmp_path) -> None:
    """Whole-valued floats such as ``4000.0`` are coerced to ints."""
    # Hand-written configs frequently contain floats where ints were meant.
    config = {"query": {"default_budget": 4000.0, "default_depth": 3.0}}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {"budget": 4000, "depth": 3}


def test_query_config_defaults_fractional_float_rejected(tmp_path) -> None:
    """Fractional floats cannot serve as an integer budget or depth."""
    config = {"query": {"default_budget": 4000.5, "default_depth": 2.5}}
    _write_config(tmp_path, config)
    loaded = query_config_defaults(tmp_path / "config.json")
    assert loaded == {}


def test_query_config_defaults_absolute_graphify_out(tmp_path, monkeypatch) -> None:
    """Calling with no path reads ``config.json`` from the GRAPHIFY_OUT override."""
    # The reader falls back to out_path() when no explicit path is given, and
    # that helper is expected to honour an absolute GRAPHIFY_OUT (#1423 / #686).
    import graphify.paths as paths

    override_dir = tmp_path / "shared" / "graphify-out"
    override_dir.mkdir(parents=True)
    payload = json.dumps({"query": {"default_budget": 7000, "default_depth": 4}})
    (override_dir / "config.json").write_text(payload, encoding="utf-8")
    monkeypatch.setattr(paths, "GRAPHIFY_OUT", str(override_dir))
    resolved = query_config_defaults()
    assert resolved == {"budget": 7000, "depth": 4}
Loading