Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.

### Nx

- Add `Nx_core.Cache_dir` module with consolidated cache directory utilities respecting `RAVEN_CACHE_ROOT`, `XDG_CACHE_HOME`, and `HOME` fallback, replacing project-specific cache logic across the whole raven ecosystem (#133, @Arsalaan-Alam)
- Add `Nx_io.save_txt` / `Nx_io.load_txt` with NumPy-compatible formatting, comments, and dtype support (#120, @six-shot)
- Clarify `reshape` documentation to match its view-only semantics (@tmattio)
- Provide `nx.top`, `rune.top`, and `hugin.top` libraries that auto-install pretty printers in the OCaml toplevel and update Quill to load them (@tmattio)
Expand Down Expand Up @@ -54,6 +55,7 @@ All notable changes to this project will be documented in this file.
- Checkpointing overhaul: versioned `Train_state` with schema tagging, explicit `Checkpoint.{Snapshot,Artifact,Manifest,Repository}` (retention, tags, metadata), and simple save/load helpers for snapshots and params. (@tmattio)
- Overhaul dataset combinators: derive tensor specs from Rune dtype, fix sampling/window bugs, validate weighted sampling, and respect `drop_remainder` (@tmattio)
- Make dataset `prefetch` truly asynchronous with background domains and allow reusing an external Domainslib pool via `parallel_map ~pool` (@tmattio)
- Update BERT and GPT-2 tokenizer caches to use `Nx_core.Cache_dir` for consistent cache directory resolution (#133, @Arsalaan-Alam)
- Honor text dataset encodings via incremental Uutf decoding (#122, @Satarupa22-SD).
- Preserve empty sequential modules when unflattening so indices stay aligned for checkpoint round-tripping (@tmattio)
- Prevent `Training.fit`/`evaluate` from consuming entire datasets eagerly and fail fast when a dataset yields no batches, avoiding hangs and division-by-zero crashes (@tmattio)
Expand Down Expand Up @@ -115,6 +117,7 @@ All notable changes to this project will be documented in this file.

### Nx-datasets

- Migrate to `Nx_core.Cache_dir` for cache directory resolution, so datasets share the same cache root as the rest of the Raven ecosystem. (#133, @Arsalaan-Alam)
- Fix cache directory resolution to respect `RAVEN_CACHE_ROOT` (or fall back to `XDG_CACHE_HOME`/`HOME`), allowing custom cache locations. (#128, @Arsalaan-Alam)
- Switch CIFAR-10 loader to the binary archive so parsing succeeds again (@tmattio)
- Add a CIFAR-10 example (@tmattio)
Expand Down
11 changes: 1 addition & 10 deletions kaun/lib/kaun-models/bert.ml
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,7 @@ module Tokenizer = struct

let download_vocab_file model_id =
(* Download vocab file from HuggingFace if not present *)
let cache_dir =
match Sys.getenv_opt "XDG_CACHE_HOME" with
| Some dir -> dir
| None -> (
match Sys.getenv_opt "HOME" with
| Some home -> Filename.concat home ".cache"
| None -> "/tmp/.cache")
in
let kaun_cache = Filename.concat cache_dir "kaun" in
let vocab_cache = Filename.concat kaun_cache "vocab" in
let vocab_cache = Nx_core.Cache_dir.get_path_in_cache ~scope:["models"; "bert"] "vocab" in
let vocab_file = Filename.concat vocab_cache (model_id ^ "-vocab.txt") in

(* Create cache directory if it doesn't exist *)
Expand Down
2 changes: 1 addition & 1 deletion kaun/lib/kaun-models/dune
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(library
(public_name kaun.models)
(name kaun_models)
(libraries rune kaun kaun.huggingface nx yojson saga))
(libraries rune kaun kaun.huggingface nx nx.core yojson saga))
12 changes: 1 addition & 11 deletions kaun/lib/kaun-models/gpt2.ml
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,7 @@ module Tokenizer = struct

let download_vocab_and_merges model_id =
(* Download vocab and merges files from HuggingFace if not present *)
let cache_dir =
match Sys.getenv_opt "XDG_CACHE_HOME" with
| Some dir -> dir
| None -> (
match Sys.getenv_opt "HOME" with
| Some home -> Filename.concat home ".cache"
| None -> "/tmp/.cache")
in
let kaun_cache = Filename.concat cache_dir "kaun" in
let gpt2_cache = Filename.concat kaun_cache "gpt2" in
let model_cache = Filename.concat gpt2_cache model_id in
let model_cache = Nx_core.Cache_dir.get_path_in_cache ~scope:["models"; "gpt2"] model_id in
let vocab_file = Filename.concat model_cache "vocab.json" in
let merges_file = Filename.concat model_cache "merges.txt" in

Expand Down
13 changes: 1 addition & 12 deletions nx-datasets/lib/dataset_utils.ml
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,8 @@ let mkdir_p path perm =
initial_prefix components);
()

let get_cache_base_dir ?(getenv = Sys.getenv_opt) () =
match getenv "RAVEN_CACHE_ROOT" with
| Some dir when dir <> "" -> dir
| _ ->
let xdg = Xdg.create ~env:getenv () in
Filename.concat (Xdg.cache_dir xdg) "raven"

let get_cache_dir ?(getenv = Sys.getenv_opt) dataset_name =
let base = get_cache_base_dir ~getenv () in
let path = List.fold_left Filename.concat base [ "datasets"; dataset_name ] in
let sep = Filename.dir_sep.[0] in
if path <> "" && path.[String.length path - 1] = sep then path
else path ^ Filename.dir_sep
Nx_core.Cache_dir.get_path_in_cache ~getenv ~scope:["datasets"] dataset_name

(* Create [dir] with mode 0o755 by delegating to the two-argument [mkdir_p]
   defined earlier in this file (this binding shadows it). An [EEXIST] error
   is treated as success; any other exception propagates. *)
let mkdir_p dir =
  match mkdir_p dir 0o755 with
  | () -> ()
  | exception Unix.Unix_error (Unix.EEXIST, _, _) -> ()
Expand Down
24 changes: 6 additions & 18 deletions nx-datasets/lib/dataset_utils.mli
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
(** Utilities for downloading and managing datasets. *)

val get_cache_dir : ?getenv:(string -> string option) -> string -> string
(** Return the platform-specific cache directory path for the given dataset.
(** [get_cache_dir ?getenv dataset_name] returns the cache directory path for
the given dataset.

The cache directory is resolved using the following priority order: 1.
[RAVEN_CACHE_ROOT] environment variable (highest priority; absolute cache
root) 2. [XDG_CACHE_HOME] environment variable (if RAVEN_CACHE_ROOT not set)
3. [$HOME/.cache] (fallback, default behavior)

The resolved path will be "[cache_root]/datasets/[dataset_name]/", where
[cache_root] is either [RAVEN_CACHE_ROOT] or
"[XDG_CACHE_HOME or HOME]/raven", with platform-appropriate directory
separators and a trailing separator.
This is a convenience wrapper around {!Nx_core.Cache_dir.get_path_in_cache}
with [~scope:\["datasets"\]]. See {!Nx_core.Cache_dir.get_path_in_cache} for
details on cache directory resolution and environment variable priority.

{2 Parameters}
- dataset_name: the name of the dataset.
Expand All @@ -21,14 +16,7 @@ val get_cache_dir : ?getenv:(string -> string option) -> string -> string

@param getenv
optional environment lookup function (defaults to [Sys.getenv_opt]) to
facilitate testing.

{2 Environment Variables}
- [RAVEN_CACHE_ROOT]: Custom cache directory root (overrides all other
settings)
- [XDG_CACHE_HOME]: XDG Base Directory cache location (standard on
Linux/Unix)
- [HOME]: User home directory (used for fallback cache location) *)
facilitate testing. *)

val download_file : string -> string -> unit
(** Download a file from a URL to a destination path.
Expand Down
2 changes: 1 addition & 1 deletion nx-datasets/lib/dune
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
(library
(name nx_datasets)
(public_name nx-datasets)
(libraries unix zip curl csv nx bigarray_ext logs xdg))
(libraries unix zip curl csv nx nx.core bigarray_ext logs))
1 change: 0 additions & 1 deletion nx-datasets/vendor/dune

This file was deleted.

16 changes: 16 additions & 0 deletions nx/lib/core/cache_dir.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
(* cache_dir.ml *)

(* Resolve the root of the Raven cache.

   A non-empty [RAVEN_CACHE_ROOT] is returned verbatim. When it is unset or
   empty, the result is the "raven" subdirectory of the cache directory
   reported by [Xdg], which is queried through the same [getenv]. *)
let get_root ?(getenv = Sys.getenv_opt) () =
  (* Delayed so that [Xdg] is only consulted when no override is present. *)
  let xdg_default () =
    let dirs = Xdg.create ~env:getenv () in
    Filename.concat (Xdg.cache_dir dirs) "raven"
  in
  match getenv "RAVEN_CACHE_ROOT" with
  | None | Some "" -> xdg_default ()
  | Some root -> root

(* Build [<root>/<scope...>/<name>] under the cache root returned by
   [get_root], and make sure the result ends with a directory separator.
   This only computes a string; nothing is created on disk. *)
let get_path_in_cache ?(getenv = Sys.getenv_opt) ~scope name =
  let root = get_root ~getenv () in
  let segments = scope @ [ name ] in
  let joined = List.fold_left Filename.concat root segments in
  let len = String.length joined in
  (* Do not double the separator when the joined path already ends with it. *)
  if len > 0 && Char.equal joined.[len - 1] Filename.dir_sep.[0] then joined
  else joined ^ Filename.dir_sep

58 changes: 58 additions & 0 deletions nx/lib/core/cache_dir.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
(** Cache directory utilities for the Raven ecosystem. *)

val get_root : ?getenv:(string -> string option) -> unit -> string
(** [get_root ?getenv ()] returns the base cache directory for Raven.

    The cache directory is resolved using the following priority order:
    + [RAVEN_CACHE_ROOT] environment variable (highest priority). A non-empty
      value is returned as-is, with no [raven] component appended; an empty
      value is treated as if the variable were unset.
    + [XDG_CACHE_HOME] environment variable (if [RAVEN_CACHE_ROOT] is not
      set).
    + [$HOME/.cache] (fallback, default behavior).

    Cases 2 and 3 are delegated to the [Xdg] library, and [raven] is appended
    to the directory it reports. The resolved path is therefore either
    [RAVEN_CACHE_ROOT] or "[XDG_CACHE_HOME or HOME/.cache]/raven".

    Environment variables consulted:
    - [RAVEN_CACHE_ROOT]: custom cache directory root (overrides all other
      settings).
    - [XDG_CACHE_HOME]: XDG Base Directory cache location (standard on
      Linux/Unix).
    - [HOME]: user home directory (used for the fallback cache location).

    @param getenv
      optional environment lookup function (defaults to [Sys.getenv_opt]) to
      facilitate testing. The same function is used for every lookup,
      including those made by [Xdg]. *)

val get_path_in_cache :
?getenv:(string -> string option) -> scope:string list -> string -> string
(** [get_path_in_cache ?getenv ~scope name] returns the cache directory path for
    a specific component.

    The path is built by joining, in order, the cache root (see {!get_root}),
    every element of [scope], and [name] with [Filename.concat]. The result
    always ends with [Filename.dir_sep]. This function only computes the
    path; it does not create the directory.

    @param scope
      list of directory names forming the scope (e.g. [\["datasets"\]],
      [\["models"; "bert"\]]).
    @param getenv
      optional environment lookup function (defaults to [Sys.getenv_opt]) to
      facilitate testing.
    @return
      the cache directory path, including a trailing directory separator.

    The final, unlabelled argument [name] is the specific name within the
    scope (e.g. "iris", "gpt2").

    Examples:

    Getting cache directory for the iris dataset:
    {[
      let cache_dir = Nx_core.Cache_dir.get_path_in_cache ~scope:["datasets"] "iris" in
      (* With default environment: ~/.cache/raven/datasets/iris/ *)
    ]}

    Getting cache directory with custom root:
    {[
      let getenv var =
        if var = "RAVEN_CACHE_ROOT" then Some "/tmp/my-cache" else None
      in
      let cache_dir =
        Nx_core.Cache_dir.get_path_in_cache ~getenv ~scope:["models"] "bert-base-uncased"
      in
      (* Result: /tmp/my-cache/models/bert-base-uncased/ *)
    ]} *)

2 changes: 1 addition & 1 deletion nx/lib/core/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(library
(name nx_core)
(public_name nx.core)
(libraries str bigarray_ext)
(libraries str bigarray_ext xdg)
(instrumentation
(backend landmarks)))
1 change: 1 addition & 0 deletions nx/lib/core/nx_core.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ module View = View
module Backend_intf = Backend_intf
module Make_frontend = Frontend.Make
module Error = Error
module Cache_dir = Cache_dir
File renamed without changes.
2 changes: 1 addition & 1 deletion nx-datasets/vendor/xdg/dune → nx/vendor/xdg/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(library
(name xdg)
(public_name nx-datasets.xdg)
(public_name nx.xdg)
(libraries unix)
(foreign_stubs
(language c)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading