@@ -219,7 +219,9 @@ class AttentiveProbe(Probe):
    """
    def __init__(
        self, dkey, source_seq_length, input_dim, out_dim, num_heads=8, attn_dim=64,
-        target_seq_length=1, learnable_query_dim=32, batch_size=1, hid_dim=32, use_LN=True, use_LN_input=False, use_softmax=True, **kwargs
+        target_seq_length=1, learnable_query_dim=32, batch_size=1, hid_dim=32,
+        use_LN=True, use_LN_input=False, use_softmax=True, dropout=0.5, eta=0.0002,
+        eta_decay=0.0, min_eta=1e-5, **kwargs
    ):
        super().__init__(dkey, batch_size, **kwargs)
        assert attn_dim % num_heads == 0, f"`attn_dim` must be divisible by `num_heads`. Got {attn_dim} and {num_heads}."
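With the new keyword arguments, the dropout rate and the learning-rate schedule can be set when the probe is built. A minimal instantiation sketch, assuming illustrative shapes and schedule values that are not taken from this commit:

from jax import random

dkey = random.PRNGKey(0)
# Hypothetical dimensions, chosen only to illustrate the new signature
probe = AttentiveProbe(
    dkey, source_seq_length=16, input_dim=512, out_dim=10,
    num_heads=8, attn_dim=64, dropout=0.3,       # dropout is now configurable
    eta=2e-4, eta_decay=1e-3, min_eta=1e-5,      # learning-rate schedule
)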
@@ -232,9 +234,9 @@ def __init__(
        self.use_softmax = use_softmax
        self.use_LN = use_LN
        self.use_LN_input = use_LN_input
-        self.dropout = 0.5
+        self.dropout = dropout

-        sigma = 0.05
+        sigma = 0.02
        ## cross-attention parameters
        Wq = random.normal(subkeys[0], (learnable_query_dim, attn_dim)) * sigma
        bq = random.normal(subkeys[1], (1, attn_dim)) * sigma
@@ -287,7 +289,10 @@ def __init__(
        self.grad_fx = jax.value_and_grad(eval_attention_probe, argnums=1, has_aux=True) #, allow_int=True)
        ## set up update rule/optimizer
        self.optim_params = adam.adam_init(self.probe_params)
-        self.eta = 0.0002 #0.001
+        # Learning rate scheduling
+        self.eta = eta #0.001
+        self.eta_decay = eta_decay
+        self.min_eta = min_eta

        # Finally, the dkey for the noise_key
        self.noise_key = subkeys[24]
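For reference, jax.value_and_grad(fn, argnums=1, has_aux=True) builds a function that returns ((loss, aux), grads), differentiating with respect to the second positional argument. A small self-contained sketch of that calling convention, using a toy loss that is only an assumption and not the probe's eval_attention_probe:

import jax
import jax.numpy as jnp

def toy_loss(x, params):
    # returns (scalar loss, auxiliary predictions), matching has_aux=True
    preds = x @ params["W"]
    return jnp.mean((preds - 1.0) ** 2), preds

grad_fx = jax.value_and_grad(toy_loss, argnums=1, has_aux=True)
(loss, preds), grads = grad_fx(jnp.ones((4, 3)), {"W": jnp.zeros((3, 2))})
# grads mirrors the structure of the second argument: {"W": ...}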
@@ -319,5 +324,7 @@ def update(self, embeddings, labels, dkey=None):
        self.optim_params, self.probe_params = adam.adam_step(
            self.optim_params, self.probe_params, grads, eta=self.eta
        )
+
+        self.eta = max(self.min_eta, self.eta - self.eta_decay * self.eta)
        return loss, predictions
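The added schedule shrinks eta multiplicatively after every optimizer step and floors it at min_eta. A standalone sketch of how the rule behaves, with illustrative starting values that are assumptions rather than values from this commit:

eta, eta_decay, min_eta = 2e-4, 1e-3, 1e-5
for step in range(3):
    # equivalent to eta *= (1 - eta_decay), clipped below at min_eta
    eta = max(min_eta, eta - eta_decay * eta)
# with eta_decay=0.0 (the default), eta never changes and the old behavior is preserved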