-
Notifications
You must be signed in to change notification settings - Fork 217
Open
Labels
bug: Something isn't working
Description
An xUnit test hangs indefinitely during L-BFGS optimization on CUDA when using TorchSharp version 0.105.2.
FYI: I am not using the latest version (0.106.0) because it no longer supports CUDA compute capability 6.1.
using System;
using TorchSharp;
using TorchSharp.Modules;
using Xunit;
using static TorchSharp.torch;
public class LbfgsCudaStressTests
{
private readonly Device _gpu = CUDA;
private readonly Device _cpu = CPU;
/// <summary>
/// Stress test: calls <c>step()</c> on an L-BFGS optimizer 200 times for a single
/// CUDA-resident parameter and fails if any one call does not finish within 2 seconds.
/// </summary>
/// <remarks>
/// When CUDA is not available the method returns early, so the test is reported as
/// passed rather than skipped.
/// </remarks>
[Fact]
public void Lbfgs_Should_Not_Hang_On_CUDA()
{
    if (!cuda.is_available())
    {
        return; // Skip on machines without CUDA
    }

    using var scope = NewDisposeScope();
    var device = _gpu;

    // Per-step budget: long enough for one L-BFGS step on a 1-element tensor,
    // short enough to flag a hang quickly.
    var stepTimeout = TimeSpan.FromSeconds(2);

    // Simple 1-parameter model
    var w = torch.randn(new long[] { 1 }, device: device, requiresGrad: true);
    var optimizer = optim.LBFGS(new[] { w }, lr: 1.0);

    // Stress loop: the hang is reported to occur when step() is called repeatedly on CUDA.
    for (int i = 0; i < 200; i++)
    {
        // Run the step on a worker thread so the test thread can enforce the timeout.
        var task = System.Threading.Tasks.Task.Run(() =>
        {
            optimizer.step(() =>
            {
                // The closure may be evaluated several times per step; clear the
                // previous gradients so backward() does not accumulate into them.
                optimizer.zero_grad();

                // Simple quadratic loss: (w - 3)^2
                var loss = (w - 3).pow(2).sum();
                loss.backward();
                return loss;
            });
        });

        // Wait() returns false on timeout and rethrows (as AggregateException) if the
        // step itself threw, so a true result means the step ran to completion.
        bool finished = task.Wait(stepTimeout);
        Assert.True(finished, $"L-BFGS hang detected on CUDA at iteration {i}");
    }
}
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug: Something isn't working