ext/NavierStokes/callback.jl (5 changes: 3 additions & 2 deletions)
@@ -50,7 +50,8 @@ function create_callback(
figfile = nothing)
if callbackstate === nothing
# Initialize the callback state
- callbackstate = (; θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
+ # To store data coming from CUDA device, we have to serialize them to CPU
+ callbackstate = (; θmin = Array(θ), loss_min = eltype(Array(θ))(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = [])
end
if nunroll === nothing && batch_size === nothing
@@ -78,7 +79,7 @@ function create_callback(
l_val = loss_function(model, p, st, (y1, y2))[1]
# check if this set of p produces a lower validation loss
l_val < callbackstate.loss_min &&
- (callbackstate = (; callbackstate..., θmin = p, loss_min = l_val))
+ (callbackstate = (; callbackstate..., θmin = Array(p), loss_min = l_val))
@info "[$(step)] Validation Loss: $(l_val)"
no_model_loss = loss_function(model, callbackstate.θmin .* 0, st, (y1, y2))[1]
@info "[$(step)] Validation Loss (no model): $(no_model_loss)"
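Note on this change: parameters produced on the GPU cannot be serialized to disk or kept across runs as device arrays, so the callback now stores a host copy of the best parameters. A minimal sketch of the pattern, assuming CUDA.jl; the `θ` below is a hypothetical parameter vector:

```julia
using CUDA

# Parameters may live on the GPU when CUDA is available.
θ = CUDA.functional() ? cu(rand(Float32, 8)) : rand(Float32, 8)

# Array(θ) copies a CuArray back to host memory; for a plain Array it is
# just a copy, so the same code works on both backends.
θmin = Array(θ)               # host copy, safe to serialize
loss_min = eltype(θmin)(Inf)  # Inf in the parameter eltype (here Float32)
```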
simulations/Benchmark/benchmark.jl (13 changes: 7 additions & 6 deletions)
@@ -130,7 +130,7 @@ T = eval(Meta.parse(conf["T"]))
if CUDA.functional()
## For running on a CUDA compatible GPU
@info "Running on CUDA"
- @info CUDA.versioninfo()
+ cuda_active = true
backend = CUDABackend()
CUDA.allowscalar(false)
device = x -> adapt(CuArray, x)
@@ -140,6 +140,7 @@ else
## Consider reducing the sizes of DNS, LES, and CNN layers if
## you want to test run on a laptop.
@warn "Running on CPU"
+ cuda_active = false
backend = CPU()
device = identity
clean() = nothing
@@ -166,7 +167,7 @@ dns_seeds_test = dns_seeds[ntrajectory:ntrajectory]

# Create data
docreatedata = conf["docreatedata"]
- docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid)
+ docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid, backend)
@info "Data generated"

# Computational time
@@ -201,15 +202,15 @@ setups = map(nles -> getsetup(; params, nles), params.nles);
using Lux:relu
closure, θ_start, st = NS.load_model(conf)
# Get the same model structure in INS format
- closure_INS, θ_INS = cnn(;
+ closure_INS, θ_INS = NeuralClosure.cnn(;
setup = setups[1],
radii = conf["closure"]["radii"],
channels = conf["closure"]["channels"],
activations = [eval(Meta.parse(func)) for func in conf["closure"]["activations"]],
use_bias = conf["closure"]["use_bias"],
rng = eval(Meta.parse(conf["closure"]["rng"])),
)
- @assert θ_start == θ_INS
+ @assert device(θ_start) == device(θ_INS)

@info "Initialized CNN with $(length(θ_start)) parameters"

@@ -478,12 +479,12 @@ let
## No model
dudt_nomod = NS.create_right_hand_side(
setup, psolver)
- err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt)
+ err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())
epost.nomodel[I] = err_post(closure, θ_cnn_post[I].*0 , st, data)[1]
# with closure
dudt = NS.create_right_hand_side_with_closure(
setup, psolver, closure, st)
- err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt)
+ err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())
epost.cnn_prior[I] = err_post(closure, device(θ_cnn_prior[ig, ifil]), st, data)[1]
epost.cnn_post[I] = err_post(closure, device(θ_cnn_post[I]), st, data)[1]
clean()
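Note on the `@assert device(θ_start) == device(θ_INS)` change: with `CUDA.allowscalar(false)`, comparing a device array against a host array element by element can hit scalar indexing and error, so both operands are moved to the same device before comparing. A minimal sketch, reusing the `device` definition from the snippet above; the parameter vectors are hypothetical stand-ins for the CNN parameters:

```julia
using Adapt, CUDA

# Same device selection as in benchmark.jl above.
device = CUDA.functional() ? (x -> adapt(CuArray, x)) : identity

θ_start = rand(Float32, 4)  # e.g. parameters returned by NS.load_model
θ_INS = copy(θ_start)       # e.g. parameters from NeuralClosure.cnn

# Both sides now live on the same device, so == is a device-local reduction.
@assert device(θ_start) == device(θ_INS)
```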
simulations/Benchmark/configs/conf_3.yaml (6 changes: 3 additions & 3 deletions)
@@ -28,20 +28,20 @@ closure:
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
- activations: ["tanh", "tanh", "tanh", "tanh", "identity"]
+ activations: ["relu", "relu", "relu", "relu", "identity"]
use_bias: [true, true, true, true, false]
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
- nepoch: 200
+ nepoch: 210
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
do_plot: false
plot_train: false
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
- nepoch: 20
+ nepoch: 50
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 3
nunroll_valid: 5
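For context, the strings in this config are parsed into Julia values when the benchmark loads it (see the `activations` comprehension in `benchmark.jl` above). A minimal sketch of that round trip, assuming the file is read with YAML.jl:

```julia
using YAML
using Lux: relu  # the string "relu" must resolve in the evaluating scope

conf = YAML.load_file("configs/conf_3.yaml")

# Each entry is parsed and evaluated into a function or object.
activations = [eval(Meta.parse(f)) for f in conf["closure"]["activations"]]
# With this config: [relu, relu, relu, relu, identity]
```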
simulations/Benchmark/job_a100.sh (6 changes: 2 additions & 4 deletions)
@@ -5,10 +5,10 @@
#SBATCH --cpus-per-task=18
#SBATCH --gpus=1
#SBATCH --partition=gpu_a100
- #SBATCH --time=05:00:00
+ #SBATCH --time=00:30:00
#SBATCH --mail-type=BEGIN,END
# #SBATCH --mail-user=s.ciarella@esciencecenter.nl
- #SBATCH --array=1-8
+ #SBATCH --array=1-1

module load 2023
module load juliaup/1.14.5-GCCcore-12.3.0
@@ -26,7 +26,5 @@ export CONF_FILE=$1

cd $HOME/CoupledNODE.jl/simulations/Benchmark

- julia --project -t auto -e 'using Pkg; Pkg.update()'
-
julia --project -t auto benchmark.jl

simulations/Benchmark/src/train.jl (28 changes: 21 additions & 7 deletions)
@@ -4,7 +4,7 @@ function getdatafile(outdir, nles, filter, seed)
end

"Create data files."
- createdata(; params, seeds, outdir, taskid) =
+ createdata(; params, seeds, outdir, taskid, backend) =
for (iseed, seed) in enumerate(seeds)
if isnothing(taskid) || iseed == taskid
@info "Creating DNS trajectory for seed $(repr(seed))"
@@ -24,7 +24,8 @@ createdata(; params, seeds, outdir, taskid) =
@info "Data file $(filenames[1]) already exists. Skipping."
continue
end
- data = create_les_data(; params..., rng = Xoshiro(seed), filenames, Δt = params.Δt)
+ data = create_les_data(;
+     params..., rng = Xoshiro(seed), filenames, Δt = params.Δt, backend = backend)
@info("Trajectory info:",
data[1].comptime/60,
length(data[1].t),
@@ -122,6 +123,18 @@ function trainprior(;
callbackstate = trainstate = nothing
nepochs_left = nepoch
end
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ @warn callbackstate
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ @warn trainstate
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ exit()

callbackstate, callback = NS.create_callback(
closure, θ, io_valid[itotal], loss, st;
@@ -135,7 +148,7 @@
l, trainstate = CoupledNODE.train(
closure, θ, st, dataloader_prior, loss; tstate = trainstate,
nepochs = nepochs_left,
- alg = opt, cpu = params.backend == CPU(), callback = callback)
+ alg = opt, cpu = !CUDA.functional(), callback = callback)
end
save_object(checkfile, (callbackstate = callbackstate, trainstate = trainstate))

@@ -179,7 +192,7 @@ function trainpost(;
do_plot = false,
plot_train = false
)
- device(x) = CUDA.functional() ? adapt(params.backend, x) : x
+ device(x) = adapt(params.backend, x)
itotal = 0
for projectorder in projectorders,
(ifil, Φ) in enumerate(params.filters),
@@ -207,7 +220,7 @@
setup = []
for nl in nles
x = ntuple(α -> LinRange(T(0.0), T(1.0), nl + 1), params.D)
- push!(setup, Setup(; x = x, Re = params.Re))
+ push!(setup, Setup(; x = x, Re = params.Re, params.backend))
end

# Read the data in the format expected by the CoupledNODE
@@ -232,7 +245,8 @@

dudt_nn = NS.create_right_hand_side_with_closure(
setup[1], psolver, closure, st)
- loss = create_loss_post_lux(dudt_nn; sciml_solver = Tsit5(), dt = dt)
+ loss = create_loss_post_lux(
+     dudt_nn; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())

if loadcheckpoint && isfile(checkfile)
callbackstate, trainstate, epochs_trained = CoupledNODE.load_checkpoint(checkfile)
@@ -252,7 +266,7 @@
else
l, trainstate = CoupledNODE.train(
closure, θ, st, dataloader_post, loss; tstate = trainstate, nepochs = nepochs_left,
- alg = opt, cpu = params.backend == CPU(), callback = callback)
+ alg = opt, cpu = !CUDA.functional(), callback = callback)
end
save_object(checkfile, (callbackstate = callbackstate, trainstate = trainstate))

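Note on `device(x) = adapt(params.backend, x)`: the old `CUDA.functional()` ternary was redundant, since `adapt` already dispatches on the backend and falls back to returning `x` unchanged when no rule applies. A minimal sketch of the simplified helper, assuming `params.backend` is a KernelAbstractions-style backend as used above:

```julia
using Adapt
using CUDA
using KernelAbstractions: CPU

# One definition covers both cases: adapting to CPU() keeps a host Array,
# adapting to CUDABackend() yields a CuArray (via CUDA.jl's Adapt rules).
backend = CUDA.functional() ? CUDABackend() : CPU()
device(x) = adapt(backend, x)

x = rand(Float32, 16)
x_dev = device(x)  # Array on CPU, CuArray on GPU
```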
simulations/Benchmark/update_julia (8 changes: 4 additions & 4 deletions)
@@ -1,14 +1,14 @@
#!/bin/bash
#SBATCH --job-name=julia_update
#SBATCH --partition=gpu
- #SBATCH --time=01:00:00
+ #SBATCH --time=00:20:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=1
#SBATCH --output=update_julia.out
+
module load 2023
module load juliaup/1.14.5-GCCcore-12.3.0
-
- # Run Julia and update the environment
- julia -e 'using Pkg; Pkg.activate("."); Pkg.add(url="https://github.com/DEEPDIP-project/CoupledNODE.jl.git"); Pkg.add(url="https://github.com/DEEPDIP-project/NeuralClosure.jl.git"); Pkg.update(); Pkg.resolve()'
+
+ srun --unbuffered julia --project -t auto -e 'using Pkg; Pkg.update(); Pkg.resolve()'
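For reference, the `julia -e` one-liner above pins the two unregistered development dependencies straight from GitHub. The same step in a readable Julia session (a sketch; when and where to run it is an assumption):

```julia
using Pkg
Pkg.activate(".")
# Install the unregistered dev packages directly from their repositories.
Pkg.add(url = "https://github.com/DEEPDIP-project/CoupledNODE.jl.git")
Pkg.add(url = "https://github.com/DEEPDIP-project/NeuralClosure.jl.git")
Pkg.update()
Pkg.resolve()
```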