ext/NavierStokes/callback.jl (5 changes: 3 additions & 2 deletions)
@@ -50,7 +50,8 @@ function create_callback(
figfile = nothing)
if callbackstate === nothing
# Initialize the callback state
- callbackstate = (; θmin = θ, loss_min = eltype(θ)(Inf), lhist_val = [],
+ # To store data coming from CUDA device, we have to serialize them to CPU
+ callbackstate = (; θmin = Array(θ), loss_min = eltype(Array(θ))(Inf), lhist_val = [],
lhist_train = [], lhist_nomodel = [])
end
if nunroll === nothing && batch_size === nothing
@@ -78,7 +79,7 @@ function create_callback(
l_val = loss_function(model, p, st, (y1, y2))[1]
# check if this set of p produces a lower validation loss
l_val < callbackstate.loss_min &&
- (callbackstate = (; callbackstate..., θmin = p, loss_min = l_val))
+ (callbackstate = (; callbackstate..., θmin = Array(p), loss_min = l_val))
@info "[$(step)] Validation Loss: $(l_val)"
no_model_loss = loss_function(model, callbackstate.θmin .* 0, st, (y1, y2))[1]
@info "[$(step)] Validation Loss (no model): $(no_model_loss)"
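Note on this change: parameters produced on the GPU cannot be serialized to disk or kept across runs as device arrays, so the callback now stores a host copy of the best parameters. A minimal sketch of the pattern, assuming CUDA.jl; the `θ` below is a hypothetical parameter vector:

```julia
using CUDA

# Parameters may live on the GPU when CUDA is available.
θ = CUDA.functional() ? cu(rand(Float32, 8)) : rand(Float32, 8)

# Array(θ) copies a CuArray back to host memory; for a plain Array it is
# just a copy, so the same code works on both backends.
θmin = Array(θ)               # host copy, safe to serialize
loss_min = eltype(θmin)(Inf)  # Inf in the parameter eltype (here Float32)
```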
simulations/Benchmark/benchmark.jl (13 changes: 7 additions & 6 deletions)
@@ -130,7 +130,7 @@ T = eval(Meta.parse(conf["T"]))
if CUDA.functional()
## For running on a CUDA compatible GPU
@info "Running on CUDA"
- @info CUDA.versioninfo()
+ cuda_active = true
backend = CUDABackend()
CUDA.allowscalar(false)
device = x -> adapt(CuArray, x)
@@ -140,6 +140,7 @@ else
## Consider reducing the sizes of DNS, LES, and CNN layers if
## you want to test run on a laptop.
@warn "Running on CPU"
+ cuda_active = false
backend = CPU()
device = identity
clean() = nothing
@@ -166,7 +167,7 @@ dns_seeds_test = dns_seeds[ntrajectory:ntrajectory]

# Create data
docreatedata = conf["docreatedata"]
- docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid)
+ docreatedata && createdata(; params, seeds = dns_seeds, outdir, taskid, backend)
@info "Data generated"

# Computational time
@@ -201,15 +202,15 @@ setups = map(nles -> getsetup(; params, nles), params.nles);
using Lux:relu
closure, θ_start, st = NS.load_model(conf)
# Get the same model structure in INS format
- closure_INS, θ_INS = cnn(;
+ closure_INS, θ_INS = NeuralClosure.cnn(;
setup = setups[1],
radii = conf["closure"]["radii"],
channels = conf["closure"]["channels"],
activations = [eval(Meta.parse(func)) for func in conf["closure"]["activations"]],
use_bias = conf["closure"]["use_bias"],
rng = eval(Meta.parse(conf["closure"]["rng"])),
)
- @assert θ_start == θ_INS
+ @assert device(θ_start) == device(θ_INS)

@info "Initialized CNN with $(length(θ_start)) parameters"

@@ -478,12 +479,12 @@ let
## No model
dudt_nomod = NS.create_right_hand_side(
setup, psolver)
- err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt)
+ err_post = create_loss_post_lux(dudt_nomod; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())
epost.nomodel[I] = err_post(closure, θ_cnn_post[I].*0 , st, data)[1]
# with closure
dudt = NS.create_right_hand_side_with_closure(
setup, psolver, closure, st)
- err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt)
+ err_post = create_loss_post_lux(dudt; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())
epost.cnn_prior[I] = err_post(closure, device(θ_cnn_prior[ig, ifil]), st, data)[1]
epost.cnn_post[I] = err_post(closure, device(θ_cnn_post[I]), st, data)[1]
clean()
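Note on the `@assert device(θ_start) == device(θ_INS)` change: with `CUDA.allowscalar(false)`, comparing a device array against a host array element by element can hit scalar indexing and error, so both operands are moved to the same device before comparing. A minimal sketch, reusing the `device` definition from the snippet above; the parameter vectors are hypothetical stand-ins for the CNN parameters:

```julia
using Adapt, CUDA

# Same device selection as in benchmark.jl above.
device = CUDA.functional() ? (x -> adapt(CuArray, x)) : identity

θ_start = rand(Float32, 4)  # e.g. parameters returned by NS.load_model
θ_INS = copy(θ_start)       # e.g. parameters from NeuralClosure.cnn

# Both sides now live on the same device, so == is a device-local reduction.
@assert device(θ_start) == device(θ_INS)
```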
simulations/Benchmark/configs/conf_3.yaml (6 changes: 3 additions & 3 deletions)
@@ -28,20 +28,20 @@ closure:
type: cnn
radii: [2, 2, 2, 2, 2]
channels: [24, 24, 24, 24, 2]
- activations: ["tanh", "tanh", "tanh", "tanh", "identity"]
+ activations: ["relu", "relu", "relu", "relu", "identity"]
use_bias: [true, true, true, true, false]
rng: "Xoshiro(seeds.θ_start)"
priori:
dotrain: true
- nepoch: 200
+ nepoch: 210
batchsize: 32
opt: "OptimiserChain(Adam(T(1.0e-2)), ClipGrad(1))"
do_plot: false
plot_train: false
posteriori:
dotrain: true
projectorders: "(ProjectOrder.Last, )"
- nepoch: 20
+ nepoch: 50
opt: "OptimiserChain(Adam(T(1.0e-3)), ClipGrad(1))"
nunroll: 3
nunroll_valid: 5
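For context, the strings in this config are parsed into Julia values when the benchmark loads it (see the `activations` comprehension in `benchmark.jl` above). A minimal sketch of that round trip, assuming the file is read with YAML.jl:

```julia
using YAML
using Lux: relu  # the string "relu" must resolve in the evaluating scope

conf = YAML.load_file("configs/conf_3.yaml")

# Each entry is parsed and evaluated into a function or object.
activations = [eval(Meta.parse(f)) for f in conf["closure"]["activations"]]
# With this config: [relu, relu, relu, relu, identity]
```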
simulations/Benchmark/job_a100.sh (6 changes: 2 additions & 4 deletions)
@@ -5,10 +5,10 @@
#SBATCH --cpus-per-task=18
#SBATCH --gpus=1
#SBATCH --partition=gpu_a100
- #SBATCH --time=05:00:00
+ #SBATCH --time=00:30:00
#SBATCH --mail-type=BEGIN,END
# #SBATCH --mail-user=s.ciarella@esciencecenter.nl
- #SBATCH --array=1-8
+ #SBATCH --array=1-1

module load 2023
module load juliaup/1.14.5-GCCcore-12.3.0
@@ -26,7 +26,5 @@ export CONF_FILE=$1

cd $HOME/CoupledNODE.jl/simulations/Benchmark

- julia --project -t auto -e 'using Pkg; Pkg.update()'
-
julia --project -t auto benchmark.jl

simulations/Benchmark/src/train.jl (28 changes: 21 additions & 7 deletions)
@@ -4,7 +4,7 @@ function getdatafile(outdir, nles, filter, seed)
end

"Create data files."
- createdata(; params, seeds, outdir, taskid) =
+ createdata(; params, seeds, outdir, taskid, backend) =
for (iseed, seed) in enumerate(seeds)
if isnothing(taskid) || iseed == taskid
@info "Creating DNS trajectory for seed $(repr(seed))"
@@ -24,7 +24,8 @@ createdata(; params, seeds, outdir, taskid) =
@info "Data file $(filenames[1]) already exists. Skipping."
continue
end
- data = create_les_data(; params..., rng = Xoshiro(seed), filenames, Δt = params.Δt)
+ data = create_les_data(;
+     params..., rng = Xoshiro(seed), filenames, Δt = params.Δt, backend = backend)
@info("Trajectory info:",
data[1].comptime/60,
length(data[1].t),
@@ -122,6 +123,18 @@ function trainprior(;
callbackstate = trainstate = nothing
nepochs_left = nepoch
end
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ @warn callbackstate
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ @warn trainstate
+ @info "----------------------"
+ @info "----------------------"
+ @info "----------------------"
+ exit()

callbackstate, callback = NS.create_callback(
closure, θ, io_valid[itotal], loss, st;
@@ -135,7 +148,7 @@
l, trainstate = CoupledNODE.train(
closure, θ, st, dataloader_prior, loss; tstate = trainstate,
nepochs = nepochs_left,
- alg = opt, cpu = params.backend == CPU(), callback = callback)
+ alg = opt, cpu = !CUDA.functional(), callback = callback)
end
save_object(checkfile, (callbackstate = callbackstate, trainstate = trainstate))

@@ -179,7 +192,7 @@ function trainpost(;
do_plot = false,
plot_train = false
)
- device(x) = CUDA.functional() ? adapt(params.backend, x) : x
+ device(x) = adapt(params.backend, x)
itotal = 0
for projectorder in projectorders,
(ifil, Φ) in enumerate(params.filters),
@@ -207,7 +220,7 @@
setup = []
for nl in nles
x = ntuple(α -> LinRange(T(0.0), T(1.0), nl + 1), params.D)
- push!(setup, Setup(; x = x, Re = params.Re))
+ push!(setup, Setup(; x = x, Re = params.Re, params.backend))
end

# Read the data in the format expected by the CoupledNODE
@@ -232,7 +245,8 @@

dudt_nn = NS.create_right_hand_side_with_closure(
setup[1], psolver, closure, st)
- loss = create_loss_post_lux(dudt_nn; sciml_solver = Tsit5(), dt = dt)
+ loss = create_loss_post_lux(
+     dudt_nn; sciml_solver = Tsit5(), dt = dt, use_cuda = CUDA.functional())

if loadcheckpoint && isfile(checkfile)
callbackstate, trainstate, epochs_trained = CoupledNODE.load_checkpoint(checkfile)
@@ -252,7 +266,7 @@
else
l, trainstate = CoupledNODE.train(
closure, θ, st, dataloader_post, loss; tstate = trainstate, nepochs = nepochs_left,
- alg = opt, cpu = params.backend == CPU(), callback = callback)
+ alg = opt, cpu = !CUDA.functional(), callback = callback)
end
save_object(checkfile, (callbackstate = callbackstate, trainstate = trainstate))

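Note on `device(x) = adapt(params.backend, x)`: the old `CUDA.functional()` ternary was redundant, since `adapt` already dispatches on the backend and falls back to returning `x` unchanged when no rule applies. A minimal sketch of the simplified helper, assuming `params.backend` is a KernelAbstractions-style backend as used above:

```julia
using Adapt
using CUDA
using KernelAbstractions: CPU

# One definition covers both cases: adapting to CPU() keeps a host Array,
# adapting to CUDABackend() yields a CuArray (via CUDA.jl's Adapt rules).
backend = CUDA.functional() ? CUDABackend() : CPU()
device(x) = adapt(backend, x)

x = rand(Float32, 16)
x_dev = device(x)  # Array on CPU, CuArray on GPU
```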
simulations/Benchmark/update_julia (8 changes: 4 additions & 4 deletions)
@@ -1,14 +1,14 @@
#!/bin/bash
#SBATCH --job-name=julia_update
#SBATCH --partition=gpu
- #SBATCH --time=01:00:00
+ #SBATCH --time=00:20:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=1
#SBATCH --output=update_julia.out
+
module load 2023
module load juliaup/1.14.5-GCCcore-12.3.0
-
- # Run Julia and update the environment
- julia -e 'using Pkg; Pkg.activate("."); Pkg.add(url="https://github.com/DEEPDIP-project/CoupledNODE.jl.git"); Pkg.add(url="https://github.com/DEEPDIP-project/NeuralClosure.jl.git"); Pkg.update(); Pkg.resolve()'
+
+ srun --unbuffered julia --project -t auto -e 'using Pkg; Pkg.update(); Pkg.resolve()'
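For reference, the `julia -e` one-liner above pins the two unregistered development dependencies straight from GitHub. The same step in a readable Julia session (a sketch; when and where to run it is an assumption):

```julia
using Pkg
Pkg.activate(".")
# Install the unregistered dev packages directly from their repositories.
Pkg.add(url = "https://github.com/DEEPDIP-project/CoupledNODE.jl.git")
Pkg.add(url = "https://github.com/DEEPDIP-project/NeuralClosure.jl.git")
Pkg.update()
Pkg.resolve()
```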