update cuda backend + cleanups

samayala22 · Jan 5, 2025 · aee1e8c · aee1e8c
1 parent 684bd6c
commit aee1e8c
Show file tree

Hide file tree

Showing 13 changed files with 685 additions and 129 deletions.
diff --git a/data/2dof.py b/data/2dof.py
@@ -493,7 +493,7 @@ def format_subplot(fig, row, col, xlabel, ylabel):
         # Dimensionless parameters
         dt_nd = 0.2
         # t_final_nd = U_vel * 200.0
-        t_final_nd = 1000.0
+        t_final_nd = 100.0
         vec_t_nd = np.arange(0, t_final_nd, dt_nd)
         n = len(vec_t_nd)
 

diff --git a/data/theodorsen.py b/data/theodorsen.py
@@ -2,6 +2,7 @@
 import matplotlib.pyplot as plt
 import scipy.integrate as spi
 import scipy.special as scsp
+from pathlib import Path
 
 EPS_sqrt_f = np.sqrt(1.19209e-07)
 
@@ -26,14 +27,18 @@ def uvlm_data(filename):
     uvlm_t = []
     uvlm_z = []
     uvlm_alpha = []
-    with open(f"build/windows/x64/release/{filename}.txt", "r") as f:
-        k = float(f.readline()) # get reduced frequency of the simulation
-        for line in f:
-            time_step, z, cl, alpha = line.split()
-            uvlm_t.append(float(time_step))
-            uvlm_z.append(float(z))
-            uvlm_cl.append(float(cl))
-            uvlm_alpha.append(float(alpha))
+    filepath = f"build/windows/x64/debug/{filename}.txt"
+    if Path(filepath).exists():
+        with open(filepath, "r") as f:
+            k = float(f.readline()) # get reduced frequency of the simulation
+            for line in f:
+                time_step, z, cl, alpha = line.split()
+                uvlm_t.append(float(time_step))
+                uvlm_z.append(float(z))
+                uvlm_cl.append(float(cl))
+                uvlm_alpha.append(float(alpha))
+    else:
+        print(f"File {filepath} not found")
 
     uvlm_alpha = np.array(uvlm_alpha)
     uvlm_cl = np.array(uvlm_cl)
@@ -68,7 +73,8 @@ def uvlm_data(filename):
 )
 
 files = [
-    "cl_data",
+    "cl_data_CPU",
+    "cl_data_CUDA",
     # "cl_data_025",
     # "cl_data_050",
     # "cl_data_075",

diff --git a/mesh/cylinder.x b/mesh/cylinder.x
diff --git a/tests/ut_tensor.cpp b/tests/ut_tensor.cpp
@@ -17,20 +17,6 @@
 
 using namespace vlm;
 
-void print3d(const TensorView<f32, 3, Location::Host>& tensor) {
-    // Print the 3D tensor
-    for (i64 z = 0; z < tensor.shape(2); z++) {
-        std::printf("Layer %lld:\n", z);
-        for (i64 x = 0; x < tensor.shape(0); x++) {
-            for (i64 y = 0; y < tensor.shape(1); y++) {
-                std::printf("%6.1f ", tensor(x, y, z));
-            }
-            std::printf("\n");
-        }
-        std::printf("\n");
-    }
-}
-
 int main(int, char**) {
     const std::vector<std::string> backends = get_available_backends();
 

diff --git a/tests/uvlm_2dof.cpp b/tests/uvlm_2dof.cpp
@@ -33,7 +33,6 @@
 #define COUPLED 1
 // #define COUPLED_VTU_IO 1
 // #define COUPLED_NOAERO 1
-// #define COUPLED_NOAERO_NOHEAVE 1
 
 using namespace vlm;
 using namespace linalg::ostream_overloads;
@@ -274,14 +273,6 @@ void UVLM_2DOF::alloc_buffers() {
 //     return alpha;
 // }
 
-// def alpha_freeplay(alpha, M0 = 0.0, Mf = 0.0, delta = np.radians(0.5), a_f = np.radians(0.25)):
-//     if (alpha < a_f):
-//         return M0 + alpha - a_f
-//     elif (alpha >= a_f and alpha <= (a_f + delta)):
-//         return M0 + Mf * (alpha - a_f)
-//     else: # alpha > a_F + delta
-//         return M0 + alpha - a_f + delta * (Mf - 1)
-
 f32 torsional_func(f32 alpha, f32 M0 = 0.0f, f32 Mf = 0.0f, f32 delta = to_radians(0.5f), f32 a_f = to_radians(0.25f)) {
     if (alpha < a_f) {
         return M0 + alpha - a_f;
@@ -367,11 +358,7 @@ void UVLM_2DOF::initialize_structure_data(const UVLM_2DOF_Vars& vars, const i64
 
     M_hv(0, 0) = 1.0f;
     M_hv(1, 0) = vars.x_a / pow(vars.r_a, 2);
-    #ifdef COUPLED_NOAERO_NOHEAVE
-    M_hv(0, 1) = 0.0f;
-    #else
     M_hv(0, 1) = vars.x_a;
-    #endif
     M_hv(1, 1) = 1.0f;
 
     C_hv(0, 0) = 2.0f * vars.zeta_h * (vars.omega / vars.U_a);
@@ -494,8 +481,7 @@ void UVLM_2DOF::run(const Assembly& assembly, const UVLM_2DOF_Vars& vars, f32 t_
     const f32 dy = verts_first_wing(0, -1, 1) - verts_first_wing(0, -2, 1);
     const f32 dz = verts_first_wing(0, -1, 2) - verts_first_wing(0, -2, 2);
     const f32 last_panel_chord = std::sqrt(dx*dx + dy*dy + dz*dz);
-    // const f32 dt_nd = last_panel_chord / linalg::length(assembly.kinematics()->linear_velocity(0.0f, {0.f, 0.f, 0.f}));
-    const f32 dt_nd = last_panel_chord;
+    const f32 dt_nd = last_panel_chord; // nd freestream = 1
     const i64 t_steps = static_cast<i64>(t_final_nd / dt_nd);
 
     std::cout << "dt_nd: " << dt_nd << "\n";
@@ -661,13 +647,6 @@ void UVLM_2DOF::run(const Assembly& assembly, const UVLM_2DOF_Vars& vars, f32 t_
                 dt_nd // dimensionless dt
             );
 
-            // NOTE: this doesnt work on the GPU backend
-            #ifdef COUPLED_NOAERO_NOHEAVE
-            du.view()(0) = 0.0f;
-            dv.view()(0) = 0.0f;
-            da.view()(0) = 0.0f;
-            #endif
-
             du.view().to(du_h.view());
 
             u_d.view().slice(All, i).to(u_d.view().slice(All, i+1));
@@ -875,7 +854,7 @@ int main() {
 
     const f32 flutter_speed = 6.285f;
     const f32 flutter_ratio = 0.2f;
-    const f32 t_final_nd = 1000.f;
+    const f32 t_final_nd = 100.f;
 
     UVLM_2DOF_Vars vars;
     vars.b = 0.5f;

diff --git a/tests/uvlm_theodorsen.cpp b/tests/uvlm_theodorsen.cpp
@@ -15,7 +15,7 @@ using namespace vlm;
 using namespace linalg::ostream_overloads;
 
 int main() {
-    const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_5x10.x"};
+    const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_10x5.x"};
     const std::vector<std::string> backends = get_available_backends();
 
     auto solvers = tiny::make_combination(meshes, backends);

diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -200,6 +200,7 @@ void CPU_BLAS::axpy(const f32 alpha, const TensorView<f32, 2, Location::Device>&
 }
 
 BackendCPU::BackendCPU() : Backend(create_memory_manager(), create_blas()) {
+    name = "CPU";
     print_cpu_info();
 }
 
@@ -499,10 +500,7 @@ linalg::float3 BackendCPU::coeff_cm(
             const linalg::float3 v1 = {verts_wing(i+1, j, 0), verts_wing(i+1, j, 1), verts_wing(i+1, j, 2)}; // right leading vortex line
             const linalg::float3 force = {forces(i, j, 0), forces(i, j, 1), forces(i, j, 2)};
 
-            // const linalg::float3 dst_to_ref = ref_pt - 0.5f * (v0 + v1);
-            // cm += linalg::cross(force, dst_to_ref);
-
-            const linalg::float3 f_applied = 0.5f * (v0 + v1);
+            const linalg::float3 f_applied = 0.5f * (v0 + v1); // force applied at the center of leading edge vortex line
             const linalg::float3 lever = f_applied - ref_pt;
             cm += linalg::cross(lever, force);
         }

diff --git a/vlm/backends/cuda/include/vlm_backend_cuda.hpp b/vlm/backends/cuda/include/vlm_backend_cuda.hpp
@@ -15,11 +15,37 @@ class BackendCUDA final : public Backend {
         void rhs_assemble_wake_influence(TensorView<f32, 1, Location::Device>& rhs, const MultiTensorView2D<Location::Device>& gamma_wake, const MultiTensorView3D<Location::Device>& colloc, const MultiTensorView3D<Location::Device>& normals, const MultiTensorView3D<Location::Device>& verts_wake, i32 iteration) override;
         void displace_wake_rollup(MultiTensorView3D<Location::Device>& wake_rollup, const MultiTensorView3D<Location::Device>& verts_wake, const MultiTensorView3D<Location::Device>& verts_wing, const MultiTensorView2D<Location::Device>& gamma_wing, const MultiTensorView2D<Location::Device>& gamma_wake, f32 dt, i32 iteration) override;
 
-        // Per mesh kernels
+        // TODO: deprecate
         f32 coeff_steady_cl_single(const TensorView3D<Location::Device>& verts_wing, const TensorView2D<Location::Device>& gamma_delta, const FlowData& flow, f32 area) override;
         f32 coeff_steady_cd_single(const TensorView3D<Location::Device>& verts_wake, const TensorView2D<Location::Device>& gamma_wake, const FlowData& flow, f32 area) override;
-        f32 coeff_unsteady_cl_single(const TensorView3D<Location::Device>& verts_wing, const TensorView2D<Location::Device>& gamma_delta, const TensorView2D<Location::Device>& gamma, const TensorView2D<Location::Device>& gamma_prev, const TensorView3D<Location::Device>& local_velocities, const TensorView2D<Location::Device>& areas, const TensorView3D<Location::Device>& normals, const linalg::float3& freestream, f32 dt, f32 area) override;
-        void coeff_unsteady_cl_single_forces(const TensorView3D<Location::Device>& verts_wing, const TensorView2D<Location::Device>& gamma_delta, const TensorView2D<Location::Device>& gamma, const TensorView2D<Location::Device>& gamma_prev, const TensorView3D<Location::Device>& velocities, const TensorView2D<Location::Device>& areas, const TensorView3D<Location::Device>& normals, TensorView3D<Location::Device>& forces, const linalg::float3& freestream, f32 dt) override;
+
+        void forces_unsteady(
+            const TensorView3D<Location::Device>& verts_wing,
+            const TensorView2D<Location::Device>& gamma_delta,
+            const TensorView2D<Location::Device>& gamma,
+            const TensorView2D<Location::Device>& gamma_prev,
+            const TensorView3D<Location::Device>& velocities,
+            const TensorView2D<Location::Device>& areas,
+            const TensorView3D<Location::Device>& normals,
+            const TensorView3D<Location::Device>& forces,
+            f32 dt
+        ) override;
+        f32 coeff_cl(
+            const TensorView3D<Location::Device>& forces,
+            const linalg::float3& lift_axis,
+            const linalg::float3& freestream,
+            const f32 rho,
+            const f32 area
+        ) override;
+        linalg::float3 coeff_cm(
+            const TensorView3D<Location::Device>& forces,
+            const TensorView3D<Location::Device>& verts_wing,
+            const linalg::float3& ref_pt,
+            const linalg::float3& freestream,
+            const f32 rho,
+            const f32 area,
+            const f32 mac
+        ) override;
 
         void mesh_metrics(const f32 alpha_rad, const MultiTensorView3D<Location::Device>& verts_wing, MultiTensorView3D<Location::Device>& colloc, MultiTensorView3D<Location::Device>& normals, MultiTensorView2D<Location::Device>& areas) override;
         f32 mesh_mac(const TensorView3D<Location::Device>& verts_wing, const TensorView2D<Location::Device>& areas) override;
@@ -31,6 +57,17 @@ class BackendCUDA final : public Backend {
         // std::unique_ptr<Kernels> create_kernels() override;
         std::unique_ptr<LU> create_lu_solver() override;
         std::unique_ptr<BLAS> create_blas() override;
+
+        // Intermediate values for reduction
+        // Still not certain if this is the best way
+        // Currently it makes the functions that use these values not thread safe
+        // So multiple instances of the same function cannot run in parallel
+        f32* d_cl;
+        f32* d_cd;
+        f32* d_cm_x;
+        f32* d_cm_y;
+        f32* d_cm_z;
+        f32* d_mac;
 };
 
 } // namespace vlm