Skip to content

Commit

Permalink
Merge pull request #402 from hiddenSymmetries/ag/pr_vectorized_second…
Browse files Browse the repository at this point in the history
…_derivative_volume

Ag/pr vectorized second derivative volume
  • Loading branch information
andrewgiuliani authored Apr 15, 2024
2 parents 807404a + d6fab6a commit 38a20cb
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 1 deletion.
140 changes: 139 additions & 1 deletion src/simsoptpp/surface.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "surface.h"
#include <Eigen/Dense>
#include "simdhelpers.h"
#include "vec3dsimd.h"


template<class Array>
Array surface_vjp_contraction(const Array& mat, const Array& v){
Expand Down Expand Up @@ -642,12 +645,145 @@ void Surface<Array>::dvolume_by_dcoeff_impl(Array& data) {
data(m) = temp(m);
}
}

#if defined(USE_XSIMD)

template<class Array>
void Surface<Array>::d2volume_by_dcoeffdcoeff_impl(Array& data) {
// this vectorized version of d2volume_by_dcoeffdcoeff computes the second derivative of
// the surface normal on the fly, which alleviates memory requirements.
constexpr int simd_size = xsimd::simd_type<double>::size;
data *= 0.;
auto nor = this->normal();
auto xyz = this->gamma();
auto dxyz_dc = this->dgamma_by_dcoeff();
auto dnor_dc = this->dnormal_by_dcoeff();
auto d2nor_dcdc = this->d2normal_by_dcoeffdcoeff();
auto dg1_dc = this->dgammadash1_by_dcoeff();
auto dg2_dc = this->dgammadash2_by_dcoeff();
int ndofs = num_dofs();

auto dg1x_dc = AlignedPaddedVec(ndofs, 0);
auto dg1y_dc = AlignedPaddedVec(ndofs, 0);
auto dg1z_dc = AlignedPaddedVec(ndofs, 0);

auto dg2x_dc = AlignedPaddedVec(ndofs, 0);
auto dg2y_dc = AlignedPaddedVec(ndofs, 0);
auto dg2z_dc = AlignedPaddedVec(ndofs, 0);

auto dnorx_dc = AlignedPaddedVec(ndofs, 0);
auto dnory_dc = AlignedPaddedVec(ndofs, 0);
auto dnorz_dc = AlignedPaddedVec(ndofs, 0);

auto dxyzx_dc = AlignedPaddedVec(ndofs, 0);
auto dxyzy_dc = AlignedPaddedVec(ndofs, 0);
auto dxyzz_dc = AlignedPaddedVec(ndofs, 0);

for (int i = 0; i < numquadpoints_phi; ++i) {
for (int j = 0; j < numquadpoints_theta; ++j) {
simd_t xyzij0(xyz(i,j,0));
simd_t xyzij1(xyz(i,j,1));
simd_t xyzij2(xyz(i,j,2));

// load the tangents, normals, and derivatives wrt to surface coeffs into aligned and padded memory
for (int n = 0; n < ndofs; ++n) {
dg1x_dc[n] = dg1_dc(i, j, 0, n);
dg1y_dc[n] = dg1_dc(i, j, 1, n);
dg1z_dc[n] = dg1_dc(i, j, 2, n);

dg2x_dc[n] = dg2_dc(i, j, 0, n);
dg2y_dc[n] = dg2_dc(i, j, 1, n);
dg2z_dc[n] = dg2_dc(i, j, 2, n);

dnorx_dc[n] = dnor_dc(i, j, 0, n);
dnory_dc[n] = dnor_dc(i, j, 1, n);
dnorz_dc[n] = dnor_dc(i, j, 2, n);

dxyzx_dc[n] = dxyz_dc(i, j, 0, n);
dxyzy_dc[n] = dxyz_dc(i, j, 1, n);
dxyzz_dc[n] = dxyz_dc(i, j, 2, n);
}

for (int m = 0; m < ndofs; ++m){
simd_t dg1_dc_ij0m(dg1_dc(i, j, 0, m));
simd_t dg1_dc_ij1m(dg1_dc(i, j, 1, m));
simd_t dg1_dc_ij2m(dg1_dc(i, j, 2, m));

simd_t dg2_dc_ij0m(dg2_dc(i, j, 0, m));
simd_t dg2_dc_ij1m(dg2_dc(i, j, 1, m));
simd_t dg2_dc_ij2m(dg2_dc(i, j, 2, m));

simd_t dxyz_dc_ij0m(dxyz_dc(i, j, 0, m));
simd_t dxyz_dc_ij1m(dxyz_dc(i, j, 1, m));
simd_t dxyz_dc_ij2m(dxyz_dc(i, j, 2, m));

simd_t dnor_dc_ij0m(dnor_dc(i, j, 0, m));
simd_t dnor_dc_ij1m(dnor_dc(i, j, 1, m));
simd_t dnor_dc_ij2m(dnor_dc(i, j, 2, m));

for (int n = 0; n < ndofs; n+=simd_size){
// load into aligned and padded memory into batches
simd_t dg1_dc_ij0n = xs::load_aligned(&dg1x_dc[n]);
simd_t dg1_dc_ij1n = xs::load_aligned(&dg1y_dc[n]);
simd_t dg1_dc_ij2n = xs::load_aligned(&dg1z_dc[n]);

simd_t dg2_dc_ij0n = xs::load_aligned(&dg2x_dc[n]);
simd_t dg2_dc_ij1n = xs::load_aligned(&dg2y_dc[n]);
simd_t dg2_dc_ij2n = xs::load_aligned(&dg2z_dc[n]);

simd_t dnor_dc_ij0n = xs::load_aligned(&dnorx_dc[n]);
simd_t dnor_dc_ij1n = xs::load_aligned(&dnory_dc[n]);
simd_t dnor_dc_ij2n = xs::load_aligned(&dnorz_dc[n]);

simd_t dxyz_dc_ij0n = xs::load_aligned(&dxyzx_dc[n]);
simd_t dxyz_dc_ij1n = xs::load_aligned(&dxyzy_dc[n]);
simd_t dxyz_dc_ij2n = xs::load_aligned(&dxyzz_dc[n]);

// compute d2nor_dcdc on the fly
//data(i, j, 0, m, n) = dg1_dc(i, j, 1, m)*dg2_dc(i, j, 2, n) - dg1_dc(i, j, 2, m)*dg2_dc(i, j, 1, n);
//data(i, j, 0, m, n) += dg1_dc(i, j, 1, n)*dg2_dc(i, j, 2, m) - dg1_dc(i, j, 2, n)*dg2_dc(i, j, 1, m);
//data(i, j, 1, m, n) = dg1_dc(i, j, 2, m)*dg2_dc(i, j, 0, n) - dg1_dc(i, j, 0, m)*dg2_dc(i, j, 2, n);
//data(i, j, 1, m, n) += dg1_dc(i, j, 2, n)*dg2_dc(i, j, 0, m) - dg1_dc(i, j, 0, n)*dg2_dc(i, j, 2, m);
//data(i, j, 2, m, n) = dg1_dc(i, j, 0, m)*dg2_dc(i, j, 1, n) - dg1_dc(i, j, 1, m)*dg2_dc(i, j, 0, n);
//data(i, j, 2, m, n) += dg1_dc(i, j, 0, n)*dg2_dc(i, j, 1, m) - dg1_dc(i, j, 1, n)*dg2_dc(i, j, 0, m);
auto d2nor_dcdc_ij0mn = xsimd::fms(dg1_dc_ij1m, dg2_dc_ij2n, dg1_dc_ij2m*dg2_dc_ij1n);
d2nor_dcdc_ij0mn += xsimd::fms(dg1_dc_ij1n, dg2_dc_ij2m, dg1_dc_ij2n*dg2_dc_ij1m);
auto d2nor_dcdc_ij1mn = xsimd::fms(dg1_dc_ij2m, dg2_dc_ij0n, dg1_dc_ij0m*dg2_dc_ij2n);
d2nor_dcdc_ij1mn += xsimd::fms(dg1_dc_ij2n, dg2_dc_ij0m, dg1_dc_ij0n*dg2_dc_ij2m);
auto d2nor_dcdc_ij2mn = xsimd::fms(dg1_dc_ij0m, dg2_dc_ij1n, dg1_dc_ij1m*dg2_dc_ij0n);
d2nor_dcdc_ij2mn += xsimd::fms(dg1_dc_ij0n, dg2_dc_ij1m, dg1_dc_ij1n*dg2_dc_ij0m);

// now compute d2volume_by_dcoeffdcoeff
//data(m,n) += (1./3) * (dxyz_dc(i,j,0,m)*dnor_dc(i,j,0,n)
// +dxyz_dc(i,j,1,m)*dnor_dc(i,j,1,n)
// +dxyz_dc(i,j,2,m)*dnor_dc(i,j,2,n));
//data(m,n) += (1./3) * (xyz(i,j,0)*d2nor_dcdc(i,j,0,m,n) + dxyz_dc(i,j,0,n) * dnor_dc(i,j,0,m)
// +xyz(i,j,1)*d2nor_dcdc(i,j,1,m,n) + dxyz_dc(i,j,1,n) * dnor_dc(i,j,1,m)
// +xyz(i,j,2)*d2nor_dcdc(i,j,2,m,n) + dxyz_dc(i,j,2,n) * dnor_dc(i,j,2,m));
auto temp = xsimd::fma(dxyz_dc_ij0m, dnor_dc_ij0n, dxyz_dc_ij1m*dnor_dc_ij1n);
auto data1 = (1./3) * xsimd::fma(dxyz_dc_ij2m, dnor_dc_ij2n, temp);
auto data2 = (1./3) * (xsimd::fma(xyzij0, d2nor_dcdc_ij0mn , dxyz_dc_ij0n * dnor_dc_ij0m)
+xsimd::fma(xyzij1, d2nor_dcdc_ij1mn , dxyz_dc_ij1n * dnor_dc_ij1m)
+xsimd::fma(xyzij2, d2nor_dcdc_ij2mn , dxyz_dc_ij2n * dnor_dc_ij2m) );

int jjlimit = std::min(simd_size, ndofs-n);
for(int jj=0; jj<jjlimit; jj++){
data(m, n+jj) += data1[jj]+data2[jj];
}
}
}
}
}
data *= 1./ (numquadpoints_phi*numquadpoints_theta);
}

#else

template<class Array>
void Surface<Array>::d2volume_by_dcoeffdcoeff_impl(Array& data) {
data *= 0.;
auto nor = this->normal();
auto dnor_dc = this->dnormal_by_dcoeff();
auto d2nor_dcdc = this->d2normal_by_dcoeffdcoeff(); // uses a lot of memory for moderate surface complexity
auto xyz = this->gamma();
auto dxyz_dc = this->dgamma_by_dcoeff();
int ndofs = num_dofs();
Expand All @@ -668,6 +804,8 @@ void Surface<Array>::d2volume_by_dcoeffdcoeff_impl(Array& data) {
data *= 1./ (numquadpoints_phi*numquadpoints_theta);
}

#endif

#include "xtensor-python/pyarray.hpp" // Numpy bindings
typedef xt::pyarray<double> Array;
template class Surface<Array>;
34 changes: 34 additions & 0 deletions tests/geo/test_surface_taylor.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,40 @@ def test_volume_coefficient_second_derivative(self):
s = get_surface(surfacetype, stellsym)
self.subtest_volume_coefficient_second_derivative(s)

def test_volume_coefficient_second_derivative2(self):
"""
Taylor test for the second derivative of the volume w.r.t. the dofs
"""

# defining a local function get_random_surface because it let's me control number of quadpoints, and modes
def get_random_surface(surfacetype, stellsym, mpol, ntor, nphi, ntheta):
from .surface_test_helpers import get_surface as get_surface_ext
s = get_surface_ext(surfacetype, stellsym, mpol=mpol, ntor=ntor, nphi=nphi, ntheta=ntheta)
dofs = s.get_dofs()
np.random.seed(2)
rand_scale = 0.01
s.x = dofs + rand_scale * np.random.rand(len(dofs))
print(surfacetype, stellsym, mpol, ntor, nphi, ntheta)
return s

for surfacetype in self.surfacetypes:
for stellsym in [True, False]:
with self.subTest(surfacetype=surfacetype, stellsym=stellsym):
s1 = get_random_surface(surfacetype, stellsym, mpol=1, ntor=1, nphi=1, ntheta=1)
self.subtest_volume_coefficient_second_derivative(s1)

s2 = get_random_surface(surfacetype, stellsym, mpol=1, ntor=1, nphi=1, ntheta=1)
self.subtest_volume_coefficient_second_derivative(s2)

s3 = get_random_surface(surfacetype, stellsym, mpol=1, ntor=1, nphi=2, ntheta=1)
self.subtest_volume_coefficient_second_derivative(s3)

s4 = get_random_surface(surfacetype, stellsym, mpol=1, ntor=1, nphi=1, ntheta=2)
self.subtest_volume_coefficient_second_derivative(s4)

s5 = get_random_surface(surfacetype, stellsym, mpol=4, ntor=4, nphi=31, ntheta=30)
self.subtest_volume_coefficient_second_derivative(s5)

def subtest_volume_coefficient_second_derivative(self, s):
coeffs = s.x
s.invalidate_cache()
Expand Down

0 comments on commit 38a20cb

Please sign in to comment.