
GPU performance improvements #488

Merged
merged 50 commits into master from gpu-optimizations on Aug 2, 2024
Commits (50), changes shown from 42 commits
45333fa
basic benchmarks
DiamonDinoia Jul 3, 2024
b95a082
added plotting script
DiamonDinoia Jul 4, 2024
ae55ca5
optimised plotting
DiamonDinoia Jul 8, 2024
16e27f0
fixed plotting and metrics
DiamonDinoia Jul 8, 2024
49d1f21
fixed the plot script
DiamonDinoia Jul 8, 2024
2fdae68
bin_size_x is a function of the shared memory available
DiamonDinoia Jul 8, 2024
c0d9923
bin_size_x is a function of the shared memory available
DiamonDinoia Jul 8, 2024
907797c
minor optimizations in 1D
DiamonDinoia Jul 9, 2024
60f4780
optimized nupts-driven
DiamonDinoia Jul 12, 2024
35dcc66
Optimized 1D and 2D
DiamonDinoia Jul 15, 2024
e1ad9bb
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 15, 2024
366295d
3D integer operations
DiamonDinoia Jul 18, 2024
24bf6be
3D SM and GM optimized
DiamonDinoia Jul 18, 2024
960117a
bump cuda version
DiamonDinoia Jul 18, 2024
4295a86
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
c1b14c6
changed matlab to generate necessary cuda upsampfact files
DiamonDinoia Jul 23, 2024
f300d2d
added new coeffs
DiamonDinoia Jul 23, 2024
e86c762
Merge remote-tracking branch 'refs/remotes/origin/gpu-optimizations' …
DiamonDinoia Jul 23, 2024
db0457a
restoring .m from master
DiamonDinoia Jul 23, 2024
d0ce11e
updated hook
DiamonDinoia Jul 23, 2024
513ce4b
updated matlab upsampfact
DiamonDinoia Jul 23, 2024
798717d
updated coefficients
DiamonDinoia Jul 23, 2024
282baf5
new coeffs
DiamonDinoia Jul 23, 2024
12822a2
updated cufinufft to new coeff
DiamonDinoia Jul 23, 2024
badf22f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
bf6328b
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 23, 2024
ae783da
picked good defaults for method
DiamonDinoia Jul 24, 2024
d29fcf5
update configuration
DiamonDinoia Jul 24, 2024
73f937b
updated build system
DiamonDinoia Jul 25, 2024
0724866
fixing jenkins
DiamonDinoia Jul 25, 2024
8cd50fc
using cuda 11.2
DiamonDinoia Jul 25, 2024
49a9d7e
using sm90 atomics
DiamonDinoia Jul 25, 2024
041a536
updated script
DiamonDinoia Jul 25, 2024
54683c3
fixed bin sizes
DiamonDinoia Jul 26, 2024
4f19103
Merge branch 'master' into gpu-optimizations
DiamonDinoia Jul 26, 2024
dc3a628
using floor in fold_rescale; updated changelog
DiamonDinoia Jul 26, 2024
b3237f7
fixed a mistake
DiamonDinoia Jul 26, 2024
db80aad
added comments for review
DiamonDinoia Jul 26, 2024
c225fb5
fixing review comments
DiamonDinoia Jul 31, 2024
394550f
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Jul 31, 2024
5606aa0
merged master
DiamonDinoia Jul 31, 2024
74ccd71
fixed cmake
DiamonDinoia Jul 31, 2024
ee28d05
GCC 9 fixes; kernel size fixed too
DiamonDinoia Aug 1, 2024
466ddff
Windows compatibility tweak; unit testing the 1.25 upsampfact
DiamonDinoia Aug 1, 2024
3f60ca4
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Aug 1, 2024
fb48ff8
added forgotten c++17 flag
DiamonDinoia Aug 1, 2024
5d7e276
Merge remote-tracking branch 'flatiron/master' into gpu-optimizations
DiamonDinoia Aug 2, 2024
afabb3f
Addressing review comments
DiamonDinoia Aug 2, 2024
c3df5e1
Added warning
DiamonDinoia Aug 2, 2024
44c523b
updated changelog
DiamonDinoia Aug 2, 2024
Files changed
10 changes: 10 additions & 0 deletions CHANGELOG
@@ -52,6 +52,16 @@ V 2.3.0beta (7/24/24)
 Added support for Windows (llvm, msvc), Linux (llvm, gcc) and MacOS (llvm, gcc).
 * cmake support for both ducc0 and fftw
 * cmake adding nvcc and msvc optimization flags
+* cuFINUFFT binsize is now a function of the shared memory available where
+  possible.
+* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort.
+* cuFINUFFT using the new normalized Horner coefficients and added support
+  for upsampfac=1.25.
+* cuFINUFFT new compile flags for extra vectorization, flushing single-precision
+  denormals to 0, and using fma where possible.
+* cuFINUFFT using intrinsics in foldrescale and other places to increase
+  performance.
+* cuFINUFFT using SM90 float2 vector atomicAdd where supported.
 
 V 2.2.0 (12/12/23)
 
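The last two changelog entries above lend themselves to short sketches. First, SM90 vector atomics: on compute capability 9.0 (with a recent CUDA toolkit), `atomicAdd` accepts a `float2`, so a complex single-precision accumulate needs one atomic instead of two. A minimal sketch, not the kernel this PR actually uses:

```cpp
__global__ void accumulate(float2 *grid, const float2 *vals, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
  atomicAdd(&grid[i % 64], vals[i]);     // one vector atomic per complex value
#else
  atomicAdd(&grid[i % 64].x, vals[i].x); // otherwise two scalar atomics
  atomicAdd(&grid[i % 64].y, vals[i].y);
#endif
}
```

Second, the floor-based foldrescale: folding a coordinate from any period of 2π into grid units [0, N) costs one multiply, one `floor`, and one more multiply. A hedged sketch of the idea; the exact cufinufft intrinsic usage may differ:

```cpp
template<typename T>
__device__ __forceinline__ T fold_rescale(T x, int N) {
  constexpr T inv_2pi = T(0.159154943091895336); // 1/(2*pi)
  const T s = x * inv_2pi + T(0.5);              // map [-pi, pi) to [0, 1)
  return (s - floor(s)) * T(N);                  // floor folds into [0, N)
}
```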
12 changes: 6 additions & 6 deletions devel/gen_all_horner_C_code.m
@@ -12,12 +12,12 @@
 
 for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4)
   fprintf('upsampfac = %g...\n',upsampfac)
 
   ws = 2:16;
-  opts.wpad = true; % pad kernel eval to multiple of 4
+  opts.wpad = false; % pad kernel eval to multiple of 4
 
-  if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
-  else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
+  if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w');
+  else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w');
   end
   fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
   fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -27,9 +27,9 @@
   fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta);
   str = gen_ker_horner_loop_C_code(w,d,beta,opts);
   if j==1 % write switch statement
-    fwrite(fid,sprintf('  if constexpr(w==%d) {\n',w));
+    fwrite(fid,sprintf('  if (w==%d) {\n',w));
   else
-    fwrite(fid,sprintf('  } else if constexpr(w==%d) {\n',w));
+    fwrite(fid,sprintf('  } else if (w==%d) {\n',w));
   end
   for i=1:numel(str); fwrite(fid,[' ',str{i}]); end
 end
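For context, a sketch of the shape of the code these loops emit (the fragment and its coefficients are illustrative, not generated output). Dropping `if constexpr` suggests `w` can be a runtime value on the consuming side, so the emitted dispatch becomes a plain `if`/`else if` chain:

```cpp
// Illustrative fragment of a generated .inc file; it is included inside an
// evaluator that has FLT, w, z, and out[] in scope. Values are made up.
if (w == 2) {
  constexpr FLT c0[] = {FLT(4.51E+01), FLT(4.51E+01)};
  constexpr FLT c1[] = {FLT(5.74E+01), FLT(-5.74E+01)};
  for (int i = 0; i < 2; i++) out[i] = c0[i] + z * c1[i];
} else if (w == 3) {
  // ... one branch per supported width, w = 2..16
}
```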
4 changes: 2 additions & 2 deletions devel/gen_ker_horner_loop_C_code.m
@@ -38,9 +38,9 @@
     width = w;
   end
   for n=1:d+1 % loop over poly coeff powers
-    s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
+    s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1));
     for i=2:width % loop over segments
-      s = sprintf('%s, %.16E', s, C(n,i));
+      s = sprintf('%s, %.16E', s, C(n,i));
     end
     str{n} = [s sprintf('};\n')];
   end
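The per-power arrays emitted here (`c0`, `c1`, ..., now `constexpr` so the compiler can bake them into the kernel image) are consumed via Horner's rule: for segment i, the kernel value is c0[i] + z*(c1[i] + z*(c2[i] + ...)). A sketch of such an evaluator; the function name and signature are illustrative, not library API:

```cpp
// Horner evaluation over generated tables: one fma per polynomial power.
template<typename FLT, int w, int d>
__device__ void eval_horner(FLT z, const FLT c[d + 1][w], FLT out[w]) {
  for (int i = 0; i < w; ++i) {      // loop over kernel segments
    FLT v = c[d][i];                 // start from the highest-order coefficient
    for (int n = d - 1; n >= 0; --n)
      v = fma(z, v, c[n][i]);        // v = v*z + c[n][i]
    out[i] = v;
  }
}
```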
32 changes: 32 additions & 0 deletions include/cufinufft/common.h
@@ -32,6 +32,38 @@ template<typename T>
 void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
                                    T *fwkerhalf, finufft_spread_opts opts);
 
+template<typename T>
+std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
+                                   int bin_size_z);
+
+template<typename T>
+void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts);
+
+template<typename T, typename V>
+auto cufinufft_set_shared_memory(V *kernel, const int dim,
+                                 const cufinufft_plan_t<T> &d_plan) {
+  int device_id;
+  cudaGetDevice(&device_id);
+  const auto shared_mem_required =
+      shared_memory_required<T>(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex,
+                                d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez);
+  int shared_mem_per_block{};
+  const auto err = cudaDeviceGetAttribute(
+      &shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+  if (err != cudaSuccess) {
+    return err;
+  }
+  if (shared_mem_required > shared_mem_per_block) {
+    fprintf(stderr,
+            "Error: Shared memory required per block is %zu bytes, but the device "
+            "supports only %d bytes.\n",
+            shared_mem_required, shared_mem_per_block);
+    return err;
+  }
+  return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                              shared_mem_required);
+}
+
 } // namespace common
 } // namespace cufinufft
 #endif
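A self-contained sketch of the opt-in pattern `cufinufft_set_shared_memory` wraps (the kernel and the 64 KiB tile size are invented for illustration): a kernel that needs more dynamic shared memory than the default per-block limit must first raise its cap with `cudaFuncSetAttribute`, up to the `cudaDevAttrMaxSharedMemoryPerBlockOptin` limit queried above.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel: stages data through dynamic shared memory.
__global__ void stage_through_smem(float *out) {
  extern __shared__ float tile[];
  tile[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = tile[threadIdx.x];
}

int main() {
  int device_id = 0, max_optin = 0;
  cudaGetDevice(&device_id);
  // Same attribute the helper queries: the opt-in per-block shared-memory limit.
  cudaDeviceGetAttribute(&max_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device_id);
  const size_t wanted = 64 * 1024; // ask for a 64 KiB tile
  if (wanted > static_cast<size_t>(max_optin)) {
    std::fprintf(stderr, "device supports only %d bytes per block\n", max_optin);
    return 1;
  }
  // Raise this kernel's dynamic shared-memory cap before launching with it.
  cudaFuncSetAttribute(stage_through_smem,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       static_cast<int>(wanted));
  float *out = nullptr;
  cudaMalloc(&out, 256 * sizeof(float));
  stage_through_smem<<<1, 256, wanted>>>(out);
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}
```

This is why the bin sizes (and hence `gpu_binsizex/y/z`) can now be derived from the device's shared-memory capacity rather than fixed constants, per the changelog above.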
15 changes: 8 additions & 7 deletions include/cufinufft/contrib/helper_cuda.h
@@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream,
   return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr);
 }
 
-#define RETURN_IF_CUDA_ERROR                                                         \
-  {                                                                                  \
-    cudaError_t err = cudaGetLastError();                                            \
-    if (err != cudaSuccess) {                                                        \
-      printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err));                 \
-      return FINUFFT_ERR_CUDA_FAILURE;                                               \
-    }                                                                                \
+#define RETURN_IF_CUDA_ERROR                                                         \
+  {                                                                                  \
+    cudaError_t err = cudaGetLastError();                                            \
+    if (err != cudaSuccess) {                                                        \
+      printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \
+             __FILE__, __LINE__);                                                    \
+      return FINUFFT_ERR_CUDA_FAILURE;                                               \
+    }                                                                                \
   }
 
 #define CUDA_FREE_AND_NULL(val, stream, pool_supported)                              \
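Call sites are unchanged; failures now report file and line as well. A hedged usage sketch with a hypothetical kernel, assuming this header is included:

```cpp
__global__ void dummy_kernel(float *out) { out[threadIdx.x] = 0.0f; }

// Any pending CUDA error from the launch becomes FINUFFT_ERR_CUDA_FAILURE.
static int launch_and_check(float *d_out, cudaStream_t stream) {
  dummy_kernel<<<1, 128, 0, stream>>>(d_out);
  RETURN_IF_CUDA_ERROR
  return 0;
}
```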