flatironinstitute · chaithyagr · Nov 26, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
@@ -12,6 +12,22 @@
 namespace finufft {
 namespace utils {
 
+// Make `targetVector` alias the caller-owned buffer [sourceArray, sourceArray + arraySize)
+// without copying, by overwriting the vector's internal begin / end / end-of-storage
+// pointers. Afterwards size() == capacity() == arraySize and data() == sourceArray.
+//
+// WARNING: this is a non-portable hack.
+//  * It reaches into libstdc++-private members (std::_Vector_base<...>::_Vector_impl,
+//    _M_start, _M_finish, _M_end_of_storage); it only builds against libstdc++ and is
+//    formally undefined behaviour even there.
+//  * The previous pointers are overwritten, not deallocated: any storage the vector
+//    owned before this call is leaked, so pass a vector that owns nothing.
+//  * The vector does NOT own sourceArray. Call releaseVectorWrapper() on it before it
+//    is destroyed, resized or reassigned, otherwise the vector's allocator would be
+//    asked to free memory it never allocated.
+//
+// sourceArray  : start of the external buffer to expose through the vector
+// arraySize    : number of elements of type T in that buffer
+// targetVector : vector whose internal pointers are redirected (modified in place)
+template <class T>
+void wrapArrayInVector( T *sourceArray, size_t arraySize, std::vector<T, xsimd::aligned_allocator<T, 64> > &targetVector ) {
+  using VectorType = std::vector<T, xsimd::aligned_allocator<T, 64> >;
+  using VectorImpl = typename std::_Vector_base<T, xsimd::aligned_allocator<T, 64> >::_Vector_impl;
+  // The cast below is only meaningful if the vector object consists of exactly the
+  // three-pointer implementation struct; fail at compile time rather than corrupt
+  // memory when that does not hold (e.g. debug-mode containers).
+  static_assert(sizeof(VectorType) == sizeof(VectorImpl),
+                "wrapArrayInVector: unexpected std::vector layout");
+  auto *vectorPtr = reinterpret_cast<VectorImpl *>(&targetVector);
+  vectorPtr->_M_start = sourceArray;
+  vectorPtr->_M_finish = vectorPtr->_M_end_of_storage = sourceArray + arraySize;
+}
+
+// Detach `targetVector` from a buffer it does not own by nulling its internal
+// begin / end / end-of-storage pointers, leaving it empty with zero capacity so that
+// its destructor (or a later resize) will not try to free that buffer.
+//
+// WARNING: nothing is deallocated here. Only call this on a vector whose storage is
+// owned by someone else (e.g. one set up by wrapping a raw array); calling it on a
+// vector that owns its storage leaks that storage. Like the wrapping helper, this
+// depends on libstdc++-private members and is not portable to other standard libraries.
+//
+// targetVector : vector to reset to the empty, storage-less state (modified in place)
+template <class T>
+void releaseVectorWrapper( std::vector<T, xsimd::aligned_allocator<T, 64> > &targetVector ) {
+  using VectorType = std::vector<T, xsimd::aligned_allocator<T, 64> >;
+  using VectorImpl = typename std::_Vector_base<T, xsimd::aligned_allocator<T, 64> >::_Vector_impl;
+  // Guard the layout assumption behind the cast: the vector object must be exactly
+  // the three-pointer implementation struct.
+  static_assert(sizeof(VectorType) == sizeof(VectorImpl),
+                "releaseVectorWrapper: unexpected std::vector layout");
+  auto *vectorPtr = reinterpret_cast<VectorImpl *>(&targetVector);
+  vectorPtr->_M_start = vectorPtr->_M_finish = vectorPtr->_M_end_of_storage = nullptr;
+}
+
 // ahb's low-level array helpers
 template<typename T>
 FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, const std::complex<T> *a,

diff --git a/include/finufft_opts.h b/include/finufft_opts.h
@@ -12,6 +12,7 @@
   int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
                //                  1 FFT-style mode order
   int chkbnds; // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (<few % slower)
+  int spreadinterponly; // 0 (default) do full NUFFT, 1 just spread/interp
 
   // diagnostic opts...
   int debug;        // 0 silent, 1 some timing/debug, or 2 more
@@ -23,7 +24,7 @@
  int fftw;               // plan flags to FFTW (FFTW_ESTIMATE=64, FFTW_MEASURE=0,...)
  int spread_sort;        // spreader: 0 don't sort, 1 do, or 2 heuristic choice
  int spread_kerevalmeth; // spreader: 0 exp(sqrt()), 1 Horner piecewise poly (faster)
  int spread_kerpad;      // (exp(sqrt()) only): 0 don't pad kernel to 4n, 1 do
  double upsampfac;       // upsampling ratio sigma: 2.0 std, 1.25 small FFT, 0.0 auto
  int spread_thread;      // (vectorized ntr>1 only): 0 auto, 1 seq multithreaded,
                          //                          2 parallel single-thread spread

diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py
@@ -73,6 +73,7 @@ class FinufftOpts(ctypes.Structure):
 
 FinufftOpts._fields_ = [('modeord', c_int),
                       ('chkbnds', c_int),
+                      ('spreadinterponly', c_int),
                       ('debug', c_int),
                       ('spread_debug', c_int),
                       ('showwarn', c_int),

diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
@@ -530,6 +530,7 @@ void finufft_default_opts_t(finufft_opts *o)
   o->spread_kerpad      = 1;
   o->upsampfac          = 0.0;
   o->spread_thread      = 0;
+  o->spreadinterponly = 0;
   o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
@@ -560,8 +561,9 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
     printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
            __func__);
 
-  fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
-      opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data);
+  if (!opts.spreadinterponly) // Dont make plans if only spread or interpolate
+    fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
+        opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data);
 
   if ((type != 1) && (type != 2) && (type != 3)) {
     fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
@@ -668,66 +670,72 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
                 __func__, (double)(EPSILON * mu));
     }
 
-    // determine fine grid sizes, sanity check..
     int nfier = set_nf_type12(ms, opts, spopts, &nf1);
     if (nfier) throw nfier; // nf too big; we're done
-    phiHat1.resize(nf1 / 2 + 1);
     if (dim > 1) {
       nfier = set_nf_type12(mt, opts, spopts, &nf2);
       if (nfier) throw nfier;
-      phiHat2.resize(nf2 / 2 + 1);
     }
     if (dim > 2) {
       nfier = set_nf_type12(mu, opts, spopts, &nf3);
       if (nfier) throw nfier;
-      phiHat3.resize(nf3 / 2 + 1);
-    }
-
-    if (opts.debug) { // "long long" here is to avoid warnings with printf...
-      printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
-             "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d "
-             "batchSize=%d ",
-             __func__, dim, type, (long long)ms, (long long)mt, (long long)mu,
-             (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize);
-      if (batchSize == 1) // spread_thread has no effect in this case
-        printf("\n");
-      else
-        printf(" spread_thread=%d\n", opts.spread_thread);
     }
+    if(!opts.spreadinterponly) // We dont need fseries if it is spreadinterponly.
+    {
+      // determine fine grid sizes, sanity check..
+      phiHat1.resize(nf1 / 2 + 1);
+      if (dim > 1) {
+        phiHat2.resize(nf2 / 2 + 1);
+      }
+      if (dim > 2) {
+        phiHat3.resize(nf3 / 2 + 1);
+      }
 
-    // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
-    CNTime timer;
-    timer.start();
-    onedim_fseries_kernel(nf1, phiHat1, spopts);
-    if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts);
-    if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts);
-    if (opts.debug)
-      printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread,
-             timer.elapsedsec());
+      if (opts.debug) { // "long long" here is to avoid warnings with printf...
+        printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
+               "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d "
+               "batchSize=%d ",
+               __func__, dim, type, (long long)ms, (long long)mt, (long long)mu,
+               (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize);
+        if (batchSize == 1) // spread_thread has no effect in this case
+          printf("\n");
+        else
+          printf(" spread_thread=%d\n", opts.spread_thread);
+      }
 
-    nf = nf1 * nf2 * nf3; // fine grid total number of points
-    if (nf * batchSize > MAX_NF) {
-      fprintf(
-          stderr,
-          "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n",
-          __func__);
-      throw int(FINUFFT_ERR_MAXNALLOC);
+      // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
+      CNTime timer;
+      timer.start();
+      onedim_fseries_kernel(nf1, phiHat1, spopts);
+      if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts);
+      if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts);
+      if (opts.debug)
+        printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread,
+               timer.elapsedsec());
+
+      nf = nf1 * nf2 * nf3; // fine grid total number of points
+      if (nf * batchSize > MAX_NF) {
+        fprintf(
+            stderr,
+            "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n",
+            __func__);
+        throw int(FINUFFT_ERR_MAXNALLOC);
+      }  
+
+      timer.restart();
+      fwBatch.resize(nf * batchSize); // the big workspace
+      if (opts.debug)
+        printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
+               (double)1E-09 * sizeof(std::complex<TF>) * nf * batchSize,
+               timer.elapsedsec());
+
+      timer.restart(); // plan the FFTW
+      const auto ns = gridsize_for_fft(this);
+      fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
+      if (opts.debug)
+        printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
+               timer.elapsedsec());
     }
-
-    timer.restart();
-    fwBatch.resize(nf * batchSize); // the big workspace
-    if (opts.debug)
-      printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
-             (double)1E-09 * sizeof(std::complex<TF>) * nf * batchSize,
-             timer.elapsedsec());
-
-    timer.restart(); // plan the FFTW
-    const auto ns = gridsize_for_fft(this);
-    fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
-    if (opts.debug)
-      printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
-             timer.elapsedsec());
-
   } else { // -------------------------- type 3 (no planning) ------------
 
     if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans);
@@ -1041,19 +1049,30 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
       // STEP 1: (varies by type)
       timer.restart();
       if (type == 1) { // type 1: spread NU pts X, weights cj, to fw grid
+        if (opts.spreadinterponly)
+          wrapArrayInVector(fkb, thisBatchSize*N, this->fwBatch);
         spreadinterpSortedBatch<TF>(thisBatchSize, this, cjb);
         t_sprint += timer.elapsedsec();
-      } else { //  type 2: amplify Fourier coeffs fk into 0-padded fw
+        // Stop here if it is spread interp only.
+        if (opts.spreadinterponly)
+        {
+          releaseVectorWrapper(this->fwBatch);
+          continue;
+        }
+      } else if(!opts.spreadinterponly) { //  type 2: amplify Fourier coeffs fk into 0-padded fw, but dont do it if it is spread interp only.
         deconvolveBatch<TF>(thisBatchSize, this, fkb);
         t_deconv += timer.elapsedsec();
       }
-
-      // STEP 2: call the FFT on this batch
-      timer.restart();
-      do_fft(this);
-      t_fft += timer.elapsedsec();
-      if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
-
+      if (!opts.spreadinterponly) // Do FFT only if its not spread interp only.
+      {
+        // STEP 2: call the FFT on this batch
+        timer.restart();
+        do_fft(this);
+        t_fft += timer.elapsedsec();
+        if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
+      }
+      else
+        wrapArrayInVector(fkb, thisBatchSize*N, this->fwBatch);
       // STEP 3: (varies by type)
       timer.restart();
       if (type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
@@ -1063,6 +1082,9 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
         spreadinterpSortedBatch<TF>(thisBatchSize, this, cjb);
         t_sprint += timer.elapsedsec();
       }
+      // Release the fwBatch vector to prevent double freeing of memory.
+      if(opts.spreadinterponly)
+        releaseVectorWrapper(this->fwBatch);
     } // ........end b loop
 
     if (opts.debug) { // report total times in their natural order...

diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
@@ -176,7 +176,7 @@
  return output;
 }

 template<typename T> FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept {
  constexpr auto alignment = T::arch_type::alignment();
  alignas(alignment) std::array<typename T::value_type, T::size> array{};
  vec.store_aligned(array.data());
@@ -218,7 +218,7 @@
   limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function
 */
 template<typename T>
 static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept {
  const T result = x * T(INV_2PI) + T(0.5);
  return (result - floor(result)) * T(N);
 }
@@ -279,7 +279,7 @@
 template<typename T, uint8_t w, uint8_t upsampfact,
         class simd_type =
             xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, w>()>> // aka ns
 static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(
    T *FINUFFT_RESTRICT ker, T x, const finufft_spread_opts &opts) noexcept
 /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
 x_j = x + j,  for j=0,..,w-1.  Thus x in [-w/2,-w/2+1].   w is aka ns.
@@ -330,7 +330,7 @@
    simd_type k_prev, k_sym{0};
    for (uint8_t i{0}, offset = offset_start; i < end_idx;
         i += simd_size, offset -= simd_size) {
      auto k_odd = [i]() constexpr noexcept {
        if constexpr (if_odd_degree) {
          return simd_type::load_aligned(padded_coeffs[0].data() + i);
        } else {
@@ -814,7 +814,7 @@
 template<uint8_t ns, uint8_t kerevalmeth, class T,
         class simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, ns>()>,
         typename... V>
 static FINUFFT_ALWAYS_INLINE auto ker_eval(
    T *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept {
  /* Utility function that allows to move the kernel evaluation outside the spreader for
     clarity
@@ -2160,7 +2160,7 @@
               upsampfac);
       return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
-    if (upsampfac <= 1.0) { // no digits would result
+    if (upsampfac < 1.0) { // no digits would result
       fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
               upsampfac);
       return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;