diff --git a/include/finufft_opts.h b/include/finufft_opts.h
index 372bbbb6f..d8334ffba 100644
--- a/include/finufft_opts.h
+++ b/include/finufft_opts.h
@@ -12,6 +12,7 @@ typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_o
   int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
                //                  1 FFT-style mode order
   int chkbnds; // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (<few % slower)
+  int spreadinterponly; // 0 (default) do full NUFFT, 1 just spread/interp
 
   // diagnostic opts...
   int debug;        // 0 silent, 1 some timing/debug, or 2 more
diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py
index 10af6829c..7b52b43e4 100644
--- a/python/finufft/finufft/_finufft.py
+++ b/python/finufft/finufft/_finufft.py
@@ -73,6 +73,7 @@ class FinufftOpts(ctypes.Structure):
 
 FinufftOpts._fields_ = [('modeord', c_int),
                       ('chkbnds', c_int),
+                      ('spreadinterponly', c_int),
                       ('debug', c_int),
                       ('spread_debug', c_int),
                       ('showwarn', c_int),
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
index 9cc45e7e4..ad4b8155f 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft_core.cpp
@@ -424,8 +424,7 @@ static void deconvolveshuffle3d(int dir, T prefac, std::vector<T> &ker1,
 // --------- batch helper functions for t1,2 exec: ---------------------------
 
 template<typename T>
-static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T<T> *p,
-                                   std::complex<T> *cBatch)
+static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T<T> *p, std::complex<T> *fwBatch, std::complex<T> *cBatch)
 /*
   Spreads (or interpolates) a batch of batchSize strength vectors in cBatch
   to (or from) the batch of fine working grids p->fwBatch, using the same set of
@@ -447,7 +446,7 @@ static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T<T> *p,
 #endif
 #pragma omp parallel for num_threads(nthr_outer)
   for (int i = 0; i < batchSize; i++) {
-    std::complex<T> *fwi = p->fwBatch.data() + i * p->nf; // start of i'th fw array in
+    std::complex<T> *fwi = fwBatch + i * p->nf; // start of i'th fw array in
                                                           // wkspace
     std::complex<T> *ci = cBatch + i * p->nj; // start of i'th c array in cBatch
     spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X,
@@ -530,6 +529,7 @@ void finufft_default_opts_t(finufft_opts *o)
   o->spread_kerpad      = 1;
   o->upsampfac          = 0.0;
   o->spread_thread      = 0;
+  o->spreadinterponly = 0;
   o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
@@ -560,8 +560,9 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
     printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
            __func__);
 
-  fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
-      opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data);
+  if (!opts.spreadinterponly) // Dont make plans if only spread or interpolate
+    fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
+        opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data);
 
   if ((type != 1) && (type != 2) && (type != 3)) {
     fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
@@ -668,66 +669,72 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
                 __func__, (double)(EPSILON * mu));
     }
 
-    // determine fine grid sizes, sanity check..
     int nfier = set_nf_type12(ms, opts, spopts, &nf1);
     if (nfier) throw nfier; // nf too big; we're done
-    phiHat1.resize(nf1 / 2 + 1);
     if (dim > 1) {
       nfier = set_nf_type12(mt, opts, spopts, &nf2);
       if (nfier) throw nfier;
-      phiHat2.resize(nf2 / 2 + 1);
     }
     if (dim > 2) {
       nfier = set_nf_type12(mu, opts, spopts, &nf3);
       if (nfier) throw nfier;
-      phiHat3.resize(nf3 / 2 + 1);
-    }
-
-    if (opts.debug) { // "long long" here is to avoid warnings with printf...
-      printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
-             "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d "
-             "batchSize=%d ",
-             __func__, dim, type, (long long)ms, (long long)mt, (long long)mu,
-             (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize);
-      if (batchSize == 1) // spread_thread has no effect in this case
-        printf("\n");
-      else
-        printf(" spread_thread=%d\n", opts.spread_thread);
     }
+    if(!opts.spreadinterponly) // We dont need fseries if it is spreadinterponly.
+    {
+      // determine fine grid sizes, sanity check..
+      phiHat1.resize(nf1 / 2 + 1);
+      if (dim > 1) {
+        phiHat2.resize(nf2 / 2 + 1);
+      }
+      if (dim > 2) {
+        phiHat3.resize(nf3 / 2 + 1);
+      }
 
-    // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
-    CNTime timer;
-    timer.start();
-    onedim_fseries_kernel(nf1, phiHat1, spopts);
-    if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts);
-    if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts);
-    if (opts.debug)
-      printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread,
-             timer.elapsedsec());
+      if (opts.debug) { // "long long" here is to avoid warnings with printf...
+        printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) "
+               "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d "
+               "batchSize=%d ",
+               __func__, dim, type, (long long)ms, (long long)mt, (long long)mu,
+               (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize);
+        if (batchSize == 1) // spread_thread has no effect in this case
+          printf("\n");
+        else
+          printf(" spread_thread=%d\n", opts.spread_thread);
+      }
 
-    nf = nf1 * nf2 * nf3; // fine grid total number of points
-    if (nf * batchSize > MAX_NF) {
-      fprintf(
-          stderr,
-          "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n",
-          __func__);
-      throw int(FINUFFT_ERR_MAXNALLOC);
+      // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
+      CNTime timer;
+      timer.start();
+      onedim_fseries_kernel(nf1, phiHat1, spopts);
+      if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts);
+      if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts);
+      if (opts.debug)
+        printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread,
+               timer.elapsedsec());
+
+      nf = nf1 * nf2 * nf3; // fine grid total number of points
+      if (nf * batchSize > MAX_NF) {
+        fprintf(
+            stderr,
+            "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n",
+            __func__);
+        throw int(FINUFFT_ERR_MAXNALLOC);
+      }  
+      
+      timer.restart();
+      fwBatch.resize(nf * batchSize); // the big workspace
+      if (opts.debug)
+        printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
+               (double)1E-09 * sizeof(std::complex<TF>) * nf * batchSize,
+               timer.elapsedsec());
+
+      timer.restart(); // plan the FFTW
+      const auto ns = gridsize_for_fft(this);
+      fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
+      if (opts.debug)
+        printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
+               timer.elapsedsec());
     }
-
-    timer.restart();
-    fwBatch.resize(nf * batchSize); // the big workspace
-    if (opts.debug)
-      printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
-             (double)1E-09 * sizeof(std::complex<TF>) * nf * batchSize,
-             timer.elapsedsec());
-
-    timer.restart(); // plan the FFTW
-    const auto ns = gridsize_for_fft(this);
-    fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
-    if (opts.debug)
-      printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft,
-             timer.elapsedsec());
-
   } else { // -------------------------- type 3 (no planning) ------------
 
     if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans);
@@ -1041,28 +1048,33 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
       // STEP 1: (varies by type)
       timer.restart();
       if (type == 1) { // type 1: spread NU pts X, weights cj, to fw grid
-        spreadinterpSortedBatch<TF>(thisBatchSize, this, cjb);
+        spreadinterpSortedBatch<TF>(thisBatchSize, this, opts.spreadinterponly? fkb: this->fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
-      } else { //  type 2: amplify Fourier coeffs fk into 0-padded fw
+        // Stop here if it is spread interp only.
+        if (opts.spreadinterponly)
+          continue;
+      } else if(!opts.spreadinterponly) { //  type 2: amplify Fourier coeffs fk into 0-padded fw, but dont do it if it is spread interp only.
         deconvolveBatch<TF>(thisBatchSize, this, fkb);
         t_deconv += timer.elapsedsec();
       }
-
-      // STEP 2: call the FFT on this batch
-      timer.restart();
-      do_fft(this);
-      t_fft += timer.elapsedsec();
-      if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
-
+      if (!opts.spreadinterponly) // Do FFT only if its not spread interp only.
+      {
+        // STEP 2: call the FFT on this batch
+        timer.restart();
+        do_fft(this);
+        t_fft += timer.elapsedsec();
+        if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
+      }
       // STEP 3: (varies by type)
       timer.restart();
       if (type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
         deconvolveBatch<TF>(thisBatchSize, this, fkb);
         t_deconv += timer.elapsedsec();
       } else { // type 2: interpolate unif fw grid to NU target pts
-        spreadinterpSortedBatch<TF>(thisBatchSize, this, cjb);
+        spreadinterpSortedBatch<TF>(thisBatchSize, this, opts.spreadinterponly? fkb: this->fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
       }
+      // Release the fwBatch vector to prevent double freeing of memory.
     } // ........end b loop
 
     if (opts.debug) { // report total times in their natural order...
@@ -1113,7 +1125,7 @@ int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
       // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
       timer.restart();
       spopts.spread_direction = 1;                                      // spread
-      spreadinterpSortedBatch<TF>(thisBatchSize, this, CpBatch.data()); // X are primed
+      spreadinterpSortedBatch<TF>(thisBatchSize, this, this->fwBatch.data(), CpBatch.data()); // X are primed
       t_spr += timer.elapsedsec();
 
       // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index f6cf925e0..be1abfeb0 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -2160,7 +2160,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps
               upsampfac);
       return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
-    if (upsampfac <= 1.0) { // no digits would result
+    if (upsampfac < 1.0) { // no digits would result
       fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
               upsampfac);
       return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;