From 3618186bb1930168d24f8ec7f7cb09a6983c3cc6 Mon Sep 17 00:00:00 2001
From: Dan Riley <Daniel.Riley@cornell.edu>
Date: Mon, 11 Oct 2021 11:17:35 -0400
Subject: [PATCH] Reorganize SiStripClusterizerConditionsGPU to avoid alignment
 issues

Make the maximum-strips-per-cluster cut configurable at runtime for
both the GPU and CPU producers

Remove GPU option to keep large clusters truncated around the barycenter,
as it complicates the code for little apparent benefit.

Fix bugs in the application of the cluster threshold and the early
cutoff of clusters larger than the limit

Squash out all add/delete commits of the same file
---
 .../interface/SiStripClustersCUDA.h           |  16 +-
 .../SiStripCluster/src/SiStripClustersCUDA.cc |  18 +-
 .../SiStripClusterizerConditionsGPU.h         | 195 ++++++++++--------
 .../src/EventSetup_Registration.cc            |   2 +-
 .../src/SiStripClusterizerConditionsGPU.cc    | 150 ++++++++------
 .../interface/SiStripClustersSOA.h            |   2 +-
 .../interface/SiStripClustersSOABase.h        |   8 +-
 .../SiStripCluster/src/SiStripClustersSOA.cc  |   5 +-
 .../interface/ThreeThresholdAlgorithm.h       |   2 +
 .../plugins/ClustersFromRawProducer.cc        |   1 -
 .../plugins/ClustersFromRawProducerGPU.cc     |   4 +-
 ...StripClusterizerConditionsGPUESProducer.cc |   2 +
 .../plugins/SiStripClustersFromSOA.cc         |  15 +-
 .../plugins/SiStripClustersSOAtoHost.cc       |   3 +-
 .../plugins/SiStripRawToClusterGPUKernel.cc   |  27 ++-
 .../plugins/SiStripRawToClusterGPUKernel.cu   | 144 ++++---------
 .../plugins/SiStripRawToClusterGPUKernel.h    |  10 +-
 .../plugins/StripDataView.cuh                 |   1 -
 .../python/DefaultClusterizer_cff.py          |   1 +
 .../python/SiStripClusterizerOnDemand_cfi.py  |   2 +
 .../src/StripClusterizerAlgorithmFactory.cc   |   1 +
 .../src/ThreeThresholdAlgorithm.cc            |   4 +-
 22 files changed, 296 insertions(+), 317 deletions(-)
diff --git a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
index dc426a2d1e44b..19645b679f774 100644
--- a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
+++ b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
@@ -16,7 +16,7 @@ namespace cms {
 class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::device::unique_ptr> {
 public:
   SiStripClustersCUDADevice() = default;
-  explicit SiStripClustersCUDADevice(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);
+  explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
   ~SiStripClustersCUDADevice() override = default;
 
   SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;
@@ -34,23 +34,25 @@ class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::devic
     float *barycenter_;
     float *charge_;
     uint32_t nClusters_;
+    uint32_t maxClusterSize_;
   };
 
   DeviceView *view() const { return view_d.get(); }
-  int nClustersHost() const { return nClusters_h; }
-  int *nClustersHostPtr() { return &nClusters_h; }
+  uint32_t nClustersHost() const { return nClustersHost_; }
+  uint32_t *nClustersHostPtr() { return &nClustersHost_; }
+  uint32_t maxClusterSizeHost() const { return maxClusterSizeHost_; }
+  uint32_t *maxClusterSizeHostPtr() { return &maxClusterSizeHost_; }
 
 private:
   cms::cuda::device::unique_ptr<DeviceView> view_d;  // "me" pointer
-  int nClusters_h;
+  uint32_t nClustersHost_;
+  uint32_t maxClusterSizeHost_;
 };
 
 class SiStripClustersCUDAHost : public SiStripClustersSOABase<cms::cuda::host::unique_ptr> {
 public:
   SiStripClustersCUDAHost() = default;
-  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d,
-                                   int clustersPerStrip,
-                                   cudaStream_t stream);
+  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
   ~SiStripClustersCUDAHost() override = default;
 
   SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;
diff --git a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
index ff64229608038..c7a720eeaada4 100644
--- a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
+++ b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
@@ -1,10 +1,14 @@
 #include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
-SiStripClustersCUDADevice::SiStripClustersCUDADevice(size_t maxClusters, int clustersPerStrip, cudaStream_t stream) {
+SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
+                                                     uint32_t maxStripsPerCluster,
+                                                     cudaStream_t stream) {
+  maxClusterSizeHost_ = maxStripsPerCluster;
+
   clusterIndex_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
   clusterSize_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * clustersPerStrip, stream);
+  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * maxStripsPerCluster, stream);
   clusterDetId_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
   firstStrip_ = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);
   trueCluster_ = cms::cuda::make_device_unique<bool[]>(maxClusters, stream);
@@ -20,18 +24,18 @@ SiStripClustersCUDADevice::SiStripClustersCUDADevice(size_t maxClusters, int clu
   view->trueCluster_ = trueCluster_.get();
   view->barycenter_ = barycenter_.get();
   view->charge_ = charge_.get();
+  view->maxClusterSize_ = maxStripsPerCluster;
 
   view_d = cms::cuda::make_device_unique<DeviceView>(stream);
   cms::cuda::copyAsync(view_d, view, stream);
 }
 
-SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d,
-                                                 int clustersPerStrip,
-                                                 cudaStream_t stream) {
+SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
   nClusters_ = clusters_d.nClustersHost();
+  maxClusterSize_ = clusters_d.maxClusterSizeHost();
   clusterIndex_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
   clusterSize_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
-  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * clustersPerStrip, stream);
+  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * maxClusterSize_, stream);
   clusterDetId_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(nClusters_, stream);
   firstStrip_ = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(nClusters_, stream);
   trueCluster_ = cms::cuda::make_host_unique<bool[]>(nClusters_, stream);
@@ -40,7 +44,7 @@ SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice
 
   cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
   cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
-  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * clustersPerStrip, stream);
+  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * maxClusterSize_, stream);
   cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
   cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
   cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);
diff --git a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
index 0b6bed13fe35a..6505dd054c262 100644
--- a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
+++ b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
@@ -1,10 +1,14 @@
 #ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
 #define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
 
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
 
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
 class SiStripQuality;
 class SiStripGain;
 class SiStripNoises;
@@ -19,108 +23,117 @@ namespace stripgpu {
   static constexpr int kStripsPerFed = kChannelCount * kStripsPerChannel;
 
   __host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return fed - kFedFirst; }
-  __host__ __device__ inline stripId_t stripIndex(fedCh_t channel, stripId_t strip) {
-    return channel * kStripsPerChannel + (strip % kStripsPerChannel);
+  __host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * kStripsPerFed + channel * kStripsPerChannel + (strip % kStripsPerChannel);
   }
-  __host__ __device__ inline stripId_t apvIndex(fedCh_t channel, stripId_t strip) {
-    return channel * kStripsPerChannel + (strip % kStripsPerChannel) / 128;
+  __host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * kApvCount + 2 * channel + (strip % kStripsPerChannel) / 128;
+  }
+  __host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
+    return fedIndex(fed) * kChannelCount + channel;
   }
-}  // namespace stripgpu
 
-class SiStripClusterizerConditionsGPU {
-public:
-  class DetToFed {
+  class SiStripClusterizerConditionsGPU {
   public:
-    DetToFed(stripgpu::detId_t detid, stripgpu::APVPair_t ipair, stripgpu::fedId_t fedid, stripgpu::fedCh_t fedch)
-        : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
-    stripgpu::detId_t detID() const { return detid_; }
-    stripgpu::APVPair_t pair() const { return ipair_; }
-    stripgpu::fedId_t fedID() const { return fedid_; }
-    stripgpu::fedCh_t fedCh() const { return fedch_; }
-
-  private:
-    stripgpu::detId_t detid_;
-    stripgpu::APVPair_t ipair_;
-    stripgpu::fedId_t fedid_;
-    stripgpu::fedCh_t fedch_;
-  };
-  using DetToFeds = std::vector<DetToFed>;
+    class DetToFed {
+    public:
+      DetToFed(detId_t detid, APVPair_t ipair, fedId_t fedid, fedCh_t fedch)
+          : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
+      detId_t detID() const { return detid_; }
+      APVPair_t pair() const { return ipair_; }
+      fedId_t fedID() const { return fedid_; }
+      fedCh_t fedCh() const { return fedch_; }
+
+    private:
+      detId_t detid_;
+      APVPair_t ipair_;
+      fedId_t fedid_;
+      fedCh_t fedch_;
+    };
+    using DetToFeds = std::vector<DetToFed>;
 
-  struct Data {
     static constexpr std::uint16_t badBit = 1 << 15;
 
-    __host__ __device__ void setStrip(stripgpu::fedId_t fed,
-                                      stripgpu::fedCh_t channel,
-                                      stripgpu::stripId_t strip,
-                                      std::uint16_t noise,
-                                      float gain,
-                                      bool bad) {
-      gain_[stripgpu::fedIndex(fed)][stripgpu::apvIndex(channel, strip)] = gain;
-      noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] = noise;
+    class Data {
+    public:
+      struct DeviceView {
+        __device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
+          return detID_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline APVPair_t iPair(fedId_t fed, fedCh_t channel) const {
+          return iPair_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
+          return invthick_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);
+        }
+
+        __device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return gain_[apvIndex(fed, channel, strip)];
+        }
+
+        __device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);
+        }
+        const std::uint16_t* noise_;  //[kFedCount*kStripsPerFed];
+        const float* invthick_;       //[kFedCount*kChannelCount];
+        const detId_t* detID_;        //[kFedCount*kChannelCount];
+        const APVPair_t* iPair_;      //[kFedCount*kChannelCount];
+        const float* gain_;           //[kFedCount*kApvCount];
+      };
+
+      const DeviceView* deviceView() const { return deviceView_.get(); }
+
+      cms::cuda::device::unique_ptr<DeviceView> deviceView_;
+      cms::cuda::host::unique_ptr<DeviceView> hostView_;
+
+      cms::cuda::device::unique_ptr<std::uint16_t[]> noise_;  //[kFedCount*kStripsPerFed];
+      cms::cuda::device::unique_ptr<float[]> invthick_;       //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<detId_t[]> detID_;        //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<APVPair_t[]> iPair_;      //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<float[]> gain_;           //[kFedCount*kApvCount];
+    };
+
+    SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                    const SiStripGain* gains,
+                                    const SiStripNoises& noises);
+    ~SiStripClusterizerConditionsGPU() = default;
+
+    // Function to return the actual payload on the memory of the current device
+    Data const& getGPUProductAsync(cudaStream_t stream) const;
+
+    const DetToFeds& detToFeds() const { return detToFeds_; }
+
+  private:
+    void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
+      gain_[apvIndex(fed, channel, strip)] = gain;
+      noise_[stripIndex(fed, channel, strip)] = noise;
       if (bad) {
-        noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] |= badBit;
+        noise_[stripIndex(fed, channel, strip)] |= badBit;
       }
     }
 
-    __host__ __device__ void setInvThickness(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, float invthick) {
-      invthick_[stripgpu::fedIndex(fed)][channel] = invthick;
-    }
-
-    __host__ __device__ stripgpu::detId_t detID(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return detID_[stripgpu::fedIndex(fed)][channel];
-    }
-
-    __host__ __device__ stripgpu::APVPair_t iPair(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return iPair_[stripgpu::fedIndex(fed)][channel];
+    void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
+      invthick_[channelIndex(fed, channel)] = invthick;
     }
 
-    __host__ __device__ float invthick(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return invthick_[stripgpu::fedIndex(fed)][channel];
-    }
-
-    __host__ __device__ float noise(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return 0.1 * (noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] & !badBit);
-    }
-
-    __host__ __device__ float gain(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return gain_[stripgpu::fedIndex(fed)][stripgpu::apvIndex(channel, strip)];
-    }
-
-    __host__ __device__ bool bad(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return badBit == (noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] & badBit);
-    }
-
-    alignas(128) float gain_[stripgpu::kFedCount][stripgpu::kApvCount];
-    alignas(128) float invthick_[stripgpu::kFedCount][stripgpu::kChannelCount];
-    alignas(128) std::uint16_t noise_[stripgpu::kFedCount][stripgpu::kStripsPerFed];
-    alignas(128) stripgpu::detId_t detID_[stripgpu::kFedCount][stripgpu::kChannelCount];
-    alignas(128) stripgpu::APVPair_t iPair_[stripgpu::kFedCount][stripgpu::kChannelCount];
+    // Holds the data in pinned CPU memory
+    std::vector<std::uint16_t, cms::cuda::HostAllocator<std::uint16_t>> noise_;
+    std::vector<float, cms::cuda::HostAllocator<float>> invthick_;
+    std::vector<detId_t, cms::cuda::HostAllocator<detId_t>> detID_;
+    std::vector<APVPair_t, cms::cuda::HostAllocator<APVPair_t>> iPair_;
+    std::vector<float, cms::cuda::HostAllocator<float>> gain_;
+
+    // Helper that takes care of complexity of transferring the data to
+    // multiple devices
+    cms::cuda::ESProduct<Data> gpuData_;
+    DetToFeds detToFeds_;
   };
-
-  SiStripClusterizerConditionsGPU(const SiStripQuality& quality, const SiStripGain* gains, const SiStripNoises& noises);
-  ~SiStripClusterizerConditionsGPU();
-
-  // Function to return the actual payload on the memory of the current device
-  Data const* getGPUProductAsync(cudaStream_t stream) const;
-
-  const DetToFeds& detToFeds() const { return detToFeds_; }
-
-private:
-  // Holds the data in pinned CPU memory
-  Data* conditions_ = nullptr;
-
-  // Helper struct to hold all information that has to be allocated and
-  // deallocated per device
-  struct GPUData {
-    // Destructor should free all member pointers
-    ~GPUData();
-    Data* conditionsDevice = nullptr;
-  };
-
-  // Helper that takes care of complexity of transferring the data to
-  // multiple devices
-  cms::cuda::ESProduct<GPUData> gpuData_;
-  DetToFeds detToFeds_;
-};
+}  // namespace stripgpu
 
 #endif
diff --git a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
index f16361785f310..05530484f14c4 100644
--- a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
+++ b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
@@ -25,4 +25,4 @@ TYPELOOKUP_DATA_REG(SiStripQuality);
 TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);
 
 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
-TYPELOOKUP_DATA_REG(SiStripClusterizerConditionsGPU);
+TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);
diff --git a/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
index 29c33a8ed911f..2ca12c5e38579 100644
--- a/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
+++ b/CalibFormats/SiStripObjects/src/SiStripClusterizerConditionsGPU.cc
@@ -1,4 +1,5 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
 #include "CondFormats/SiStripObjects/interface/SiStripNoises.h"
 #include "CalibFormats/SiStripObjects/interface/SiStripGain.h"
@@ -8,77 +9,94 @@
 
 #include "DataFormats/SiStripCluster/interface/SiStripClusterTools.h"
 
-SiStripClusterizerConditionsGPU::SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
-                                                                 const SiStripGain* gains,
-                                                                 const SiStripNoises& noises) {
-  cudaCheck(cudaMallocHost(&conditions_, sizeof(Data)));
-  detToFeds_.clear();
-
-  // connected: map<DetID, std::vector<int>>
-  // map of KEY=detid DATA=vector of apvs, maximum 6 APVs per detector module :
-  const auto& connected = quality.cabling()->connected();
-  // detCabling: map<DetID, std::vector<const FedChannelConnection *>
-  // map of KEY=detid DATA=vector<FedChannelConnection>
-  const auto& detCabling = quality.cabling()->getDetCabling();
-
-  for (const auto& conn : connected) {
-    const auto det = conn.first;
-    if (!quality.IsModuleBad(det)) {
-      const auto detConn_it = detCabling.find(det);
-
-      if (detCabling.end() != detConn_it) {
-        for (const auto& chan : (*detConn_it).second) {
-          if (chan && chan->fedId() && chan->isConnected()) {
-            const auto detID = chan->detId();
-            const auto fedID = chan->fedId();
-            const auto fedCh = chan->fedCh();
-            const auto iPair = chan->apvPairNumber();
-
-            detToFeds_.emplace_back(detID, iPair, fedID, fedCh);
-
-            conditions_->detID_[stripgpu::fedIndex(fedID)][fedCh] = detID;
-            conditions_->iPair_[stripgpu::fedIndex(fedID)][fedCh] = iPair;
-            conditions_->setInvThickness(fedID, fedCh, siStripClusterTools::sensorThicknessInverse(detID));
-
-            auto offset = 256 * iPair;
-
-            for (auto strip = 0; strip < 256; ++strip) {
-              const auto gainRange = gains->getRange(det);
-
-              const auto detstrip = strip + offset;
-              const std::uint16_t noise = SiStripNoises::getRawNoise(detstrip, noises.getRange(det));
-              const auto gain = SiStripGain::getStripGain(detstrip, gainRange);
-              const auto bad = quality.IsStripBad(quality.getRange(det), detstrip);
-
-              // gain is actually stored per-APV, not per-strip
-              conditions_->setStrip(fedID, fedCh, strip, noise, gain, bad);
+namespace stripgpu {
+  SiStripClusterizerConditionsGPU::SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                                                   const SiStripGain* gains,
+                                                                   const SiStripNoises& noises)
+
+      : noise_(kFedCount * kStripsPerFed),
+        invthick_(kFedCount * kChannelCount),
+        detID_(kFedCount * kChannelCount),
+        iPair_(kFedCount * kChannelCount),
+        gain_(kFedCount * kApvCount) {
+    detToFeds_.clear();
+
+    // connected: map<DetID, std::vector<int>>
+    // map of KEY=detid DATA=vector of apvs, maximum 6 APVs per detector module :
+    const auto& connected = quality.cabling()->connected();
+    // detCabling: map<DetID, std::vector<const FedChannelConnection *>
+    // map of KEY=detid DATA=vector<FedChannelConnection>
+    const auto& detCabling = quality.cabling()->getDetCabling();
+
+    for (const auto& conn : connected) {
+      const auto det = conn.first;
+      if (!quality.IsModuleBad(det)) {
+        const auto detConn_it = detCabling.find(det);
+
+        if (detCabling.end() != detConn_it) {
+          for (const auto& chan : (*detConn_it).second) {
+            if (chan && chan->fedId() && chan->isConnected()) {
+              const auto detID = chan->detId();
+              const auto fedID = chan->fedId();
+              const auto fedCh = chan->fedCh();
+              const auto iPair = chan->apvPairNumber();
+
+              detToFeds_.emplace_back(detID, iPair, fedID, fedCh);
+
+              detID_[channelIndex(fedID, fedCh)] = detID;
+              iPair_[channelIndex(fedID, fedCh)] = iPair;
+              setInvThickness(fedID, fedCh, siStripClusterTools::sensorThicknessInverse(detID));
+
+              auto offset = 256 * iPair;
+
+              for (auto strip = 0; strip < 256; ++strip) {
+                const auto gainRange = gains->getRange(det);
+
+                const auto detstrip = strip + offset;
+                const std::uint16_t noise = SiStripNoises::getRawNoise(detstrip, noises.getRange(det));
+                const auto gain = SiStripGain::getStripGain(detstrip, gainRange);
+                const auto bad = quality.IsStripBad(quality.getRange(det), detstrip);
+
+                // gain is actually stored per-APV, not per-strip
+                setStrip(fedID, fedCh, detstrip, noise, gain, bad);
+              }
             }
           }
         }
       }
     }
-  }
 
-  std::sort(detToFeds_.begin(), detToFeds_.end(), [](const DetToFed& a, const DetToFed& b) {
-    return a.detID() < b.detID() || (a.detID() == b.detID() && a.pair() < b.pair());
-  });
-}
+    std::sort(detToFeds_.begin(), detToFeds_.end(), [](const DetToFed& a, const DetToFed& b) {
+      return a.detID() < b.detID() || (a.detID() == b.detID() && a.pair() < b.pair());
+    });
+  }
 
-SiStripClusterizerConditionsGPU::~SiStripClusterizerConditionsGPU() {
-  if (nullptr != conditions_) {
-    cudaCheck(cudaFreeHost(conditions_));
+  SiStripClusterizerConditionsGPU::Data const& SiStripClusterizerConditionsGPU::getGPUProductAsync(
+      cudaStream_t stream) const {
+    auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](Data& data, cudaStream_t stream) {
+      data.noise_ = cms::cuda::make_device_unique<std::uint16_t[]>(noise_.size(), stream);
+      data.invthick_ = cms::cuda::make_device_unique<float[]>(invthick_.size(), stream);
+      data.detID_ = cms::cuda::make_device_unique<detId_t[]>(detID_.size(), stream);
+      data.iPair_ = cms::cuda::make_device_unique<APVPair_t[]>(iPair_.size(), stream);
+      data.gain_ = cms::cuda::make_device_unique<float[]>(gain_.size(), stream);
+
+      cms::cuda::copyAsync(data.noise_, noise_, stream);
+      cms::cuda::copyAsync(data.invthick_, invthick_, stream);
+      cms::cuda::copyAsync(data.detID_, detID_, stream);
+      cms::cuda::copyAsync(data.iPair_, iPair_, stream);
+      cms::cuda::copyAsync(data.gain_, gain_, stream);
+
+      data.hostView_ = cms::cuda::make_host_unique<SiStripClusterizerConditionsGPU::Data::DeviceView>(stream);
+      data.hostView_->noise_ = data.noise_.get();
+      data.hostView_->invthick_ = data.invthick_.get();
+      data.hostView_->detID_ = data.detID_.get();
+      data.hostView_->iPair_ = data.iPair_.get();
+      data.hostView_->gain_ = data.gain_.get();
+
+      data.deviceView_ = cms::cuda::make_device_unique<SiStripClusterizerConditionsGPU::Data::DeviceView>(stream);
+      cms::cuda::copyAsync(data.deviceView_, data.hostView_, stream);
+    });
+
+    return data;
   }
-}
-
-SiStripClusterizerConditionsGPU::Data const* SiStripClusterizerConditionsGPU::getGPUProductAsync(
-    cudaStream_t stream) const {
-  auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) {
-    // Allocate the payload object on the device memory.
-    cudaCheck(cudaMalloc(&data.conditionsDevice, sizeof(Data)));
-    cudaCheck(cudaMemcpyAsync(data.conditionsDevice, conditions_, sizeof(Data), cudaMemcpyDefault, stream));
-  });
-  // Returns the payload object on the memory of the current device
-  return data.conditionsDevice;
-}
-
-SiStripClusterizerConditionsGPU::GPUData::~GPUData() { cudaCheck(cudaFree(conditionsDevice)); }
+}  // namespace stripgpu
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
index b277d365da38c..1d43b5f67ef64 100644
--- a/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOA.h
@@ -13,7 +13,7 @@ namespace detail {
 class SiStripClustersSOA : public SiStripClustersSOABase<detail::unique_ptr> {
 public:
   SiStripClustersSOA() = default;
-  explicit SiStripClustersSOA(size_t maxClusters, int clustersPerStrip);
+  explicit SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster);
   ~SiStripClustersSOA() override = default;
 
   SiStripClustersSOA(const SiStripClustersSOA &) = delete;
diff --git a/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
index dc97f262ad155..036ab7c3dd3e5 100644
--- a/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
+++ b/DataFormats/SiStripCluster/interface/SiStripClustersSOABase.h
@@ -9,10 +9,10 @@
 template <template <typename> class T>
 class SiStripClustersSOABase {
 public:
-  static constexpr uint32_t kClusterMaxStrips = 16;
+  //static constexpr uint32_t kClusterMaxStrips = 16;
 
   SiStripClustersSOABase() = default;
-  //explicit SiStripClustersSOABase(size_t maxClusters, int clustersPerStrip);
+  //explicit SiStripClustersSOABase(uint32_t maxClusters, uint32_t maxStripsPerCluster);
   virtual ~SiStripClustersSOABase() = default;
 
   SiStripClustersSOABase(const SiStripClustersSOABase&) = delete;
@@ -23,6 +23,9 @@ class SiStripClustersSOABase {
   void setNClusters(uint32_t nClusters) { nClusters_ = nClusters; }
   uint32_t nClusters() const { return nClusters_; }
 
+  void setMaxClusterSize(uint32_t maxClusterSize) { maxClusterSize_ = maxClusterSize; }
+  uint32_t maxClusterSize() const { return maxClusterSize_; }
+
   const auto& clusterIndex() const { return clusterIndex_; }
   const auto& clusterSize() const { return clusterSize_; }
   const auto& clusterADCs() const { return clusterADCs_; }
@@ -51,5 +54,6 @@ class SiStripClustersSOABase {
   T<float[]> barycenter_;
   T<float[]> charge_;
   uint32_t nClusters_;
+  uint32_t maxClusterSize_;
 };
 #endif
diff --git a/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
index 95ffc01168f36..d43d8509b5920 100644
--- a/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
+++ b/DataFormats/SiStripCluster/src/SiStripClustersSOA.cc
@@ -1,12 +1,13 @@
 #include "DataFormats/SiStripCluster/interface/SiStripClustersSOA.h"
 
-SiStripClustersSOA::SiStripClustersSOA(size_t maxClusters, int clustersPerStrip) {
+SiStripClustersSOA::SiStripClustersSOA(uint32_t maxClusters, uint32_t maxStripsPerCluster) {
   clusterIndex_ = std::make_unique<uint32_t[]>(maxClusters);
   clusterSize_ = std::make_unique<uint32_t[]>(maxClusters);
-  clusterADCs_ = std::make_unique<uint8_t[]>(maxClusters * clustersPerStrip);
+  clusterADCs_ = std::make_unique<uint8_t[]>(maxClusters * maxStripsPerCluster);
   clusterDetId_ = std::make_unique<stripgpu::detId_t[]>(maxClusters);
   firstStrip_ = std::make_unique<stripgpu::stripId_t[]>(maxClusters);
   trueCluster_ = std::make_unique<bool[]>(maxClusters);
   barycenter_ = std::make_unique<float[]>(maxClusters);
   charge_ = std::make_unique<float[]>(maxClusters);
+  maxClusterSize_ = maxStripsPerCluster;
 }
diff --git a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
index 6c9964dd2a43f..2c4551ef40550 100644
--- a/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
+++ b/RecoLocalTracker/SiStripClusterizer/interface/ThreeThresholdAlgorithm.h
@@ -35,6 +35,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
                           unsigned,
                           unsigned,
                           unsigned,
+                          unsigned,
                           bool removeApvShots,
                           float minGoodCharge);
 
@@ -58,6 +59,7 @@ class ThreeThresholdAlgorithm final : public StripClusterizerAlgorithm {
 
   float ChannelThreshold, SeedThreshold, ClusterThresholdSquared;
   uint8_t MaxSequentialHoles, MaxSequentialBad, MaxAdjacentBad;
+  unsigned MaxClusterSize;
   bool RemoveApvShots;
   float minGoodCharge;
 };
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
index ce456617dbc22..80175f31c5b7d 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducer.cc
@@ -250,7 +250,6 @@ void SiStripClusterizerFromRaw::run(const FEDRawDataCollection& rawColl, edmNew:
 
     if (record.empty())
       record.abort();
-
   }  // end loop over dets
 }
 
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
index 6caafabd2385f..16e099e9d029a 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/ClustersFromRawProducerGPU.cc
@@ -124,7 +124,7 @@ class SiStripClusterizerFromRawGPU final : public edm::stream::EDProducer<edm::E
   edm::EDGetTokenT<FEDRawDataCollection> inputToken_;
   edm::EDPutTokenT<cms::cuda::Product<SiStripClustersCUDADevice>> outputToken_;
 
-  edm::ESGetToken<SiStripClusterizerConditionsGPU, SiStripClusterizerConditionsGPURcd> conditionsToken_;
+  edm::ESGetToken<stripgpu::SiStripClusterizerConditionsGPU, SiStripClusterizerConditionsGPURcd> conditionsToken_;
   edm::ESGetToken<SiStripClusterizerConditions, SiStripClusterizerConditionsRcd> cpuConditionsToken_;
 };
 
@@ -142,7 +142,7 @@ void SiStripClusterizerFromRawGPU::fillDescriptions(edm::ConfigurationDescriptio
   clusterizer.add("MaxSequentialHoles", 0U);
   clusterizer.add("MaxSequentialBad", 1U);
   clusterizer.add("MaxAdjacentBad", 0U);
-  clusterizer.add("KeepLargeClusters", false);
+  clusterizer.add("MaxClusterSize", 16U);
 
   edm::ParameterSetDescription clusterChargeCut;
   clusterChargeCut.add("value", -1.0);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
index ab79270904a38..3add919b1a402 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClusterizerConditionsGPUESProducer.cc
@@ -18,6 +18,8 @@
 #include "CalibFormats/SiStripObjects/interface/SiStripDetCabling.h"
 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
 
+using namespace stripgpu;
+
 class SiStripClusterizerConditionsGPUESProducer : public edm::ESProducer {
 public:
   SiStripClusterizerConditionsGPUESProducer(const edm::ParameterSet&);
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
index 28e7cc17f46a1..59473437821cc 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersFromSOA.cc
@@ -56,7 +56,7 @@ class SiStripClustersFromSOA final : public edm::stream::EDProducer<> {
 
       while (i < nSeedStripsNC && detIDs[i] == detid) {
         if (trueCluster[i]) {
-          const auto size = std::min(clusterSize[i], SiStripClustersCUDADevice::kClusterMaxStrips);
+          const auto size = clusterSize[i];
           const auto firstStrip = stripIDs[i];
 
           adcs.clear();
@@ -69,19 +69,6 @@ class SiStripClustersFromSOA final : public edm::stream::EDProducer<> {
         }
         i++;
       }
-      //#define DSRDEBUG
-#ifdef DSRDEBUG
-      if (detid == 369120277) {
-        std::cout << "Printing clusters for detid " << detid << std::endl;
-        for (const auto& cluster : record) {
-          std::cout << "Cluster " << cluster.firstStrip() << ": ";
-          for (const auto& ampl : cluster.amplitudes()) {
-            std::cout << (int)ampl << " ";
-          }
-          std::cout << std::endl;
-        }
-      }
-#endif
     }
 
     output->shrink_to_fit();
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
index 5146ed1065268..8a4d0b1b4367f 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripClustersSOAtoHost.cc
@@ -23,8 +23,7 @@ class SiStripSOAtoHost {
 public:
   SiStripSOAtoHost() = default;
   void makeAsync(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
-    hostView_ =
-        std::make_unique<SiStripClustersCUDAHost>(clusters_d, SiStripClustersCUDAHost::kClusterMaxStrips, stream);
+    hostView_ = std::make_unique<SiStripClustersCUDAHost>(clusters_d, stream);
   }
   std::unique_ptr<SiStripClustersCUDAHost> getResults() { return std::move(hostView_); }
 
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
index 8a3679eca4709..2b3f1ce8e78cd 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cc
@@ -21,12 +21,12 @@ namespace stripgpu {
       : fedIndex_(stripgpu::kFedCount, stripgpu::invalidFed),
         channelThreshold_(conf.getParameter<double>("ChannelThreshold")),
         seedThreshold_(conf.getParameter<double>("SeedThreshold")),
-        clusterThresholdSquared_(conf.getParameter<double>("ClusterThreshold")),
+        clusterThresholdSquared_(std::pow(conf.getParameter<double>("ClusterThreshold"), 2.0f)),
         maxSequentialHoles_(conf.getParameter<unsigned>("MaxSequentialHoles")),
         maxSequentialBad_(conf.getParameter<unsigned>("MaxSequentialBad")),
         maxAdjacentBad_(conf.getParameter<unsigned>("MaxAdjacentBad")),
-        minGoodCharge_(clusterChargeCut(conf)),
-        keepLargeClusters_(conf.getParameter<bool>("KeepLargeClusters")) {
+        maxClusterSize_(conf.getParameter<unsigned>("MaxClusterSize")),
+        minGoodCharge_(clusterChargeCut(conf)) {
     fedRawDataOffsets_.reserve(stripgpu::kFedCount);
   }
 
@@ -118,14 +118,11 @@ namespace stripgpu {
     stripdata_ = std::make_unique<StripDataGPU>(max_strips, stream);
     const int max_seedstrips = kMaxSeedStrips;
 
-    auto condGPU = conditions.getGPUProductAsync(stream);
+    const auto& condGPU = conditions.getGPUProductAsync(stream);
 
-    unpackChannelsGPU(condGPU, stream);
+    unpackChannelsGPU(condGPU.deviceView(), stream);
 
-    fedRawDataGPU.reset();
-
-//#define VERIFY
-#ifdef VERIFY
+#ifdef EDM_ML_DEBUG
     auto outdata = cms::cuda::make_host_unique<uint8_t[]>(max_strips, stream);
     cms::cuda::copyAsync(outdata, stripdata_->alldataGPU_, max_strips, stream);
     cudaCheck(cudaStreamSynchronize(stream));
@@ -145,8 +142,9 @@ namespace stripgpu {
           aoff += 2;
           for (auto k = 0; k < groupLength; ++k, ++choff, ++aoff) {
             if (data[choff ^ 7] != outdata[aoff]) {
-              std::cout << "i:k " << i << ":" << k << " " << (uint32_t)data[choff ^ 7]
-                        << " != " << (uint32_t)outdata[aoff] << std::endl;
+              LogDebug("SiStripRawToClusterGPUKernel")
+                  << "Strip mismatch " << stripIndex << " i:k " << i << ":" << k << " " << (uint32_t)data[choff ^ 7]
+                  << " != " << (uint32_t)outdata[aoff] << std::endl;
             }
           }
         }
@@ -155,11 +153,12 @@ namespace stripgpu {
     outdata.reset(nullptr);
 #endif
 
+    fedRawDataGPU.reset();
     allocateSSTDataGPU(max_strips, stream);
-    setSeedStripsNCIndexGPU(condGPU, stream);
+    setSeedStripsNCIndexGPU(condGPU.deviceView(), stream);
 
-    clusters_d_ = SiStripClustersCUDADevice(max_seedstrips, kClusterMaxStrips, stream);
-    findClusterGPU(condGPU, stream);
+    clusters_d_ = SiStripClustersCUDADevice(max_seedstrips, maxClusterSize_, stream);
+    findClusterGPU(condGPU.deviceView(), stream);
 
     stripdata_.reset();
     chanlocsGPU_.reset();
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
index 8df09048b37b7..e124c73a14d34 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.cu
@@ -18,15 +18,16 @@
 #include "StripDataView.cuh"
 
 //#define GPU_DEBUG
-//#define GPU_CHECK
-//#define CENTERBARY
-#ifdef GPU_CHECK
+#ifdef EDM_ML_DEBUG
+#define GPU_CHECK
 #include <stdio.h>
 #endif
 
 namespace stripgpu {
+  using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
   __global__ static void unpackChannels(const ChannelLocsView *chanlocs,
-                                        const SiStripClusterizerConditionsGPU::Data *conditions,
+                                        const ConditionsDeviceView *conditions,
                                         uint8_t *alldata,
                                         uint16_t *channel,
                                         stripgpu::stripId_t *stripId) {
@@ -70,8 +71,7 @@ namespace stripgpu {
 
   __device__ constexpr int maxseeds() { return kMaxSeedStrips; }
 
-  __global__ static void setSeedStripsGPU(StripDataView *sst_data_d,
-                                          const SiStripClusterizerConditionsGPU::Data *conditions) {
+  __global__ static void setSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
     const int nStrips = sst_data_d->nStrips;
     const auto __restrict__ chanlocs = sst_data_d->chanlocs;
     const uint8_t *__restrict__ adc = sst_data_d->adc;
@@ -96,14 +96,14 @@ namespace stripgpu {
         const fedCh_t channel = chanlocs->fedCh(chan);
         const float noise_i = conditions->noise(fed, channel, strip);
         const uint8_t adc_i = adc[i];
+
         seedStripsMask[i] = (adc_i >= static_cast<uint8_t>(noise_i * seedThreshold)) ? 1 : 0;
         seedStripsNCMask[i] = seedStripsMask[i];
       }
     }
   }
 
-  __global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d,
-                                            const SiStripClusterizerConditionsGPU::Data *conditions) {
+  __global__ static void setNCSeedStripsGPU(StripDataView *sst_data_d, const ConditionsDeviceView *conditions) {
     const int nStrips = sst_data_d->nStrips;
     const auto __restrict__ chanlocs = sst_data_d->chanlocs;
     const uint16_t *__restrict__ channels = sst_data_d->channel;
@@ -145,7 +145,7 @@ namespace stripgpu {
   }
 
   __global__ static void findLeftRightBoundaryGPU(StripDataView *sst_data_d,
-                                                  const SiStripClusterizerConditionsGPU::Data *conditions,
+                                                  const ConditionsDeviceView *conditions,
                                                   SiStripClustersCUDADevice::DeviceView *clust_data_d) {
     const int nStrips = sst_data_d->nStrips;
     const int *__restrict__ seedStripsNCIndex = sst_data_d->seedStripsNCIndex;
@@ -198,7 +198,7 @@ namespace stripgpu {
         auto testDet = chanlocs->detID(testchan);
         auto sameDetLeft = det == testDet;
 
-        while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit) {
+        while (sameDetLeft && rangeLeft >= 0 && rangeLeft <= maxSequentialHoles && size < clusterSizeLimit + 1) {
           testchan = channels[testIndex];
           const auto testFed = chanlocs->fedID(testchan);
           const auto testChannel = chanlocs->fedCh(testchan);
@@ -241,7 +241,7 @@ namespace stripgpu {
         auto testDet = chanlocs->detID(testchan);
         auto sameDetRight = det == testDet;
 
-        while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit) {
+        while (sameDetRight && rangeRight >= 0 && rangeRight <= maxSequentialHoles && size < clusterSizeLimit + 1) {
           testchan = channels[testIndex];
           const auto testFed = chanlocs->fedID(testchan);
           const auto testChannel = chanlocs->fedCh(testchan);
@@ -282,21 +282,18 @@ namespace stripgpu {
   }
 
   __global__ static void checkClusterConditionGPU(StripDataView *sst_data_d,
-                                                  const SiStripClusterizerConditionsGPU::Data *conditions,
+                                                  const ConditionsDeviceView *conditions,
                                                   SiStripClustersCUDADevice::DeviceView *clust_data_d) {
     const uint16_t *__restrict__ stripId = sst_data_d->stripId;
     const auto __restrict__ chanlocs = sst_data_d->chanlocs;
     const uint16_t *__restrict__ channels = sst_data_d->channel;
     const uint8_t *__restrict__ adc = sst_data_d->adc;
     const float minGoodCharge = sst_data_d->minGoodCharge;  //1620.0;
-    const int nSeedStripsNC = clust_data_d->nClusters_;
+    const auto nSeedStripsNC = clust_data_d->nClusters_;
+    const auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
 
-    auto __restrict__ clusterIndexLeft = clust_data_d->clusterIndex_;
     auto __restrict__ clusterSize = clust_data_d->clusterSize_;
     auto __restrict__ clusterADCs = clust_data_d->clusterADCs_;
-#ifdef CENTERBARY
-    auto __restrict__ firstStrip = clust_data_d->firstStrip_;
-#endif
     auto __restrict__ trueCluster = clust_data_d->trueCluster_;
     auto __restrict__ barycenter = clust_data_d->barycenter_;
     auto __restrict__ charge = clust_data_d->charge_;
@@ -341,86 +338,29 @@ namespace stripgpu {
               if (adc_j < 254) {
                 adc_j = (charge > 1022 ? 255 : (charge > 253 ? 254 : charge));
               }
+              clusterADCs[j * nSeedStripsNC + i] = adc_j;
+
               adcSum += static_cast<float>(adc_j);
               sumx += j * adc_j;
               suma += adc_j;
               j++;
             }
-          }
+          }  // loop over cluster strips
+          charge[i] = adcSum;
           const auto chan = channels[left];
           const fedId_t fed = chanlocs->fedID(chan);
           const fedCh_t channel = chanlocs->fedCh(chan);
-          clusterSize[i] = j;
-          charge[i] = adcSum;
           trueCluster[i] = (adcSum * conditions->invthick(fed, channel)) > minGoodCharge;
+          // barycenter offset below is the ADC-weighted mean of the in-cluster strip counter (sumx / suma)
           const auto bary_i = static_cast<float>(sumx) / static_cast<float>(suma);
           barycenter[i] = static_cast<float>(stripId[left] & stripIndexMask) + bary_i + 0.5f;
-
-#ifdef CENTERBARY
-          int low = left;
-          int high = left + size;
-          if (size > kClusterMaxStrips) {
-            low = std::max(static_cast<int>(left + bary_i - kClusterMaxStrips / 2 - 0.5f), left);
-            while (stripId[low] == stripgpu::invalidStrip) {
-              low++;
-            }
-            high = low + kClusterMaxStrips;
-            if (high > left + size) {
-              high = left + size;
-              low = high - kClusterMaxStrips;
-            }
-#ifdef GPU_CHECK
-            if (not(low >= left && high <= (left + size))) {
-              printf("left %d low %d center %f %f high %d right %d\n",
-                     left,
-                     low,
-                     barycenter[i],
-                     left + bary_i,
-                     high,
-                     left + size);
-              assert(low >= left && high <= (left + size));
-            }
-#endif
-            clusterIndexLeft[i] = low;
-            firstStrip[i] = stripId[low];
-          }
-#else
-          int low = left;
-          int high = left + size;
-#endif
-          j = 0;
-          for (int index = low; index < high; index++) {
-            const auto chan = channels[index];
-            const auto fed = chanlocs->fedID(chan);
-            const auto channel = chanlocs->fedCh(chan);
-            const auto strip = stripId[index];
-#ifdef GPU_CHECK
-            if (fed == stripgpu::invalidFed) {
-              printf("Invalid fed index %d\n", index);
-            }
-#endif
-            if (strip != stripgpu::invalidStrip) {
-              const float gain_j = conditions->gain(fed, channel, strip);
-
-              uint8_t adc_j = adc[index];
-              const int charge = static_cast<int>(static_cast<float>(adc_j) / gain_j + 0.5f);
-
-              if (adc_j < 254)
-                adc_j = (charge > 1022 ? 255 : (charge > 253 ? 254 : charge));
-              if (j < kClusterMaxStrips) {
-                clusterADCs[j * nSeedStripsNC + i] = adc_j;
-              }
-              j++;
-            }
-          }
-          clusterSize[i] = size;
-        }  // not a duplicate
-      }    // trueCluster[i]
+          clusterSize[i] = j;
+        }  // not a duplicate cluster
+      }    // trueCluster[i] is true
     }      // i < nSeedStripsNC
   }
 
-  void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const SiStripClusterizerConditionsGPU::Data *conditions,
-                                                       cudaStream_t stream) {
+  void SiStripRawToClusterGPUKernel::unpackChannelsGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
     constexpr int nthreads = 128;
     const auto channels = chanlocsGPU_->size();
     const auto nblocks = (channels + nthreads - 1) / nthreads;
@@ -453,14 +393,13 @@ namespace stripgpu {
     sst_data_d_->maxSequentialBad = maxSequentialBad_;
     sst_data_d_->maxAdjacentBad = maxAdjacentBad_;
     sst_data_d_->minGoodCharge = minGoodCharge_;
-    sst_data_d_->clusterSizeLimit = keepLargeClusters_ ? 256 : kClusterMaxStrips;
+    sst_data_d_->clusterSizeLimit = maxClusterSize_;
 
     pt_sst_data_d_ = cms::cuda::make_device_unique<stripgpu::StripDataView>(stream);
     cms::cuda::copyAsync(pt_sst_data_d_, sst_data_d_, stream);
   }
 
-  void SiStripRawToClusterGPUKernel::findClusterGPU(const SiStripClusterizerConditionsGPU::Data *conditions,
-                                                    cudaStream_t stream) {
+  void SiStripRawToClusterGPUKernel::findClusterGPU(const ConditionsDeviceView *conditions, cudaStream_t stream) {
     const int nthreads = 128;
     const int nStrips = sst_data_d_->nStrips;
     const int nSeeds = std::min(kMaxSeedStrips, nStrips);
@@ -470,7 +409,6 @@ namespace stripgpu {
     auto cpu_index = cms::cuda::make_host_unique<int[]>(nStrips, stream);
     auto cpu_strip = cms::cuda::make_host_unique<uint16_t[]>(nStrips, stream);
     auto cpu_adc = cms::cuda::make_host_unique<uint8_t[]>(nStrips, stream);
-    auto cpu_noise = cms::cuda::make_host_unique<float[]>(nStrips, stream);
 
     cudaCheck(cudaMemcpyAsync(
         cpu_strip.get(), sst_data_d_->stripId, nStrips * sizeof(uint16_t), cudaMemcpyDeviceToHost, stream));
@@ -494,8 +432,11 @@ namespace stripgpu {
     cudaCheck(cudaGetLastError());
 #endif
 
-    cudaCheck(cudaMemcpyAsync(
-        clusters_d_.nClustersHostPtr(), &(clust_data_d->nClusters_), sizeof(int), cudaMemcpyDeviceToHost, stream));
+    cudaCheck(cudaMemcpyAsync(clusters_d_.nClustersHostPtr(),
+                              &(clust_data_d->nClusters_),
+                              sizeof(clust_data_d->nClusters_),
+                              cudaMemcpyDeviceToHost,
+                              stream));
 
     checkClusterConditionGPU<<<nblocks, nthreads, 0, stream>>>(pt_sst_data_d_.get(), conditions, clust_data_d);
     cudaCheck(cudaGetLastError());
@@ -507,26 +448,27 @@ namespace stripgpu {
 
 #ifdef GPU_DEBUG
     cudaStreamSynchronize(stream);
-    auto clust_data =
-        std::make_unique<SiStripClustersCUDAHost>(clusters_d_, SiStripClustersCUDAHost::kClusterMaxStrips, stream);
+    auto clust_data = std::make_unique<SiStripClustersCUDAHost>(clusters_d_, stream);
     cudaStreamSynchronize(stream);
 
-    auto clusterIndexLeft = clust_data->clusterIndex().get();
-    auto clusterSize = clust_data->clusterSize().get();
-    auto trueCluster = clust_data->trueCluster().get();
-    auto clusterADCs = clust_data->clusterADCs().get();
-    auto detids = clust_data->clusterDetId().get();
+    const auto clusterIndexLeft = clust_data->clusterIndex().get();
+    const auto clusterSize = clust_data->clusterSize().get();
+    const auto trueCluster = clust_data->trueCluster().get();
+    const auto clusterADCs = clust_data->clusterADCs().get();
+    const auto detids = clust_data->clusterDetId().get();
+    const auto charge = clust_data->charge().get();
 
-    const int nSeedStripsNC = clusters_d_.nClusters_h;
+    const auto nSeedStripsNC = clusters_d_.nClustersHost();
     std::cout << "findClusterGPU nSeedStripsNC=" << nSeedStripsNC << std::endl;
 
-    for (int i = 0; i < nSeedStripsNC; i++) {
+    for (auto i = 0U; i < nSeedStripsNC; i++) {
       if (trueCluster[i]) {
         int left = clusterIndexLeft[i];
         uint32_t size = clusterSize[i];
         const auto detid = detids[i];
-        std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " : ";
-        size = std::min(size, kClusterMaxStrips);
+        std::cout << "i=" << i << " detId " << detid << " left " << left << " size " << size << " charge " << charge[i]
+                  << ": ";
+        size = std::min(size, maxClusterSize_);
         for (uint32_t j = 0; j < size; j++) {
           std::cout << (unsigned int)clusterADCs[j * nSeedStripsNC + i] << " ";
         }
@@ -536,7 +478,7 @@ namespace stripgpu {
 #endif
   }
 
-  void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const SiStripClusterizerConditionsGPU::Data *conditions,
+  void SiStripRawToClusterGPUKernel::setSeedStripsNCIndexGPU(const ConditionsDeviceView *conditions,
                                                              cudaStream_t stream) {
 #ifdef GPU_DEBUG
     int nStrips = sst_data_d_->nStrips;
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
index e24336aac338a..8c0a4d1e453a0 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/SiStripRawToClusterGPUKernel.h
@@ -48,13 +48,15 @@ namespace stripgpu {
     SiStripClustersCUDADevice getResults(cudaStream_t stream);
 
   private:
+    using ConditionsDeviceView = SiStripClusterizerConditionsGPU::Data::DeviceView;
+
     void reset();
-    void unpackChannelsGPU(const SiStripClusterizerConditionsGPU::Data* conditions, cudaStream_t stream);
+    void unpackChannelsGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
     void allocateSSTDataGPU(int max_strips, cudaStream_t stream);
     void freeSSTDataGPU(cudaStream_t stream);
 
-    void setSeedStripsNCIndexGPU(const SiStripClusterizerConditionsGPU::Data* conditions, cudaStream_t stream);
-    void findClusterGPU(const SiStripClusterizerConditionsGPU::Data* conditions, cudaStream_t stream);
+    void setSeedStripsNCIndexGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
+    void findClusterGPU(const ConditionsDeviceView* conditions, cudaStream_t stream);
 
     std::vector<stripgpu::fedId_t> fedIndex_;
     std::vector<size_t> fedRawDataOffsets_;
@@ -71,8 +73,8 @@ namespace stripgpu {
     SiStripClustersCUDADevice clusters_d_;
     float channelThreshold_, seedThreshold_, clusterThresholdSquared_;
     uint8_t maxSequentialHoles_, maxSequentialBad_, maxAdjacentBad_;
+    uint32_t maxClusterSize_;
     float minGoodCharge_;
-    bool keepLargeClusters_;
   };
 }  // namespace stripgpu
 #endif
diff --git a/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.cuh b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.cuh
index a16f1f446f29c..f814f0176f635 100644
--- a/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.cuh
+++ b/RecoLocalTracker/SiStripClusterizer/plugins/StripDataView.cuh
@@ -9,7 +9,6 @@ struct ChannelLocsView;
 
 namespace stripgpu {
   static constexpr auto kMaxSeedStrips = 200000;
-  static constexpr uint32_t kClusterMaxStrips = SiStripClustersCUDADevice::kClusterMaxStrips;
 
   struct StripDataView {
     const ChannelLocsView *chanlocs;
diff --git a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
index 2f0a141beb688..499ca94285b3d 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/DefaultClusterizer_cff.py
@@ -11,6 +11,7 @@
     MaxSequentialHoles = cms.uint32(0),
     MaxSequentialBad = cms.uint32(1),
     MaxAdjacentBad = cms.uint32(0),
+    MaxClusterSize = cms.uint32(1024),
     RemoveApvShots     = cms.bool(True),
     clusterChargeCut = cms.PSet(refToPSet_ = cms.string('SiStripClusterChargeCutNone')),
     ConditionsLabel = cms.string("")
diff --git a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
index 050705876aa66..30dcd6826c9ff 100644
--- a/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
+++ b/RecoLocalTracker/SiStripClusterizer/python/SiStripClusterizerOnDemand_cfi.py
@@ -19,6 +19,8 @@
                                             HybridZeroSuppressed = cms.bool(False),
                                             ProductLabel = cms.InputTag('rawDataCollector'))
 
+siStripClusterizerFromRaw.Clusterizer.MaxClusterSize = cms.uint32(16)
+
 siStripClusters = SwitchProducerCUDA(
     cpu = siStripClusterizerFromRaw,
 )
diff --git a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
index b8e38e97d4379..66728e70d0be9 100644
--- a/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
+++ b/RecoLocalTracker/SiStripClusterizer/src/StripClusterizerAlgorithmFactory.cc
@@ -19,6 +19,7 @@ std::unique_ptr<StripClusterizerAlgorithm> StripClusterizerAlgorithmFactory::cre
                                     conf.getParameter<unsigned>("MaxSequentialHoles"),
                                     conf.getParameter<unsigned>("MaxSequentialBad"),
                                     conf.getParameter<unsigned>("MaxAdjacentBad"),
+                                    conf.getParameter<unsigned>("MaxClusterSize"),
                                     conf.getParameter<bool>("RemoveApvShots"),
                                     clusterChargeCut(conf)));
   }
diff --git a/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc b/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
index 7bc7fc495830e..4ee9eefc18fa0 100644
--- a/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
+++ b/RecoLocalTracker/SiStripClusterizer/src/ThreeThresholdAlgorithm.cc
@@ -15,6 +15,7 @@ ThreeThresholdAlgorithm::ThreeThresholdAlgorithm(
     unsigned holes,
     unsigned bad,
     unsigned adj,
+    unsigned maxClusterSize,
     bool removeApvShots,
     float minGoodCharge)
     : StripClusterizerAlgorithm(conditionsToken),
@@ -24,6 +25,7 @@ ThreeThresholdAlgorithm::ThreeThresholdAlgorithm(
       MaxSequentialHoles(holes),
       MaxSequentialBad(bad),
       MaxAdjacentBad(adj),
+      MaxClusterSize(maxClusterSize),
       RemoveApvShots(removeApvShots),
       minGoodCharge(minGoodCharge) {}
 
@@ -98,7 +100,7 @@ inline void ThreeThresholdAlgorithm::endCandidate(State& state, T& out) const {
 }
 
 inline bool ThreeThresholdAlgorithm::candidateAccepted(State const& state) const {
-  return (!state.candidateLacksSeed &&
+  return (!state.candidateLacksSeed && state.ADCs.size() <= MaxClusterSize &&  // veto oversized candidates
           state.noiseSquared * ClusterThresholdSquared <=
               std::pow(float(std::accumulate(state.ADCs.begin(), state.ADCs.end(), int(0))), 2.f));
 }