Reorganize SiStripClusterizerConditionsGPU to avoid alignment issues

Make maximum strips per cluster cut configurable at runtime for both the GPU and CPU producers Remove GPU option to keep large clusters truncated around the barycenter, as it complicates the code for little apparent benefit. Fix bugs in the application of the cluster threshold and the early cutoff of clusters larger than the limit Squash out all add/delete commits of the same file
cms-sw · Mar 22, 2023 · 3618186 · 3618186
1 parent 9037323
commit 3618186
Show file tree

Hide file tree

Showing 22 changed files with 296 additions and 317 deletions.
diff --git a/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h b/CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h
@@ -16,7 +16,7 @@ namespace cms {
 class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::device::unique_ptr> {
 public:
   SiStripClustersCUDADevice() = default;
-  explicit SiStripClustersCUDADevice(size_t maxClusters, int clustersPerStrip, cudaStream_t stream);
+  explicit SiStripClustersCUDADevice(uint32_t maxClusters, uint32_t maxStripsPerCluster, cudaStream_t stream);
   ~SiStripClustersCUDADevice() override = default;
 
   SiStripClustersCUDADevice(const SiStripClustersCUDADevice &) = delete;
@@ -34,23 +34,25 @@ class SiStripClustersCUDADevice : public SiStripClustersSOABase<cms::cuda::devic
     float *barycenter_;
     float *charge_;
     uint32_t nClusters_;
+    uint32_t maxClusterSize_;
   };
 
   DeviceView *view() const { return view_d.get(); }
-  int nClustersHost() const { return nClusters_h; }
-  int *nClustersHostPtr() { return &nClusters_h; }
+  uint32_t nClustersHost() const { return nClustersHost_; }
+  uint32_t *nClustersHostPtr() { return &nClustersHost_; }
+  uint32_t maxClusterSizeHost() const { return maxClusterSizeHost_; }
+  uint32_t *maxClusterSizeHostPtr() { return &maxClusterSizeHost_; }
 
 private:
   cms::cuda::device::unique_ptr<DeviceView> view_d;  // "me" pointer
-  int nClusters_h;
+  uint32_t nClustersHost_;
+  uint32_t maxClusterSizeHost_;
 };
 
 class SiStripClustersCUDAHost : public SiStripClustersSOABase<cms::cuda::host::unique_ptr> {
 public:
   SiStripClustersCUDAHost() = default;
-  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d,
-                                   int clustersPerStrip,
-                                   cudaStream_t stream);
+  explicit SiStripClustersCUDAHost(const SiStripClustersCUDADevice &clusters_d, cudaStream_t stream);
   ~SiStripClustersCUDAHost() override = default;
 
   SiStripClustersCUDAHost(const SiStripClustersCUDAHost &) = delete;

diff --git a/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc b/CUDADataFormats/SiStripCluster/src/SiStripClustersCUDA.cc
@@ -1,10 +1,14 @@
 #include "CUDADataFormats/SiStripCluster/interface/SiStripClustersCUDA.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 
-SiStripClustersCUDADevice::SiStripClustersCUDADevice(size_t maxClusters, int clustersPerStrip, cudaStream_t stream) {
+SiStripClustersCUDADevice::SiStripClustersCUDADevice(uint32_t maxClusters,
+                                                     uint32_t maxStripsPerCluster,
+                                                     cudaStream_t stream) {
+  maxClusterSizeHost_ = maxStripsPerCluster;
+
   clusterIndex_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
   clusterSize_ = cms::cuda::make_device_unique<uint32_t[]>(maxClusters, stream);
-  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * clustersPerStrip, stream);
+  clusterADCs_ = cms::cuda::make_device_unique<uint8_t[]>(maxClusters * maxStripsPerCluster, stream);
   clusterDetId_ = cms::cuda::make_device_unique<stripgpu::detId_t[]>(maxClusters, stream);
   firstStrip_ = cms::cuda::make_device_unique<stripgpu::stripId_t[]>(maxClusters, stream);
   trueCluster_ = cms::cuda::make_device_unique<bool[]>(maxClusters, stream);
@@ -20,18 +24,18 @@ SiStripClustersCUDADevice::SiStripClustersCUDADevice(size_t maxClusters, int clu
   view->trueCluster_ = trueCluster_.get();
   view->barycenter_ = barycenter_.get();
   view->charge_ = charge_.get();
+  view->maxClusterSize_ = maxStripsPerCluster;
 
   view_d = cms::cuda::make_device_unique<DeviceView>(stream);
   cms::cuda::copyAsync(view_d, view, stream);
 }
 
-SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d,
-                                                 int clustersPerStrip,
-                                                 cudaStream_t stream) {
+SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice& clusters_d, cudaStream_t stream) {
   nClusters_ = clusters_d.nClustersHost();
+  maxClusterSize_ = clusters_d.maxClusterSizeHost();
   clusterIndex_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
   clusterSize_ = cms::cuda::make_host_unique<uint32_t[]>(nClusters_, stream);
-  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * clustersPerStrip, stream);
+  clusterADCs_ = cms::cuda::make_host_unique<uint8_t[]>(nClusters_ * maxClusterSize_, stream);
   clusterDetId_ = cms::cuda::make_host_unique<stripgpu::detId_t[]>(nClusters_, stream);
   firstStrip_ = cms::cuda::make_host_unique<stripgpu::stripId_t[]>(nClusters_, stream);
   trueCluster_ = cms::cuda::make_host_unique<bool[]>(nClusters_, stream);
@@ -40,7 +44,7 @@ SiStripClustersCUDAHost::SiStripClustersCUDAHost(const SiStripClustersCUDADevice
 
   cms::cuda::copyAsync(clusterIndex_, clusters_d.clusterIndex(), nClusters_, stream);
   cms::cuda::copyAsync(clusterSize_, clusters_d.clusterSize(), nClusters_, stream);
-  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * clustersPerStrip, stream);
+  cms::cuda::copyAsync(clusterADCs_, clusters_d.clusterADCs(), nClusters_ * maxClusterSize_, stream);
   cms::cuda::copyAsync(clusterDetId_, clusters_d.clusterDetId(), nClusters_, stream);
   cms::cuda::copyAsync(firstStrip_, clusters_d.firstStrip(), nClusters_, stream);
   cms::cuda::copyAsync(trueCluster_, clusters_d.trueCluster(), nClusters_, stream);

diff --git a/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h b/CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h
@@ -1,10 +1,14 @@
 #ifndef CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
 #define CalibFormats_SiStripObjects_SiStripClusterizerConditionsGPU_h
 
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
 #include "DataFormats/SiStripCluster/interface/SiStripTypes.h"
 
+#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h"
+
 class SiStripQuality;
 class SiStripGain;
 class SiStripNoises;
@@ -19,108 +23,117 @@ namespace stripgpu {
   static constexpr int kStripsPerFed = kChannelCount * kStripsPerChannel;
 
   __host__ __device__ inline fedId_t fedIndex(fedId_t fed) { return fed - kFedFirst; }
-  __host__ __device__ inline stripId_t stripIndex(fedCh_t channel, stripId_t strip) {
-    return channel * kStripsPerChannel + (strip % kStripsPerChannel);
+  __host__ __device__ inline std::uint32_t stripIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * kStripsPerFed + channel * kStripsPerChannel + (strip % kStripsPerChannel);
   }
-  __host__ __device__ inline stripId_t apvIndex(fedCh_t channel, stripId_t strip) {
-    return channel * kStripsPerChannel + (strip % kStripsPerChannel) / 128;
+  __host__ __device__ inline std::uint32_t apvIndex(fedId_t fed, fedCh_t channel, stripId_t strip) {
+    return fedIndex(fed) * kApvCount + 2 * channel + (strip % kStripsPerChannel) / 128;
+  }
+  __host__ __device__ inline std::uint32_t channelIndex(fedId_t fed, fedCh_t channel) {
+    return fedIndex(fed) * kChannelCount + channel;
   }
-}  // namespace stripgpu
 
-class SiStripClusterizerConditionsGPU {
-public:
-  class DetToFed {
+  class SiStripClusterizerConditionsGPU {
   public:
-    DetToFed(stripgpu::detId_t detid, stripgpu::APVPair_t ipair, stripgpu::fedId_t fedid, stripgpu::fedCh_t fedch)
-        : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
-    stripgpu::detId_t detID() const { return detid_; }
-    stripgpu::APVPair_t pair() const { return ipair_; }
-    stripgpu::fedId_t fedID() const { return fedid_; }
-    stripgpu::fedCh_t fedCh() const { return fedch_; }
-
-  private:
-    stripgpu::detId_t detid_;
-    stripgpu::APVPair_t ipair_;
-    stripgpu::fedId_t fedid_;
-    stripgpu::fedCh_t fedch_;
-  };
-  using DetToFeds = std::vector<DetToFed>;
+    class DetToFed {
+    public:
+      DetToFed(detId_t detid, APVPair_t ipair, fedId_t fedid, fedCh_t fedch)
+          : detid_(detid), ipair_(ipair), fedid_(fedid), fedch_(fedch) {}
+      detId_t detID() const { return detid_; }
+      APVPair_t pair() const { return ipair_; }
+      fedId_t fedID() const { return fedid_; }
+      fedCh_t fedCh() const { return fedch_; }
+
+    private:
+      detId_t detid_;
+      APVPair_t ipair_;
+      fedId_t fedid_;
+      fedCh_t fedch_;
+    };
+    using DetToFeds = std::vector<DetToFed>;
 
-  struct Data {
     static constexpr std::uint16_t badBit = 1 << 15;
 
-    __host__ __device__ void setStrip(stripgpu::fedId_t fed,
-                                      stripgpu::fedCh_t channel,
-                                      stripgpu::stripId_t strip,
-                                      std::uint16_t noise,
-                                      float gain,
-                                      bool bad) {
-      gain_[stripgpu::fedIndex(fed)][stripgpu::apvIndex(channel, strip)] = gain;
-      noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] = noise;
+    class Data {
+    public:
+      struct DeviceView {
+        __device__ inline detId_t detID(fedId_t fed, fedCh_t channel) const {
+          return detID_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline APVPair_t iPair(fedId_t fed, fedCh_t channel) const {
+          return iPair_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float invthick(fedId_t fed, fedCh_t channel) const {
+          return invthick_[channelIndex(fed, channel)];
+        }
+
+        __device__ inline float noise(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return 0.1f * (noise_[stripIndex(fed, channel, strip)] & ~badBit);
+        }
+
+        __device__ inline float gain(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return gain_[apvIndex(fed, channel, strip)];
+        }
+
+        __device__ inline bool bad(fedId_t fed, fedCh_t channel, stripId_t strip) const {
+          return badBit == (noise_[stripIndex(fed, channel, strip)] & badBit);
+        }
+        const std::uint16_t* noise_;  //[kFedCount*kStripsPerFed];
+        const float* invthick_;       //[kFedCount*kChannelCount];
+        const detId_t* detID_;        //[kFedCount*kChannelCount];
+        const APVPair_t* iPair_;      //[kFedCount*kChannelCount];
+        const float* gain_;           //[kFedCount*kApvCount];
+      };
+
+      const DeviceView* deviceView() const { return deviceView_.get(); }
+
+      cms::cuda::device::unique_ptr<DeviceView> deviceView_;
+      cms::cuda::host::unique_ptr<DeviceView> hostView_;
+
+      cms::cuda::device::unique_ptr<std::uint16_t[]> noise_;  //[kFedCount*kStripsPerFed];
+      cms::cuda::device::unique_ptr<float[]> invthick_;       //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<detId_t[]> detID_;        //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<APVPair_t[]> iPair_;      //[kFedCount*kChannelCount];
+      cms::cuda::device::unique_ptr<float[]> gain_;           //[kFedCount*kApvCount];
+    };
+
+    SiStripClusterizerConditionsGPU(const SiStripQuality& quality,
+                                    const SiStripGain* gains,
+                                    const SiStripNoises& noises);
+    ~SiStripClusterizerConditionsGPU() = default;
+
+    // Function to return the actual payload on the memory of the current device
+    Data const& getGPUProductAsync(cudaStream_t stream) const;
+
+    const DetToFeds& detToFeds() const { return detToFeds_; }
+
+  private:
+    void setStrip(fedId_t fed, fedCh_t channel, stripId_t strip, std::uint16_t noise, float gain, bool bad) {
+      gain_[apvIndex(fed, channel, strip)] = gain;
+      noise_[stripIndex(fed, channel, strip)] = noise;
       if (bad) {
-        noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] |= badBit;
+        noise_[stripIndex(fed, channel, strip)] |= badBit;
       }
     }
 
-    __host__ __device__ void setInvThickness(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, float invthick) {
-      invthick_[stripgpu::fedIndex(fed)][channel] = invthick;
-    }
-
-    __host__ __device__ stripgpu::detId_t detID(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return detID_[stripgpu::fedIndex(fed)][channel];
-    }
-
-    __host__ __device__ stripgpu::APVPair_t iPair(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return iPair_[stripgpu::fedIndex(fed)][channel];
+    void setInvThickness(fedId_t fed, fedCh_t channel, float invthick) {
+      invthick_[channelIndex(fed, channel)] = invthick;
     }
 
-    __host__ __device__ float invthick(stripgpu::fedId_t fed, stripgpu::fedCh_t channel) const {
-      return invthick_[stripgpu::fedIndex(fed)][channel];
-    }
-
-    __host__ __device__ float noise(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return 0.1 * (noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] & !badBit);
-    }
-
-    __host__ __device__ float gain(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return gain_[stripgpu::fedIndex(fed)][stripgpu::apvIndex(channel, strip)];
-    }
-
-    __host__ __device__ bool bad(stripgpu::fedId_t fed, stripgpu::fedCh_t channel, stripgpu::stripId_t strip) const {
-      return badBit == (noise_[stripgpu::fedIndex(fed)][stripgpu::stripIndex(channel, strip)] & badBit);
-    }
-
-    alignas(128) float gain_[stripgpu::kFedCount][stripgpu::kApvCount];
-    alignas(128) float invthick_[stripgpu::kFedCount][stripgpu::kChannelCount];
-    alignas(128) std::uint16_t noise_[stripgpu::kFedCount][stripgpu::kStripsPerFed];
-    alignas(128) stripgpu::detId_t detID_[stripgpu::kFedCount][stripgpu::kChannelCount];
-    alignas(128) stripgpu::APVPair_t iPair_[stripgpu::kFedCount][stripgpu::kChannelCount];
+    // Holds the data in pinned CPU memory
+    std::vector<std::uint16_t, cms::cuda::HostAllocator<std::uint16_t>> noise_;
+    std::vector<float, cms::cuda::HostAllocator<float>> invthick_;
+    std::vector<detId_t, cms::cuda::HostAllocator<detId_t>> detID_;
+    std::vector<APVPair_t, cms::cuda::HostAllocator<APVPair_t>> iPair_;
+    std::vector<float, cms::cuda::HostAllocator<float>> gain_;
+
+    // Helper that takes care of complexity of transferring the data to
+    // multiple devices
+    cms::cuda::ESProduct<Data> gpuData_;
+    DetToFeds detToFeds_;
   };
-
-  SiStripClusterizerConditionsGPU(const SiStripQuality& quality, const SiStripGain* gains, const SiStripNoises& noises);
-  ~SiStripClusterizerConditionsGPU();
-
-  // Function to return the actual payload on the memory of the current device
-  Data const* getGPUProductAsync(cudaStream_t stream) const;
-
-  const DetToFeds& detToFeds() const { return detToFeds_; }
-
-private:
-  // Holds the data in pinned CPU memory
-  Data* conditions_ = nullptr;
-
-  // Helper struct to hold all information that has to be allocated and
-  // deallocated per device
-  struct GPUData {
-    // Destructor should free all member pointers
-    ~GPUData();
-    Data* conditionsDevice = nullptr;
-  };
-
-  // Helper that takes care of complexity of transferring the data to
-  // multiple devices
-  cms::cuda::ESProduct<GPUData> gpuData_;
-  DetToFeds detToFeds_;
-};
+}  // namespace stripgpu
 
 #endif
diff --git a/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc b/CalibFormats/SiStripObjects/src/EventSetup_Registration.cc
@@ -25,4 +25,4 @@ TYPELOOKUP_DATA_REG(SiStripQuality);
 TYPELOOKUP_DATA_REG(SiStripClusterizerConditions);
 
 #include "CalibFormats/SiStripObjects/interface/SiStripClusterizerConditionsGPU.h"
-TYPELOOKUP_DATA_REG(SiStripClusterizerConditionsGPU);
+TYPELOOKUP_DATA_REG(stripgpu::SiStripClusterizerConditionsGPU);