Add comment

derekelewis · Aug 8, 2024 · afe1989 · afe1989
1 parent 14ba838
commit afe1989
Showing 1 changed file with 4 additions and 0 deletions.
diff --git a/cuda/torchMatrixMultiply.cu b/cuda/torchMatrixMultiply.cu
@@ -23,6 +23,10 @@ torch::Tensor cuda_matrixMultiply(const torch::Tensor &a, const torch::Tensor &b
     float *b_ptr = b_contiguous.data_ptr<float>();
     float *result_ptr = result.data_ptr<float>();
 
+    // Assumes square matrices, and narrows the dimension to int for
+    // simplicity and compatibility with our existing kernel code. In
+    // practice, we would need to handle non-square matrices and keep the
+    // dimension as int64_t to match PyTorch's tensor sizes.
     int dim{static_cast<int>(a.sizes()[0])};
 
     dim3 blockSize(16, 16);