diff --git a/cub/device/device_spmv.cuh b/cub/device/device_spmv.cuh new file mode 100644 index 0000000000..b5597c6df1 --- /dev/null +++ b/cub/device/device_spmv.cuh @@ -0,0 +1,127 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/device_histogram_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + * \ingroup DeviceModule + * + * \par Overview + * TODO + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief TODO + * + * \par Snippet + * TODO + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input matrix A, input vector x, + * // and output vector y + * TODO + * + * \endcode + * + * \tparam VertexT [inferred] Integer type for vertex identifiers + * \tparam ValueT [inferred] Matrix and vector value type + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename VertexT, + typename ValueT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_matrix_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + OffsetT* d_matrix_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_matrix_column_indices and \p d_matrix_values (with the final entry being equal to \p num_nonzeros) + VertexT* d_matrix_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + if (!d_temp_storage) + temp_storage_bytes = 0; + + return cudaSuccess; + } + + + + //@} end member group +}; + + +/** + * \example TODO + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/experimental/histogram/histogram_cub.h b/experimental/histogram/histogram_cub.h index e7131226af..872b026508 100644 --- a/experimental/histogram/histogram_cub.h +++ b/experimental/histogram/histogram_cub.h @@ -25,7 +25,7 @@ * ******************************************************************************/ -#include +#include using namespace cub; diff --git a/experimental/matrix.h b/experimental/matrix.h index a5d68ee2bb..27fef47509 100644 --- a/experimental/matrix.h +++ b/experimental/matrix.h @@ -443,13 +443,13 @@ struct CooMatrix /** * CSR sparse format matrix */ -template +template struct CsrMatrix { int num_rows; int num_cols; int num_nonzeros; - SizeT* row_offsets; + OffsetT* row_offsets; VertexT* column_indices; ValueT* values; @@ -479,12 +479,12 @@ struct CsrMatrix num_cols = coo_matrix.num_cols; num_nonzeros = coo_matrix.num_nonzeros; - row_offsets = new SizeT[num_rows + 1]; + row_offsets = new OffsetT[num_rows + 1]; column_indices = new VertexT[num_nonzeros]; values = new ValueT[num_nonzeros]; VertexT prev_row = -1; - for (SizeT current_edge = 0; current_edge < num_nonzeros; current_edge++) + for (OffsetT current_edge = 0; current_edge < num_nonzeros; current_edge++) { VertexT current_row = coo_matrix.coo_tuples[current_edge].row; @@ -523,9 +523,9 @@ struct CsrMatrix // Scan int max_log_length = -1; - for (SizeT row = 0; row < num_rows; row++) + for (OffsetT row = 0; row < num_rows; row++) { - SizeT length = row_offsets[row + 1] - row_offsets[row]; + OffsetT length = row_offsets[row + 1] - row_offsets[row]; int log_length = -1; while (length > 0) @@ -545,7 +545,6 @@ struct CsrMatrix { printf("\tDegree 1e%d: \t%d (%.2f%%)\n", i, log_counts[i + 1], (float) log_counts[i + 1] * 100.0 / num_cols); } - printf("\n"); fflush(stdout); } @@ -556,10 +555,10 @@ struct CsrMatrix void Display() { cout << "Input Matrix:\n"; - for (SizeT row = 0; row < num_rows; row++) + for (OffsetT row = 0; row < num_rows; row++) { cout << row << ": "; - for (SizeT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++) + for (OffsetT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++) { cout << column_indices[current_edge] << " (" << values[current_edge] << "), "; } diff --git a/experimental/spmv_compare.cu b/experimental/spmv_compare.cu index d47da940cd..c9fdb18145 100644 --- a/experimental/spmv_compare.cu +++ b/experimental/spmv_compare.cu @@ -40,6 +40,7 @@ #include "matrix.h" +#include #include #include @@ -65,17 +66,17 @@ CachingDeviceAllocator g_allocator(true); // Caching allocator for device memo template < typename VertexT, typename ValueT, - typename SizeT> + typename OffsetT> void SpmvGold( - CsrMatrix& matrix_a, + CsrMatrix& matrix_a, ValueT* vector_x, ValueT* vector_y) { - for (SizeT row = 0; row < matrix_a.num_rows; ++row) + for (OffsetT row = 0; row < matrix_a.num_rows; ++row) { vector_y[row] = 0; for ( - SizeT column = matrix_a.row_offsets[row]; + OffsetT column = matrix_a.row_offsets[row]; column < matrix_a.row_offsets[row + 1]; ++column) { @@ -86,7 +87,7 @@ void SpmvGold( //--------------------------------------------------------------------- -// Test GPU SpMV execution +// GPU SpMV execution //--------------------------------------------------------------------- /** @@ -94,16 +95,16 @@ void SpmvGold( */ template < typename VertexT, - typename SizeT> + typename OffsetT> float CusparseSpmv( - int num_rows, - int num_cols, - int num_nonzeros, float* d_matrix_values, - SizeT* d_matrix_row_offsets, + OffsetT* d_matrix_row_offsets, VertexT* d_matrix_column_indices, float* d_vector_x, float* d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, int timing_iterations, cusparseHandle_t cusparse) { @@ -112,6 +113,14 @@ float CusparseSpmv( float alpha = 1.0; float beta = 0.0; + // Warmup + AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv( + cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, + num_rows, num_cols, num_nonzeros, &alpha, desc, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, &beta, d_vector_y)); + + // Timing float elapsed_millis = 0.0; GpuTimer gpu_timer; @@ -119,20 +128,11 @@ float CusparseSpmv( { gpu_timer.Start(); - cusparseStatus_t status = cusparseScsrmv( - cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - num_rows, - num_cols, - num_nonzeros, - &alpha, - desc, - d_matrix_values, - d_matrix_row_offsets, - d_matrix_column_indices, - d_vector_x, - &beta, - d_vector_y); + cusparseScsrmv( + cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, + num_rows, num_cols, num_nonzeros, &alpha, desc, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, &beta, d_vector_y); gpu_timer.Stop(); elapsed_millis += gpu_timer.ElapsedMillis(); @@ -148,16 +148,16 @@ float CusparseSpmv( */ template < typename VertexT, - typename SizeT> + typename OffsetT> float CusparseSpmv( - int num_rows, - int num_cols, - int num_nonzeros, double* d_matrix_values, - SizeT* d_matrix_row_offsets, + OffsetT* d_matrix_row_offsets, VertexT* d_matrix_column_indices, double* d_vector_x, double* d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, int timing_iterations, cusparseHandle_t cusparse) { @@ -166,6 +166,14 @@ float CusparseSpmv( double alpha = 1.0; double beta = 0.0; + // Warmup + AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv( + cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, + num_rows, num_cols, num_nonzeros, &alpha, desc, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, &beta, d_vector_y)); + + // Timing float elapsed_millis = 0.0; GpuTimer gpu_timer; @@ -173,20 +181,11 @@ float CusparseSpmv( { gpu_timer.Start(); - cusparseStatus_t status = cusparseDcsrmv( - cusparse, - CUSPARSE_OPERATION_NON_TRANSPOSE, - num_rows, - num_cols, - num_nonzeros, - &alpha, - desc, - d_matrix_values, - d_matrix_row_offsets, - d_matrix_column_indices, - d_vector_x, - &beta, - d_vector_y); + cusparseDcsrmv( + cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, + num_rows, num_cols, num_nonzeros, &alpha, desc, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, &beta, d_vector_y); gpu_timer.Stop(); elapsed_millis += gpu_timer.ElapsedMillis(); @@ -197,13 +196,80 @@ float CusparseSpmv( } +/** + * Run CUB SpMV + */ +template < + typename VertexT, + typename ValueT, + typename OffsetT> +float CubSpmv( + ValueT* d_matrix_values, + OffsetT* d_matrix_row_offsets, + VertexT* d_matrix_column_indices, + ValueT* d_vector_x, + ValueT* d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, + int timing_iterations) +{ + // Allocate temporary storage + size_t temp_storage_bytes = 0; + void *d_temp_storage = NULL; + + // Get amount of temporary storage needed + CubDebugExit(DeviceSpmv::CsrMV( + d_temp_storage, temp_storage_bytes, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, d_vector_y, + num_rows, num_cols, num_nonzeros, + (cudaStream_t) 0, false)); + + // Allocate + CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + + // Warmup + CubDebugExit(DeviceSpmv::CsrMV( + d_temp_storage, temp_storage_bytes, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, d_vector_y, + num_rows, num_cols, num_nonzeros, + (cudaStream_t) 0, true)); + + // Timing + GpuTimer gpu_timer; + float elapsed_millis = 0.0; + + for(int it = 0; it < timing_iterations; ++it) + { + gpu_timer.Start(); + + CubDebugExit(DeviceSpmv::CsrMV( + d_temp_storage, temp_storage_bytes, + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, + d_vector_x, d_vector_y, + num_rows, num_cols, num_nonzeros, + (cudaStream_t) 0, false)); + + gpu_timer.Stop(); + elapsed_millis += gpu_timer.ElapsedMillis(); + } + + return elapsed_millis / timing_iterations; +} + +//--------------------------------------------------------------------- +// Test generation +//--------------------------------------------------------------------- + /** * Run tests */ template < typename VertexT, typename ValueT, - typename SizeT> + typename OffsetT> void RunTests( std::string &mtx_filename, int grid2d, @@ -244,7 +310,7 @@ void RunTests( exit(1); } - CsrMatrix csr_matrix; + CsrMatrix csr_matrix; csr_matrix.FromCoo(coo_matrix); // Display matrix info @@ -262,40 +328,40 @@ void RunTests( // Allocate and initialize GPU problem ValueT* d_matrix_values; - SizeT* d_matrix_row_offsets; + OffsetT* d_matrix_row_offsets; VertexT* d_matrix_column_indices; ValueT* d_vector_x; ValueT* d_vector_y; g_allocator.DeviceAllocate((void **) &d_matrix_values, sizeof(ValueT) * csr_matrix.num_nonzeros); - g_allocator.DeviceAllocate((void **) &d_matrix_row_offsets, sizeof(SizeT) * (csr_matrix.num_rows + 1)); + g_allocator.DeviceAllocate((void **) &d_matrix_row_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)); g_allocator.DeviceAllocate((void **) &d_matrix_column_indices, sizeof(VertexT) * csr_matrix.num_nonzeros); g_allocator.DeviceAllocate((void **) &d_vector_x, sizeof(ValueT) * csr_matrix.num_cols); g_allocator.DeviceAllocate((void **) &d_vector_y, sizeof(ValueT) * csr_matrix.num_rows); CubDebugExit(cudaMemcpy(d_matrix_values, csr_matrix.values, sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice)); - CubDebugExit(cudaMemcpy(d_matrix_row_offsets, csr_matrix.row_offsets, sizeof(SizeT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_matrix_row_offsets, csr_matrix.row_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_matrix_column_indices, csr_matrix.column_indices, sizeof(VertexT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_vector_x, vector_x, sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice)); double avg_millis, nz_throughput, effective_bandwidth; int compare = 0; size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(VertexT))) + - (csr_matrix.num_rows) * (sizeof(SizeT) + sizeof(ValueT)); + (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT)); // Run problem on cuSparse CubDebugExit(cudaMemset(d_vector_y, 0, sizeof(ValueT) * csr_matrix.num_rows)); - avg_millis = CusparseSpmv(csr_matrix.num_rows, csr_matrix.num_cols, - csr_matrix.num_nonzeros, d_matrix_values, d_matrix_row_offsets, - d_matrix_column_indices, d_vector_x, d_vector_y, timing_iterations, - cusparse); + avg_millis = CusparseSpmv( + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, d_vector_x, d_vector_y, + csr_matrix.num_rows, csr_matrix.num_cols, csr_matrix.num_nonzeros, + timing_iterations, cusparse); nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6; effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6; - printf("%s fp%d: %.3f avg ms, %.3f gflops, %.3lf effective GB/s (%.1f%% peak)\n", + printf("\n\n%s fp%d: %.3f avg ms, %.3f gflops, %.3lf effective GB/s (%.1f%% peak)\n", "cuSparse", sizeof(ValueT) * 8, avg_millis, @@ -307,9 +373,38 @@ void RunTests( printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout); AssertEquals(0, compare); + // Run problem on CUB + + CubDebugExit(cudaMemset(d_vector_y, 0, sizeof(ValueT) * csr_matrix.num_rows)); + + avg_millis = CubSpmv( + d_matrix_values, d_matrix_row_offsets, d_matrix_column_indices, d_vector_x, d_vector_y, + csr_matrix.num_rows, csr_matrix.num_cols, csr_matrix.num_nonzeros, + timing_iterations); + + nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6; + effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6; + + printf("\n\n%s fp%d: %.3f avg ms, %.3f gflops, %.3lf effective GB/s (%.1f%% peak)\n", + "CUB", + sizeof(ValueT) * 8, + avg_millis, + 2 * nz_throughput, + effective_bandwidth, + effective_bandwidth / bandwidth_GBs * 100); + + compare = CompareDeviceResults(vector_y, d_vector_y, csr_matrix.num_rows, true, g_verbose); + printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout); + AssertEquals(0, compare); + // Cleanup - delete[] vector_x; - delete[] vector_y; + if (vector_x) delete[] vector_x; + if (vector_y) delete[] vector_y; + if (d_matrix_values) g_allocator.DeviceFree(d_matrix_values); + if (d_matrix_row_offsets) g_allocator.DeviceFree(d_matrix_row_offsets); + if (d_matrix_column_indices) g_allocator.DeviceFree(d_matrix_column_indices); + if (d_vector_x) g_allocator.DeviceFree(d_vector_x); + if (d_vector_y) g_allocator.DeviceFree(d_vector_y); }