diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1e85edb8474..6f3bbc75487 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -191,7 +191,6 @@ set(CUGRAPH_SOURCES src/community/legacy/leiden.cu src/community/legacy/ktruss.cu src/community/legacy/ecg.cu - src/community/legacy/triangles_counting.cu src/community/legacy/extract_subgraph_by_vertex.cu src/community/legacy/egonet.cu src/sampling/random_walks.cu diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index ef7c9a4fd61..754b0f30b83 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -743,26 +743,6 @@ std::unique_ptr> minimum_spanning_t legacy::GraphCSRView const& graph, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -namespace triangle { -/** - * @brief Count the number of triangles in the graph - * - * @throws cugraph::logic_error when an error occurs. - * - * @tparam VT Type of vertex identifiers. Supported value : int (signed, - * 32-bit) - * @tparam ET Type of edge identifiers. Supported value : int (signed, - * 32-bit) - * @tparam WT Type of edge weights. Supported values : float or double. - * - * @param[in] graph input graph object (CSR) - * - * @return The number of triangles - */ -template -uint64_t triangle_count(legacy::GraphCSRView const& graph); -} // namespace triangle - namespace subgraph { /** * @brief Extract subgraph by vertices diff --git a/cpp/src/community/legacy/triangles_counting.cu b/cpp/src/community/legacy/triangles_counting.cu deleted file mode 100644 index bc94a3cde9e..00000000000 --- a/cpp/src/community/legacy/triangles_counting.cu +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "cub/cub.cuh" - -#define TH_CENT_K_LOCLEN (34) -#define WP_LEN_TH1 (24) -#define WP_LEN_TH2 (2) - -#if WP_LEN_TH1 > 32 -#error WP_LEN_TH1 must be <= 32! -#endif - -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) - -#define THREADS (128) -#define DIV_UP(a, b) (((a) + ((b)-1)) / (b)) -#define BITSOF(x) (sizeof(*x) * 8) - -#define BLK_BWL0 (128) - -#define DEG_THR1 (3.5) -#define DEG_THR2 (38.0) - -namespace cugraph { -namespace triangle { - -namespace { // anonym. 
- -template -struct type_utils; - -template <> -struct type_utils { - typedef int LOCINT; -}; - -template <> -struct type_utils { - typedef uint64_t LOCINT; -}; - -template -struct spmat_t { - T N; - T nnz; - T nrows; - const T* roff_d; - const T* rows_d; - const T* cols_d; - bool is_lower_triangular; -}; - -template -size_t bitmap_roundup(size_t n) -{ - size_t size = DIV_UP(n, 8 * sizeof(T)); - size = size_t{8} * DIV_UP(size * sizeof(T), 8); - size /= sizeof(T); - return size; -} - -template -static inline void cubSum(InputIteratorT d_in, - OutputIteratorT d_out, - int num_items, - cudaStream_t stream = 0, - bool debug_synchronous = false) -{ - size_t temp_storage_bytes = 0; - - cub::DeviceReduce::Sum( - nullptr, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - RAFT_CHECK_CUDA(stream); - - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - cub::DeviceReduce::Sum( - d_temp_storage.data(), temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - RAFT_CHECK_CUDA(stream); - - return; -} - -template -static inline void cubIf(InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, - int num_items, - SelectOp select_op, - cudaStream_t stream = 0, - bool debug_synchronous = false) -{ - size_t temp_storage_bytes = 0; - - cub::DeviceSelect::If(nullptr, - temp_storage_bytes, - d_in, - d_out, - d_num_selected_out, - num_items, - select_op, - stream, - debug_synchronous); - RAFT_CHECK_CUDA(stream); - - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - cub::DeviceSelect::If(d_temp_storage.data(), - temp_storage_bytes, - d_in, - d_out, - d_num_selected_out, - num_items, - select_op, - stream, - debug_synchronous); - RAFT_CHECK_CUDA(stream); - - return; -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__device__ T __block_bcast(const T v, const int x) -{ - __shared__ T shv; - - __syncthreads(); - if (threadIdx.x == x) shv = v; - __syncthreads(); - - return shv; -} - -template -__device__ __forceinline__ T block_sum(T v) -{ - __shared__ T sh[BDIM_X * BDIM_Y / WSIZE]; - - const int lid = threadIdx.x % 32; - const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? threadIdx.y * (BDIM_X / 32) : 0); - -#pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { - v += __shfl_down_sync(raft::warp_full_mask(), v, i); - } - if (lid == 0) sh[wid] = v; - - __syncthreads(); - if (wid == 0) { - v = (lid < (BDIM_X * BDIM_Y / WSIZE)) ? sh[lid] : 0; - -#pragma unroll - for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { - v += __shfl_down_sync(raft::warp_full_mask(), v, i); - } - } - return v; -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__global__ void tricnt_b2b_k(const ROW_T ner, - const ROW_T* __restrict__ rows, - const OFF_T* __restrict__ roff, - const ROW_T* __restrict__ cols, - CNT_T* __restrict__ ocnt, - MAP_T* __restrict__ bmapL0, - const size_t bmldL0, - MAP_T* __restrict__ bmapL1, - const size_t bmldL1) -{ - CNT_T __cnt = 0; - - bmapL1 += bmldL1 * blockIdx.x; - bmapL0 += bmldL0 * blockIdx.x; - for (ROW_T bid = blockIdx.x; bid < ner; bid += gridDim.x) { - const OFF_T rbeg = roff[rows[bid]]; - const OFF_T rend = roff[rows[bid] + 1]; - - ROW_T firstcol = 0; - ROW_T lastcol = 0; - - for (OFF_T i = rbeg; i < rend; i += BDIM) { - const ROW_T c = (i + threadIdx.x < rend) ? 
cols[i + threadIdx.x] : -1; - - __syncthreads(); - if (c > -1) { - atomicOr(bmapL1 + c / BITSOF(bmapL1), ((MAP_T)1) << (c % BITSOF(bmapL1))); - atomicOr(bmapL0 + c / BWL0 / BITSOF(bmapL0), ((MAP_T)1) << ((c / BWL0) % BITSOF(bmapL0))); - } - __syncthreads(); - -#pragma unroll - for (int j = 0; j < BDIM; j++) { - const ROW_T curc = __block_bcast(c, j); - if (curc == -1) break; - - lastcol = curc; - if ((i == rbeg) && !j) { - firstcol = curc; - continue; - } - const OFF_T soff = roff[curc]; - const OFF_T eoff = roff[curc + 1]; - - for (OFF_T k = eoff - 1; k >= soff; k -= BDIM) { - if (k - (int)threadIdx.x < soff) break; - - const ROW_T cc = __ldg(cols + k - threadIdx.x); - if (cc < firstcol) break; - - MAP_T mm = ((MAP_T)1) << ((cc / BWL0) % BITSOF(bmapL0)); - if (0 == (bmapL0[cc / BWL0 / BITSOF(bmapL0)] & mm)) continue; - - mm = ((MAP_T)1) << (cc % BITSOF(bmapL1)); - if (bmapL1[cc / BITSOF(bmapL1)] & mm) { __cnt++; } - } - } - } - - lastcol /= 64; - firstcol /= 64; - - __syncthreads(); - for (int i = rbeg; i < rend; i += BDIM) { - if (i + threadIdx.x < rend) { - ROW_T c = cols[i + threadIdx.x]; - bmapL1[c / BITSOF(bmapL1)] = 0; - bmapL0[c / BWL0 / BITSOF(bmapL0)] = 0; - } - } - __syncthreads(); - } - - __cnt = block_sum(__cnt); - if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; - - return; -} - -template -void tricnt_b2b(T nblock, - spmat_t* m, - uint64_t* ocnt_d, - unsigned int* bmapL0_d, - size_t bmldL0, - unsigned int* bmapL1_d, - size_t bmldL1, - cudaStream_t stream) -{ - // still best overall (with no psum) - tricnt_b2b_k<<>>( - m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmapL0_d, bmldL0, bmapL1_d, bmldL1); - RAFT_CHECK_CUDA(stream); - return; -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__device__ __forceinline__ T block_sum_sh(T v, T* sh) -{ - const int lid = threadIdx.x % 32; - const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? threadIdx.y * (BDIM_X / 32) : 0); - -#pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { - v += __shfl_down_sync(raft::warp_full_mask(), v, i); - } - if (lid == 0) sh[wid] = v; - - __syncthreads(); - if (wid == 0) { - v = (lid < (BDIM_X * BDIM_Y / WSIZE)) ? sh[lid] : 0; - -#pragma unroll - for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { - v += __shfl_down_sync(raft::warp_full_mask(), v, i); - } - } - return v; -} - -template -__global__ void tricnt_bsh_k(const ROW_T ner, - const ROW_T* __restrict__ rows, - const OFF_T* __restrict__ roff, - const ROW_T* __restrict__ cols, - CNT_T* __restrict__ ocnt, - const size_t bmld) -{ - CNT_T __cnt = 0; - extern __shared__ unsigned int shm[]; - - for (int i = 0; i < bmld; i += BDIM) { - if (i + threadIdx.x < bmld) { shm[i + threadIdx.x] = 0; } - } - - for (ROW_T bid = blockIdx.x; bid < ner; bid += gridDim.x) { - const OFF_T rbeg = roff[rows[bid]]; - const OFF_T rend = roff[rows[bid] + 1]; - - ROW_T firstcol = 0; - ROW_T lastcol = 0; - - for (OFF_T i = rbeg; i < rend; i += BDIM) { - const ROW_T c = (i + threadIdx.x < rend) ? 
cols[i + threadIdx.x] : -1; - - __syncthreads(); - if (c > -1) atomicOr(shm + c / BITSOF(shm), 1u << (c % BITSOF(shm))); - __syncthreads(); - -#pragma unroll - for (int j = 0; j < BDIM; j++) { - const ROW_T curc = __block_bcast(c, j); - if (curc == -1) break; - - lastcol = curc; - if ((i == rbeg) && !j) { - firstcol = curc; - continue; - } - - const OFF_T soff = roff[curc]; - const OFF_T eoff = roff[curc + 1]; - for (OFF_T k = eoff - 1; k >= soff; k -= BDIM) { - if (k - (int)threadIdx.x < soff) break; - - const ROW_T cc = __ldg(cols + k - threadIdx.x); - if (cc < firstcol) break; - - const unsigned int mm = 1u << (cc % BITSOF(shm)); - if (shm[cc / BITSOF(shm)] & mm) { __cnt++; } - } - } - } - lastcol /= 64; - firstcol /= 64; - - __syncthreads(); - if (lastcol - firstcol < rend - rbeg) { - for (int i = firstcol; i <= lastcol; i += BDIM) { - if (i + threadIdx.x <= lastcol) { ((unsigned long long*)shm)[i + threadIdx.x] = 0ull; } - } - } else { - for (int i = rbeg; i < rend; i += BDIM) { - if (i + threadIdx.x < rend) { shm[cols[i + threadIdx.x] / BITSOF(shm)] = 0; } - } - } - __syncthreads(); - } - __cnt = block_sum_sh(__cnt, (uint64_t*)shm); - if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; - - return; -} - -template -void tricnt_bsh(T nblock, spmat_t* m, uint64_t* ocnt_d, size_t bmld, cudaStream_t stream) -{ - tricnt_bsh_k<<>>( - m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmld); - RAFT_CHECK_CUDA(stream); - return; -} - -//////////////////////////////////////////////////////////////////////////////////////// -template -__global__ void tricnt_wrp_ps_k(const ROW_T ner, - const ROW_T* __restrict__ rows, - const OFF_T* __restrict__ roff, - const ROW_T* __restrict__ cols, - CNT_T* __restrict__ ocnt, - MAP_T* __restrict__ bmap, - const size_t bmld) -{ - __shared__ OFF_T sho[NWARP][WSIZE]; - __shared__ ROW_T shs[NWARP][WSIZE]; - __shared__ ROW_T shc[NWARP][WSIZE]; - - CNT_T __cnt = 0; - ROW_T wid = blockIdx.x * blockDim.y + threadIdx.y; - - bmap += bmld * wid; - for (; wid < ner; wid += gridDim.x * blockDim.y) { - const OFF_T rbeg = roff[rows[wid]]; - const OFF_T rend = roff[rows[wid] + 1]; - - // RLEN_THR1 <= 32 - if (rend - rbeg <= RLEN_THR1) { - const int nloc = rend - rbeg; - - OFF_T soff; - OFF_T eoff; - if (threadIdx.x < nloc) { - const ROW_T c = cols[rbeg + threadIdx.x]; - shc[threadIdx.y][threadIdx.x] = c; - soff = roff[c]; - eoff = roff[c + 1]; - } - - int mysm = -1; - -#pragma unroll - for (int i = 1; i < RLEN_THR1; i++) { - if (i == nloc) break; - - const OFF_T csoff = __shfl_sync(raft::warp_full_mask(), soff, i); - const OFF_T ceoff = __shfl_sync(raft::warp_full_mask(), eoff, i); - - if (ceoff - csoff < RLEN_THR2) { - if (threadIdx.x == i) mysm = i; - continue; - } - for (OFF_T k = ceoff - 1; k >= csoff; k -= WSIZE) { - if (k - (int)threadIdx.x < csoff) break; - - const ROW_T cc = cols[k - threadIdx.x]; - if (cc < shc[threadIdx.y][0]) break; - for (int j = i - 1; j >= 0; j--) { - if (cc == shc[threadIdx.y][j]) { __cnt++; } - } - } - } - if (mysm > -1) { - for (OFF_T k = eoff - 1; k >= soff; k--) { - const ROW_T cc = cols[k]; - if (cc < shc[threadIdx.y][0]) break; - for (int j = mysm - 1; j >= 0; j--) { - if (cc == shc[threadIdx.y][j]) { __cnt++; } - } - } - } - } else { - ROW_T firstcol = cols[rbeg]; - ROW_T lastcol = cols[rend - 1]; - for (OFF_T i = rbeg; i < rend; i += 32) { - const ROW_T c = (i + threadIdx.x < rend) ? 
cols[i + threadIdx.x] : -1; - - if (c > -1) atomicOr(bmap + c / BITSOF(bmap), ((MAP_T)1) << (c % BITSOF(bmap))); - } - - for (OFF_T i = rbeg; i < rend; i += 32) { - const ROW_T c = (i + threadIdx.x < rend) ? cols[i + threadIdx.x] : -1; - sho[threadIdx.y][threadIdx.x] = (c > -1) ? roff[c] : 0; - shc[threadIdx.y][threadIdx.x] = c; - - ROW_T len = (c > -1) ? roff[c + 1] - sho[threadIdx.y][threadIdx.x] : 0; - ROW_T lensum = len; - -#pragma unroll - for (int j = 1; j < 32; j <<= 1) { - lensum += (threadIdx.x >= j) * (__shfl_up_sync(raft::warp_full_mask(), lensum, j)); - } - shs[threadIdx.y][threadIdx.x] = lensum - len; - - lensum = __shfl_sync(raft::warp_full_mask(), lensum, 31); - - int k = WSIZE - 1; - for (int j = lensum - 1; j >= 0; j -= WSIZE) { - if (j < threadIdx.x) break; - - // bisect-right - for (; k >= 0; k--) { - if (shs[threadIdx.y][k] <= j - threadIdx.x) break; - } - - const ROW_T cc = - __ldg(cols + (sho[threadIdx.y][k] + j - threadIdx.x - shs[threadIdx.y][k])); - - if (cc < shc[threadIdx.y][k]) continue; - - const MAP_T mm = ((MAP_T)1) << (cc % BITSOF(bmap)); - if (bmap[cc / BITSOF(bmap)] & mm) { __cnt++; } - } - } - lastcol /= 64; - firstcol /= 64; - - if (lastcol - firstcol < rend - rbeg) { - for (int i = firstcol; i <= lastcol; i += WSIZE) { - if (i + threadIdx.x <= lastcol) { ((unsigned long long*)bmap)[i + threadIdx.x] = 0ull; } - } - } else { - for (int i = rbeg; i < rend; i += WSIZE) { - if (i + threadIdx.x < rend) { bmap[cols[i + threadIdx.x] / BITSOF(bmap)] = 0; } - } - } - } - } - __syncthreads(); - __cnt = block_sum(__cnt); - if (threadIdx.x == 0 && threadIdx.y == 0) { ocnt[blockIdx.x] = __cnt; } - return; -} - -template -void tricnt_wrp( - T nblock, spmat_t* m, uint64_t* ocnt_d, unsigned int* bmap_d, size_t bmld, cudaStream_t stream) -{ - dim3 block(32, THREADS / 32); - tricnt_wrp_ps_k<32, THREADS / 32, WP_LEN_TH1, WP_LEN_TH2> - <<>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmap_d, bmld); - RAFT_CHECK_CUDA(stream); - return; -} - -////////////////////////////////////////////////////////////////////////////////////////// -template -__global__ void tricnt_thr_k(const ROW_T ner, - const ROW_T* __restrict__ rows, - const OFF_T* __restrict__ roff, - const ROW_T* __restrict__ cols, - CNT_T* __restrict__ ocnt) -{ - CNT_T __cnt = 0; - const ROW_T tid = blockIdx.x * BDIM + threadIdx.x; - - for (ROW_T rid = tid; rid < ner; rid += gridDim.x * BDIM) { - const ROW_T r = rows[rid]; - - const OFF_T rbeg = roff[r]; - const OFF_T rend = roff[r + 1]; - const ROW_T rlen = rend - rbeg; - - if (!rlen) continue; - if (rlen <= LOCLEN) { - int nloc = 0; - ROW_T loc[LOCLEN]; - -#pragma unroll - for (nloc = 0; nloc < LOCLEN; nloc++) { - if (rbeg + nloc >= rend) break; - loc[nloc] = __ldg(cols + rbeg + nloc); - } - -#pragma unroll - for (int i = 1; i < LOCLEN; i++) { - if (i == nloc) break; - - const ROW_T c = loc[i]; - const OFF_T soff = roff[c]; - const OFF_T eoff = roff[c + 1]; - - for (OFF_T k = eoff - 1; k >= soff; k--) { - const ROW_T cc = __ldg(cols + k); - if (cc < loc[0]) break; - - for (int j = i - 1; j >= 0; j--) { - if (cc == loc[j]) __cnt++; - } - } - } - } else { - const ROW_T minc = cols[rbeg]; - for (int i = 1; i < rlen; i++) { - const ROW_T c = __ldg(cols + rbeg + i); - const OFF_T soff = roff[c]; - const OFF_T eoff = roff[c + 1]; - - for (OFF_T k = eoff - 1; k >= soff; k--) { - const ROW_T cc = __ldg(cols + k); - if (cc < minc) break; - - for (int j = i - 1; j >= 0; j--) { - if (cc == __ldg(cols + rbeg + j)) __cnt++; - } - } - } - } - } - - __syncthreads(); - __cnt 
= block_sum(__cnt); - if (threadIdx.x == 0) ocnt[blockIdx.x] = __cnt; - - return; -} - -template -void tricnt_thr(T nblock, spmat_t* m, uint64_t* ocnt_d, cudaStream_t stream) -{ - cudaFuncSetCacheConfig(tricnt_thr_k::LOCINT, - typename type_utils::LOCINT, - uint64_t>, - cudaFuncCachePreferL1); - - tricnt_thr_k - <<>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d); - RAFT_CHECK_CUDA(stream); - return; -} - -///////////////////////////////////////////////////////////////// -template -struct NonEmptyRow { - const IndexType* p_roff; - __host__ __device__ NonEmptyRow(const IndexType* roff) : p_roff(roff) {} - __host__ __device__ __forceinline__ bool operator()(const IndexType& a) const - { - return (p_roff[a] < p_roff[a + 1]); - } -}; - -template -void create_nondangling_vector( - const T* roff, T* p_nonempty, T* n_nonempty, size_t n, cudaStream_t stream) -{ - if (n <= 0) return; - thrust::counting_iterator it(0); - NonEmptyRow temp_func(roff); - rmm::device_vector out_num(*n_nonempty); - - cubIf(it, p_nonempty, out_num.data().get(), n, temp_func, stream); - cudaMemcpy(n_nonempty, out_num.data().get(), sizeof(*n_nonempty), cudaMemcpyDeviceToHost); - RAFT_CHECK_CUDA(stream); -} - -template -uint64_t reduce(uint64_t* v_d, T n, cudaStream_t stream) -{ - rmm::device_vector tmp(1); - - cubSum(v_d, tmp.data().get(), n, stream); - RAFT_CHECK_CUDA(stream); - - return tmp[0]; -} - -template -class TrianglesCount { - private: - uint64_t m_triangles_number; - spmat_t m_mat; - int m_shared_mem_per_block{}; - int m_multi_processor_count{}; - int m_max_threads_per_multi_processor{}; - - rmm::device_vector m_seq; - - cudaStream_t m_stream; - - bool m_done; - - void tcount_bsh(); - void tcount_b2b(); - void tcount_wrp(); - void tcount_thr(); - - public: - // Simple constructor - TrianglesCount(IndexType num_vertices, - IndexType num_edges, - IndexType const* row_offsets, - IndexType const* col_indices, - cudaStream_t stream = NULL); - - void count(); - inline uint64_t get_triangles_count() const { return m_triangles_number; } -}; - -template -TrianglesCount::TrianglesCount(IndexType num_vertices, - IndexType num_edges, - IndexType const* row_offsets, - IndexType const* col_indices, - cudaStream_t stream) - : m_mat{num_vertices, num_edges, num_vertices, row_offsets, nullptr, col_indices}, - m_stream{stream}, - m_done{true} -{ - int device_id; - cudaGetDevice(&device_id); - - cudaDeviceGetAttribute(&m_shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id); - RAFT_CHECK_CUDA(m_stream); - cudaDeviceGetAttribute(&m_multi_processor_count, cudaDevAttrMultiProcessorCount, device_id); - RAFT_CHECK_CUDA(m_stream); - cudaDeviceGetAttribute( - &m_max_threads_per_multi_processor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id); - RAFT_CHECK_CUDA(m_stream); - - m_seq.resize(m_mat.N, IndexType{0}); - create_nondangling_vector(m_mat.roff_d, m_seq.data().get(), &(m_mat.nrows), m_mat.N, m_stream); - m_mat.rows_d = m_seq.data().get(); -} - -template -void TrianglesCount::tcount_bsh() -{ - CUGRAPH_EXPECTS(not(m_shared_mem_per_block * 8 < m_mat.nrows), - "Number of vertices too high for TrainglesCount."); - /// if (m_shared_mem_per_block * 8 < (size_t)m_mat.nrows) { - /// FatalError("Number of vertices too high to use this kernel!", NVGRAPH_ERR_BAD_PARAMETERS); - ///} - - size_t bmld = bitmap_roundup(m_mat.N); - int nblock = m_mat.nrows; - - rmm::device_vector ocnt_d(nblock, uint64_t{0}); - - tricnt_bsh(nblock, &m_mat, ocnt_d.data().get(), bmld, m_stream); - m_triangles_number = reduce(ocnt_d.data().get(), 
nblock, m_stream); -} - -template -void TrianglesCount::tcount_b2b() -{ - // allocate a big enough array for output - - rmm::device_vector ocnt_d(m_mat.nrows, uint64_t{0}); - - size_t bmldL1 = bitmap_roundup(m_mat.N); - - size_t free_bytes, total_bytes; - cudaMemGetInfo(&free_bytes, &total_bytes); - RAFT_CHECK_CUDA(m_stream); - - size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmldL1); - - int nblock = static_cast(MIN(nblock_available, static_cast(m_mat.nrows))); - - // allocate level 1 bitmap - rmm::device_vector bmapL1_d(bmldL1 * nblock, uint32_t{0}); - - // allocate level 0 bitmap - size_t bmldL0 = bitmap_roundup(DIV_UP(m_mat.N, BLK_BWL0)); - rmm::device_vector bmapL0_d(nblock * bmldL0, uint32_t{0}); - - tricnt_b2b(nblock, - &m_mat, - ocnt_d.data().get(), - bmapL0_d.data().get(), - bmldL0, - bmapL1_d.data().get(), - bmldL1, - m_stream); - m_triangles_number = reduce(ocnt_d.data().get(), nblock, m_stream); -} - -template -void TrianglesCount::tcount_wrp() -{ - // allocate a big enough array for output - rmm::device_vector ocnt_d(DIV_UP(m_mat.nrows, (THREADS / 32)), uint64_t{0}); - - size_t bmld = bitmap_roundup(m_mat.N); - - // number of blocks limited by birmap size - size_t free_bytes, total_bytes; - cudaMemGetInfo(&free_bytes, &total_bytes); - RAFT_CHECK_CUDA(m_stream); - - size_t nblock_available = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmld * (THREADS / 32)); - - int nblock = static_cast( - MIN(nblock_available, static_cast(DIV_UP(m_mat.nrows, (THREADS / 32))))); - - size_t bmap_sz = bmld * nblock * (THREADS / 32); - - rmm::device_vector bmap_d(bmap_sz, uint32_t{0}); - - tricnt_wrp(nblock, &m_mat, ocnt_d.data().get(), bmap_d.data().get(), bmld, m_stream); - m_triangles_number = reduce(ocnt_d.data().get(), nblock, m_stream); -} - -template -void TrianglesCount::tcount_thr() -{ - int maxblocks = m_multi_processor_count * m_max_threads_per_multi_processor / THREADS; - - int nblock = MIN(maxblocks, DIV_UP(m_mat.nrows, THREADS)); - - rmm::device_vector ocnt_d(nblock, uint64_t{0}); - - tricnt_thr(nblock, &m_mat, ocnt_d.data().get(), m_stream); - m_triangles_number = reduce(ocnt_d.data().get(), nblock, m_stream); -} - -template -void TrianglesCount::count() -{ - double mean_deg = (double)m_mat.nnz / m_mat.nrows; - if (mean_deg < DEG_THR1) - tcount_thr(); - else if (mean_deg < DEG_THR2) - tcount_wrp(); - else { - const int shMinBlkXSM = 6; - if (static_cast(m_shared_mem_per_block * 8 / shMinBlkXSM) < - static_cast(m_mat.N)) - tcount_b2b(); - else - tcount_bsh(); - } -} - -} // namespace - -template -uint64_t triangle_count(legacy::GraphCSRView const& graph) -{ - TrianglesCount counter( - graph.number_of_vertices, graph.number_of_edges, graph.offsets, graph.indices); - - counter.count(); - return counter.get_triangles_count(); -} - -template uint64_t triangle_count( - legacy::GraphCSRView const&); - -} // namespace triangle -} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 80fb2580c03..6777819d468 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -338,10 +338,6 @@ ConfigureTest(ECG_TEST community/ecg_test.cpp) # - Balanced cut clustering tests ----------------------------------------------------------------- ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp) -################################################################################################### -# - TRIANGLE tests -------------------------------------------------------------------------------- -ConfigureTest(TRIANGLE_TEST 
community/triangle_test.cu) - ################################################################################################### # - EGO tests -------------------------------------------------------------------------------- ConfigureTest(EGO_TEST community/egonet_test.cu) diff --git a/cpp/tests/community/triangle_test.cu b/cpp/tests/community/triangle_test.cu deleted file mode 100644 index f607ed6bf87..00000000000 --- a/cpp/tests/community/triangle_test.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ -#include - -#include -#include - -#include -#include - -TEST(triangle, dolphin) -{ - std::vector off_h = {0, 6, 14, 18, 21, 22, 26, 32, 37, 43, 50, 55, 56, - 57, 65, 77, 84, 90, 99, 106, 110, 119, 125, 126, 129, 135, - 138, 141, 146, 151, 160, 165, 166, 169, 179, 184, 185, 192, 203, - 211, 213, 221, 226, 232, 239, 243, 254, 256, 262, 263, 265, 272, - 282, 286, 288, 295, 297, 299, 308, 309, 314, 315, 318}; - std::vector ind_h = { - 10, 14, 15, 40, 42, 47, 17, 19, 26, 27, 28, 36, 41, 54, 10, 42, 44, 61, 8, 14, 59, 51, 9, - 13, 56, 57, 9, 13, 17, 54, 56, 57, 19, 27, 30, 40, 54, 3, 20, 28, 37, 45, 59, 5, 6, 13, - 17, 32, 41, 57, 0, 2, 29, 42, 47, 51, 33, 5, 6, 9, 17, 32, 41, 54, 57, 0, 3, 16, 24, - 33, 34, 37, 38, 40, 43, 50, 52, 0, 18, 24, 40, 45, 55, 59, 14, 20, 33, 37, 38, 50, 1, 6, - 9, 13, 22, 25, 27, 31, 57, 15, 20, 21, 24, 29, 45, 51, 1, 7, 30, 54, 8, 16, 18, 28, 36, - 38, 44, 47, 50, 18, 29, 33, 37, 45, 51, 17, 36, 45, 51, 14, 15, 18, 29, 45, 51, 17, 26, 27, - 1, 25, 27, 1, 7, 17, 25, 26, 1, 8, 20, 30, 47, 10, 18, 21, 24, 35, 43, 45, 51, 52, 7, - 19, 28, 42, 47, 17, 9, 13, 60, 12, 14, 16, 21, 34, 37, 38, 40, 43, 50, 14, 33, 37, 44, 49, - 29, 1, 20, 23, 37, 39, 40, 59, 8, 14, 16, 21, 33, 34, 36, 40, 43, 45, 61, 14, 16, 20, 33, - 43, 44, 52, 58, 36, 57, 0, 7, 14, 15, 33, 36, 37, 52, 1, 9, 13, 54, 57, 0, 2, 10, 30, - 47, 50, 14, 29, 33, 37, 38, 46, 53, 2, 20, 34, 38, 8, 15, 18, 21, 23, 24, 29, 37, 50, 51, - 59, 43, 49, 0, 10, 20, 28, 30, 42, 57, 34, 46, 14, 16, 20, 33, 42, 45, 51, 4, 11, 18, 21, - 23, 24, 29, 45, 50, 55, 14, 29, 38, 40, 43, 61, 1, 6, 7, 13, 19, 41, 57, 15, 51, 5, 6, - 5, 6, 9, 13, 17, 39, 41, 48, 54, 38, 3, 8, 15, 36, 45, 32, 2, 37, 53}; - - std::vector w_h(ind_h.size(), float{1.0}); - - int num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - uint64_t expected{285}; - - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - - cugraph::legacy::GraphCSRView graph_csr( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - uint64_t count{0}; - - try { - count = cugraph::triangle::triangle_count(graph_csr); - } catch (std::exception& e) { - std::cout << "Exception: " << e.what() << std::endl; - } - - ASSERT_EQ(count, expected); -} - -TEST(triangle, karate) -{ - using vertex_t = int32_t; - using edge_t = int32_t; - using weight_t = float; - - std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, - 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, - 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; - 
std::vector ind_h = { - 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, - 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, - 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, - 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, - 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, - 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, - 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; - - std::vector w_h(ind_h.size(), weight_t{1.0}); - - vertex_t num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - uint64_t expected{135}; - - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - - cugraph::legacy::GraphCSRView graph_csr( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - uint64_t count{0}; - - try { - count = cugraph::triangle::triangle_count(graph_csr); - } catch (std::exception& e) { - std::cout << "Exception: " << e.what() << std::endl; - } - - ASSERT_EQ(count, expected); -} - -CUGRAPH_TEST_PROGRAM_MAIN()
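
For reference, the removed kernels (tricnt_thr_k, tricnt_wrp_ps_k, tricnt_bsh_k, tricnt_b2b_k) all implement the same counting scheme: for every vertex r and every ordered pair of its sorted neighbors (a, b) with a < b, a triangle is counted when a also appears in b's adjacency list, so on a symmetric CSR each triangle is counted three times, which is consistent with the expected values 285 and 135 in the removed tests. Below is a minimal host-side C++ sketch of that scheme, assuming sorted CSR rows; triangle_count_reference is an illustrative name only, not a cuGraph symbol, and this is not part of the patch.

// Serial reference of the counting scheme used by the removed CUDA kernels.
// For each vertex r and each pair of neighbors (a, b) of r with a < b,
// count a triangle if a is also adjacent to b.  On a symmetric CSR this
// counts every triangle three times (once per corner vertex).
#include <algorithm>
#include <cstdint>
#include <vector>

uint64_t triangle_count_reference(std::vector<int> const& roff,  // CSR row offsets, size V+1
                                  std::vector<int> const& cols)  // CSR column indices, sorted per row
{
  uint64_t count         = 0;
  int const num_vertices = static_cast<int>(roff.size()) - 1;
  for (int r = 0; r < num_vertices; ++r) {
    for (int i = roff[r]; i < roff[r + 1]; ++i) {
      for (int j = roff[r]; j < i; ++j) {
        // neighbors a = cols[j] and b = cols[i] of r, with a < b since rows are sorted;
        // check whether a is also adjacent to b via binary search in b's row
        int const a = cols[j];
        int const b = cols[i];
        if (std::binary_search(cols.begin() + roff[b], cols.begin() + roff[b + 1], a)) { ++count; }
      }
    }
  }
  return count;
}

// Example: a single triangle 0-1-2 stored symmetrically yields a count of 3.
//   roff = {0, 2, 4, 6}, cols = {1, 2, 0, 2, 0, 1}
//   triangle_count_reference(roff, cols) == 3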