From 5efa9be09b848c026d0e680984fdf395038e9120 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 28 May 2020 12:15:24 -0500
Subject: [PATCH 01/88] Spectral partition header.

---
 cpp/include/raft/spectral/partition.hpp | 82 +++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 cpp/include/raft/spectral/partition.hpp

diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
new file mode 100644
index 0000000000..747ce510da
--- /dev/null
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace raft {
+
+  /// Spectral graph partition
+  /** Compute partition for a weighted undirected graph. This
+   * partition attempts to minimize the cost function:
+   *   Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+   *
+   * @param graph Weighted graph in CSR format.
+   * @param nParts Number of partitions.
+   * @param nEigVecs Number of eigenvectors to compute.
+   * @param maxIter_lanczos Maximum number of Lanczos iterations.
+   * @param restartIter_lanczos Maximum size of Lanczos system before
+   *   implicit restart.
+   * @param tol_lanczos Convergence tolerance for Lanczos method.
+   * @param maxIter_kmeans Maximum number of k-means iterations.
+   * @param tol_kmeans Convergence tolerance for k-means algorithm.
+   * @param parts (Output, device memory, n entries) Partition
+   *   assignments.
+   * @param eigVals (Output, device memory, nEigVecs entries)
+   *   Computed eigenvalues.
+   * @param eig_vects (Output, device memory, n*nEigVecs entries)
+   *   Computed eigenvectors.
+   * @return error flag.
+   */
+  template <typename vertex_t, typename edge_t, typename weight_t,
+            template <typename, typename, typename> typename GraphView>
+  int partition(GraphView<vertex_t, edge_t, weight_t> const &graph,
+                vertex_t nParts,
+                vertex_t nEigVecs,
+                int maxIter_lanczos,
+                int restartIter_lanczos,
+                weight_t tol_lanczos,
+                int maxIter_kmeans,
+                weight_t tol_kmeans,
+                vertex_t *__restrict__ parts,
+                weight_t *eigVals,
+                weight_t *eig_vects);
+
+  /// Compute cost function for partition
+  /** This function determines the edges cut by a partition and a cost
+   * function:
+   *   Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
+   * Graph is assumed to be weighted and undirected.
+   *
+   * @param graph Weighted graph in CSR format.
+   * @param nParts Number of partitions.
+   * @param parts (Input, device memory, n entries) Partition
+   *   assignments.
+   * @param edgeCut On exit, weight of edges cut by partition.
+   * @param cost On exit, partition cost function.
+   * @return error flag.
+   */
+  template <typename vertex_t, typename edge_t, typename weight_t,
+            template <typename, typename, typename> typename GraphView>
+  int analyzePartition(GraphView<vertex_t, edge_t, weight_t> const &graph,
+                       vertex_t nParts,
+                       vertex_t const *__restrict__ parts,
+                       weight_t &edgeCut,
+                       weight_t &cost);
+
+}  // namespace raft

From f405fee0abaa68679cae06e041fec253c68283cc Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 28 May 2020 12:32:33 -0500
Subject: [PATCH 02/88] Updated CHANGELOG.md.
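
Context for reviewers of PR #12: PATCH 01 only declares the partition API.
A minimal, illustrative call site is sketched below; the GraphCSRView type,
the device pointers, and all parameter values are assumptions for
illustration, not part of this patch:

    // Hypothetical usage sketch, assuming a CSR graph view over
    // pre-populated device buffers (offsets, indices, weights):
    GraphCSRView<int, int, float> graph{offsets, indices, weights,
                                        num_vertices, num_edges};
    int status = raft::partition(graph,
                                 /*nParts=*/2, /*nEigVecs=*/2,
                                 /*maxIter_lanczos=*/4000,
                                 /*restartIter_lanczos=*/500,
                                 /*tol_lanczos=*/1e-4f,
                                 /*maxIter_kmeans=*/200,
                                 /*tol_kmeans=*/1e-5f,
                                 parts, eigVals, eig_vects);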
--- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9490099450..d9a391369f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ## New Features - Initial RAFT version - PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes +- PR #12: Spectral Clustering ## Bug Fixes - PR #5: Small build.sh fixes From c5bf5389200eca08d03effb488f67e9071008673 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 2 Jun 2020 15:50:10 -0500 Subject: [PATCH 03/88] Adding the main functionality. --- cpp/include/raft/spectral/kmeans.hpp | 935 +++++++++++ cpp/include/raft/spectral/lanczos.hpp | 1487 +++++++++++++++++ .../raft/spectral/modularity_maximization.hpp | 436 +++++ cpp/include/raft/spectral/partition.hpp | 472 +++++- cpp/include/raft/spectral/spectral_matrix.hpp | 1185 +++++++++++++ 5 files changed, 4450 insertions(+), 65 deletions(-) create mode 100644 cpp/include/raft/spectral/kmeans.hpp create mode 100644 cpp/include/raft/spectral/lanczos.hpp create mode 100644 cpp/include/raft/spectral/modularity_maximization.hpp create mode 100644 cpp/include/raft/spectral/spectral_matrix.hpp diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp new file mode 100644 index 0000000000..691df3e5ce --- /dev/null +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -0,0 +1,935 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION +//#ifdef DEBUG + +#include "include/kmeans.hxx" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/atomics.hxx" +#include "include/debug_macros.h" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/sm_utils.h" + +using namespace nvgraph; + +// ========================================================= +// Useful macros +// ========================================================= + +#define BLOCK_SIZE 1024 +#define WARP_SIZE 32 +#define BSIZE_DIV_WSIZE (BLOCK_SIZE / WARP_SIZE) + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace { + +// ========================================================= +// CUDA kernels +// ========================================================= + +/// Compute distances between observation vectors and centroids +/** Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, d*n entries) Observation matrix. Matrix is + * stored column-major and each column is an observation + * vector. Matrix dimensions are d x n. + * @param centroids (Input, d*k entries) Centroid matrix. 
Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Output, n*k entries) Distance matrix. Matrix is + * stored column-major and the (i,j)-entry is the square of the + * Euclidean distance between the ith observation vector and jth + * centroid. Matrix dimensions are n x k. Entries must be + * initialized to zero. + */ +template +static __global__ void computeDistances(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, + ValueType_* __restrict__ dists) +{ + // Loop index + IndexType_ i; + + // Block indices + IndexType_ bidx; + // Global indices + IndexType_ gidx, gidy, gidz; + + // Private memory + ValueType_ centroid_private, dist_private; + + // Global x-index indicates index of vector entry + bidx = blockIdx.x; + while (bidx * blockDim.x < d) { + gidx = threadIdx.x + bidx * blockDim.x; + + // Global y-index indicates centroid + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Load centroid coordinate from global memory + centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; + + // Global z-index indicates observation vector + gidz = threadIdx.z + blockIdx.z * blockDim.z; + while (gidz < n) { + // Load observation vector coordinate from global memory + dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; + + // Compute contribution of current entry to distance + dist_private = centroid_private - dist_private; + dist_private = dist_private * dist_private; + + // Perform reduction on warp + for (i = WARP_SIZE / 2; i > 0; i /= 2) + dist_private += utils::shfl_down(dist_private, i, 2 * i); + + // Write result to global memory + if (threadIdx.x == 0) atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + + // Move to another observation vector + gidz += blockDim.z * gridDim.z; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } + + // Move to another vector entry + bidx += gridDim.x; + } +} + +/// Find closest centroid to observation vectors +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param centroids (Input, d*k entries) Centroid matrix. Matrix is + * stored column-major and each column is a centroid. Matrix + * dimensions are d x k. + * @param dists (Input/output, n*k entries) Distance matrix. Matrix + * is stored column-major and the (i,j)-entry is the square of + * the Euclidean distance between the ith observation vector and + * jth centroid. Matrix dimensions are n x k. On exit, the first + * n entries give the square of the Euclidean distance between + * observation vectors and closest centroids. + * @param codes (Output, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. 
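+ *
+ * Illustrative sequential equivalent (an exposition aid, not part of the
+ * original nvgraph documentation):
+ *
+ *   for (i = 0; i < n; ++i) {
+ *     codes[i] = argmin over j in [0,k) of dists(i,j);
+ *     dists[i] = dists(i, codes[i]);   // squared distance to winner
+ *     clusterSizes[codes[i]] += 1;     // atomicAdd in the kernel
+ *   }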
+ */ +template +static __global__ void minDistances(IndexType_ n, + IndexType_ k, + ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) +{ + // Loop index + IndexType_ i, j; + + // Current matrix entry + ValueType_ dist_curr; + + // Smallest entry in row + ValueType_ dist_min; + IndexType_ code_min; + + // Each row in observation matrix is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Find minimum entry in row + code_min = 0; + dist_min = dists[IDX(i, 0, n)]; + for (j = 1; j < k; ++j) { + dist_curr = dists[IDX(i, j, n)]; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + } + + // Transfer result to global memory + dists[i] = dist_min; + codes[i] = code_min; + + // Increment cluster sizes + atomicAdd(clusterSizes + code_min, 1); + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/// Check if newly computed distances are smaller than old distances +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param dists_old (Input/output, n entries) Distances between + * observation vectors and closest centroids. On exit, entries + * are replaced by entries in 'dists_new' if the corresponding + * observation vectors are closest to the new centroid. + * @param dists_new (Input, n entries) Distance between observation + * vectors and new centroid. + * @param codes_old (Input/output, n entries) Cluster + * assignments. On exit, entries are replaced with 'code_new' if + * the corresponding observation vectors are closest to the new + * centroid. + * @param code_new Index associated with new centroid. + */ +template +static __global__ void minDistances2(IndexType_ n, + ValueType_* __restrict__ dists_old, + const ValueType_* __restrict__ dists_new, + IndexType_* __restrict__ codes_old, + IndexType_ code_new) +{ + // Loop index + IndexType_ i; + + // Distances + ValueType_ dist_old_private; + ValueType_ dist_new_private; + + // Each row is processed by a thread + i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + // Get old and new distances + dist_old_private = dists_old[i]; + dist_new_private = dists_new[i]; + + // Update if new distance is smaller than old distance + if (dist_new_private < dist_old_private) { + dists_old[i] = dist_new_private; + codes_old[i] = code_new; + } + + // Move to another row + i += blockDim.x * gridDim.x; + } +} + +/// Compute size of k-means clusters +/** Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * + * @param n Number of observation vectors. + * @param k Number of clusters. + * @param codes (Input, n entries) Cluster assignments. + * @param clusterSizes (Output, k entries) Number of points in each + * cluster. Entries must be initialized to zero. + */ +template +static __global__ void computeClusterSizes(IndexType_ n, + IndexType_ k, + const IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) +{ + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + atomicAdd(clusterSizes + codes[i], 1); + i += blockDim.x * gridDim.x; + } +} + +/// Divide rows of centroid matrix by cluster sizes +/** Divides the ith column of the sum matrix by the size of the ith + * cluster. 
If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param clusterSizes (Input, k entries) Number of points in each + * cluster. + * @param centroids (Input/output, d*k entries) Sum matrix. Matrix + * is stored column-major and matrix dimensions are d x k. The + * ith column is the sum of all observation vectors in the ith + * cluster. On exit, the matrix is the centroid matrix (each + * column is the mean position of a cluster). + */ +template +static __global__ void divideCentroids(IndexType_ d, + IndexType_ k, + const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids) +{ + // Global indices + IndexType_ gidx, gidy; + + // Current cluster size + IndexType_ clusterSize_private; + + // Observation vector is determined by global y-index + gidy = threadIdx.y + blockIdx.y * blockDim.y; + while (gidy < k) { + // Get cluster size from global memory + clusterSize_private = clusterSizes[gidy]; + + // Add vector entries to centroid matrix + // Vector entris are determined by global x-index + gidx = threadIdx.x + blockIdx.x * blockDim.x; + while (gidx < d) { + centroids[IDX(gidx, gidy, d)] /= clusterSize_private; + gidx += blockDim.x * gridDim.x; + } + + // Move to another centroid + gidy += blockDim.y * gridDim.y; + } +} + +// ========================================================= +// Helper functions +// ========================================================= + +/// Randomly choose new centroids +/** Centroid is randomly chosen with k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param rand Random number drawn uniformly from [0,1). + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are n x d. + * @param dists (Input, device memory, 2*n entries) Workspace. The + * first n entries should be the distance between observation + * vectors and the closest centroid. + * @param centroid (Output, device memory, d entries) Centroid + * coordinates. + * @return Zero if successful. Otherwise non-zero. 
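+ *
+ * Illustrative note (an exposition aid, not original nvgraph text):
+ * sampling is inverse-transform sampling on the distance distribution,
+ *
+ *   cumsum[i] = dists[0] + ... + dists[i]              // inclusive_scan
+ *   obsIndex  = lower_bound(cumsum, rand * cumsum[n-1])
+ *
+ * so observation i is selected with probability dists[i] / sum(dists),
+ * i.e. proportional to its squared distance to the closest centroid.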
+ */ +template +static int chooseNewCentroid(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ rand, + const ValueType_* __restrict__ obs, + ValueType_* __restrict__ dists, + ValueType_* __restrict__ centroid) +{ + using namespace thrust; + + // Cumulative sum of distances + ValueType_* distsCumSum = dists + n; + // Residual sum of squares + ValueType_ distsSum; + // Observation vector that is chosen as new centroid + IndexType_ obsIndex; + + // Compute cumulative sum of distances + inclusive_scan( + device_pointer_cast(dists), device_pointer_cast(dists + n), device_pointer_cast(distsCumSum)); + cudaCheckError(); + CHECK_CUDA( + cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); + + // Randomly choose observation vector + // Probabilities are proportional to square of distance to closest + // centroid (see k-means++ algorithm) + obsIndex = + (lower_bound( + device_pointer_cast(distsCumSum), device_pointer_cast(distsCumSum + n), distsSum * rand) - + device_pointer_cast(distsCumSum)); + cudaCheckError(); + obsIndex = max(obsIndex, 0); + obsIndex = min(obsIndex, n - 1); + + // Record new centroid position + CHECK_CUDA(cudaMemcpyAsync( + centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + + return 0; +} + +/// Choose initial cluster centroids for k-means algorithm +/** Centroids are randomly chosen with k-means++ algorithm + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param dists (Output, device memory, 2*n entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @return Zero if successful. Otherwise non-zero. 
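+ *
+ * Illustrative outline of the k-means++ loop (an exposition aid, not
+ * original nvgraph text):
+ *
+ *   dists[:] = 1;                                // first pick is uniform
+ *   choose centroid 0 (chooseNewCentroid);
+ *   dists = squared distances to centroid 0 (computeDistances);
+ *   for (i = 1; i < k; ++i) {
+ *     choose centroid i with prob. proportional to dists;
+ *     dists_new = squared distances to centroid i;
+ *     dists = elementwise min(dists, dists_new);  // minDistances2
+ *   }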
+ */ +template +static int initializeCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + ValueType_* __restrict__ centroids, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ dists) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Loop index + IndexType_ i; + + // CUDA grid dimensions + dim3 blockDim_warp, gridDim_warp, gridDim_block; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Initialize grid dimensions + blockDim_warp.x = WARP_SIZE; + blockDim_warp.y = 1; + blockDim_warp.z = BSIZE_DIV_WSIZE; + gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim_warp.y = 1; + gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim_block.y = 1; + gridDim_block.z = 1; + + // Assign observation vectors to code 0 + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + + // Choose first centroid + thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); + cudaCheckError(); + if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from first centroid + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_))); + computeDistances<<>>(n, d, 1, obs, centroids, dists); + cudaCheckError() + + // Choose remaining centroids + for (i = 1; i < k; ++i) + { + // Choose ith centroid + if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + WARNING("error in k-means++ (could not pick centroid)"); + + // Compute distances from ith centroid + CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_))); + computeDistances<<>>( + n, d, 1, obs, centroids + IDX(0, i, d), dists + n); + cudaCheckError(); + + // Recompute minimum distances + minDistances2<<>>(n, dists, dists + n, codes, i); + cudaCheckError(); + } + + // Compute cluster sizes + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + computeClusterSizes<<>>(n, k, codes, clusterSizes); + cudaCheckError(); + + return 0; +} + +/// Find cluster centroids closest to observation vectors +/** Distance is measured with Euclidean norm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param centroids (Input, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param dists (Output, device memory, n*k entries) Workspace. On + * exit, the first n entries give the square of the Euclidean + * distance between observation vectors and the closest centroid. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. 
+ * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares of assignment. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int assignCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, + ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* residual_host) +{ + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Compute distance between centroids and observation vectors + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_))); + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE / WARP_SIZE; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + computeDistances<<>>(n, d, k, obs, centroids, dists); + cudaCheckError(); + + // Find centroid closest to each observation vector + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); + cudaCheckError(); + + // Compute residual sum of squares + *residual_host = + thrust::reduce(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + + return 0; +} + +/// Update cluster centroids for k-means algorithm +/** All clusters are assumed to be non-empty. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Input, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Input, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*d entries) Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. + * @return Zero if successful. Otherwise non-zero. 
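+ *
+ * Illustrative outline (an exposition aid, not original nvgraph text):
+ *
+ *   obs_copy = transpose(obs)                  // n x d, via Cublas::geam
+ *   key[e]   = codes[e mod n]                  // cluster of entry e
+ *   row[e]   = e / n                           // coordinate of entry e
+ *   stable_sort_by_key(key, (obs_copy, row))   // group entries by cluster
+ *   reduce_by_key(row, obs_copy) -> centroids  // per-cluster coordinate sums
+ *   centroids(., j) /= clusterSizes[j]         // divideCentroids kernel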
+ */ +template +static int updateCentroids(IndexType_ n, + IndexType_ d, + IndexType_ k, + const ValueType_* __restrict__ obs, + const IndexType_* __restrict__ codes, + const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, + ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int) +{ + using namespace thrust; + + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // CUDA grid dimensions + dim3 blockDim, gridDim; + + // Device memory + device_ptr obs_copy(work); + device_ptr codes_copy(work_int); + device_ptr rows(work_int + d * n); + + // Take transpose of observation matrix + Cublas::geam( + true, false, n, d, &one, obs, d, &zero, (ValueType_*)NULL, n, raw_pointer_cast(obs_copy), n); + + // Cluster assigned to each observation matrix entry + sequence(rows, rows + d * n); + cudaCheckError(); + transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); + cudaCheckError(); + gather(rows, rows + d * n, device_pointer_cast(codes), codes_copy); + cudaCheckError(); + + // Row associated with each observation matrix entry + sequence(rows, rows + d * n); + cudaCheckError(); + transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); + cudaCheckError(); + + // Sort and reduce to add observation vectors in same cluster + stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); + cudaCheckError(); + reduce_by_key(rows, + rows + d * n, + obs_copy, + codes_copy, // Output to codes_copy is ignored + device_pointer_cast(centroids)); + cudaCheckError(); + + // Divide sums by cluster size to get centroid matrix + blockDim.x = WARP_SIZE; + blockDim.y = BLOCK_SIZE / WARP_SIZE; + blockDim.z = 1; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim.z = 1; + divideCentroids<<>>(d, k, clusterSizes, centroids); + cudaCheckError(); + + return 0; +} + +} // namespace + +namespace nvgraph { + +// ========================================================= +// k-means algorithm +// ========================================================= + +/// Find clusters with k-means algorithm +/** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param clusterSizes (Output, device memory, k entries) Number of + * points in each cluster. + * @param centroids (Output, device memory, d*k entries) Centroid + * matrix. Matrix is stored column-major and each column is a + * centroid. Matrix dimensions are d x k. + * @param work (Output, device memory, n*max(k,d) entries) + * Workspace. + * @param work_int (Output, device memory, 2*d*n entries) + * Workspace. 
+ * @param residual_host (Output, host memory, 1 entry) Residual sum + * of squares (sum of squares of distances between observation + * vectors and centroids). + * @param iters_host (Output, host memory, 1 entry) Number of + * k-means iterations. + * @return NVGRAPH error flag. + */ +template +NVGRAPH_ERROR kmeans(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ tol, + IndexType_ maxiter, + const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, + ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int, + ValueType_* residual_host, + IndexType_* iters_host) +{ + // ------------------------------------------------------- + // Variable declarations + // ------------------------------------------------------- + + // Current iteration + IndexType_ iter; + + // Residual sum of squares at previous iteration + ValueType_ residualPrev = 0; + + // Random number generator + thrust::default_random_engine rng(123456); + thrust::uniform_real_distribution uniformDist(0, 1); + + // ------------------------------------------------------- + // Initialization + // ------------------------------------------------------- + + // Check that parameters are valid + if (n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Trivial cases + if (k == 1) { + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice)); + if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not compute k-means centroids"); + dim3 blockDim, gridDim; + blockDim.x = WARP_SIZE; + blockDim.y = 1; + blockDim.z = BLOCK_SIZE / WARP_SIZE; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = 1; + gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_))); + computeDistances<<>>(n, d, 1, obs, centroids, work); + cudaCheckError(); + *residual_host = + thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); + cudaCheckError(); + return NVGRAPH_OK; + } + if (n <= k) { + thrust::sequence(thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); + cudaCheckError(); + thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); + cudaCheckError(); + + if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_))); + CHECK_CUDA( + cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + *residual_host = 0; + return NVGRAPH_OK; + } + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // ------------------------------------------------------- + // k-means++ algorithm + // ------------------------------------------------------- + + // Choose initial cluster centroids + if (initializeCentroids(n, d, k, obs, centroids, codes, clusterSizes, work)) + WARNING("could not initialize k-means centroids"); + + // Apply k-means iteration until convergence 
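+ // (Illustrative note: the loop below stops early once the normalized
+ // residual change, fabs(residualPrev - *residual_host) / n, falls under
+ // tol; empty clusters are re-seeded with k-means++ before the test.)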
+ for (iter = 0; iter < maxiter; ++iter) { + // Update cluster centroids + if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + WARNING("could not update k-means centroids"); + + // Determine centroid closest to each observation + residualPrev = *residual_host; + if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + + // Reinitialize empty clusters with new centroids + IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + + // FIXME: emptyCentroid never reaches k (infinite loop) under certain + // conditions, such as if obs is corrupt (as seen as a result of a + // DataFrame column of NULL edge vals used to create the Graph) + while (emptyCentroid < k) { + if (chooseNewCentroid( + n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + WARNING("could not replace empty centroid"); + if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + WARNING("could not assign observation vectors to k-means clusters"); + emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); + cudaCheckError(); + } + + // Check for convergence + if (fabs(residualPrev - (*residual_host)) / n < tol) { + ++iter; + break; + } + } + + // Warning if k-means has failed to converge + if (fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + + *iters_host = iter; + return NVGRAPH_OK; +} + +/// Find clusters with k-means algorithm +/** Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * + * CNMEM must be initialized before calling this function. + * + * @param n Number of observation vectors. + * @param d Dimension of observation vectors. + * @param k Number of clusters. + * @param tol Tolerance for convergence. k-means stops when the + * change in residual divided by n is less than tol. + * @param maxiter Maximum number of k-means iterations. + * @param obs (Input, device memory, d*n entries) Observation + * matrix. Matrix is stored column-major and each column is an + * observation vector. Matrix dimensions are d x n. + * @param codes (Output, device memory, n entries) Cluster + * assignments. + * @param residual On exit, residual sum of squares (sum of squares + * of distances between observation vectors and centroids). + * @param On exit, number of k-means iterations. 
+ * @return NVGRAPH error flag + */ +template +NVGRAPH_ERROR kmeans(IndexType_ n, + IndexType_ d, + IndexType_ k, + ValueType_ tol, + IndexType_ maxiter, + const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, + ValueType_& residual, + IndexType_& iters) +{ + // Check that parameters are valid + if (n < 1) { + WARNING("invalid parameter (n<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (d < 1) { + WARNING("invalid parameter (d<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (k < 1) { + WARNING("invalid parameter (k<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (maxiter < 0) { + WARNING("invalid parameter (maxiter<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + + // Allocate memory + // TODO: handle non-zero CUDA streams + cudaStream_t stream = 0; + Vector clusterSizes(k, stream); + Vector centroids(d * k, stream); + Vector work(n * max(k, d), stream); + Vector work_int(2 * d * n, stream); + + // Perform k-means + return kmeans(n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters); +} + +// ========================================================= +// Explicit instantiations +// ========================================================= + +template NVGRAPH_ERROR kmeans(int n, + int d, + int k, + float tol, + int maxiter, + const float* __restrict__ obs, + int* __restrict__ codes, + float& residual, + int& iters); +template NVGRAPH_ERROR kmeans(int n, + int d, + int k, + double tol, + int maxiter, + const double* __restrict__ obs, + int* __restrict__ codes, + double& residual, + int& iters); +} // namespace nvgraph +//#endif //NVGRAPH_PARTITION +//#endif //debug diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp new file mode 100644 index 0000000000..ad49be1c05 --- /dev/null +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -0,0 +1,1487 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//#ifdef NVGRAPH_PARTITION + +#define _USE_MATH_DEFINES +#include +#include "include/lanczos.hxx" + +#include +#include +#include + +#include +#include + +#include "include/debug_macros.h" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_error.hxx" +#include "include/nvgraph_lapack.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/nvgraph_vector_kernels.hxx" +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace nvgraph { + +namespace { + +// ========================================================= +// Helper functions +// ========================================================= + +/// Perform Lanczos iteration +/** Lanczos iteration is performed on a shifted matrix A+shift*I. 
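+ *
+ * Each step extends the Krylov basis with the standard three-term
+ * recurrence (illustrative notation, not original nvgraph text):
+ *
+ *   w       = (A + shift*I) * q_j
+ *   alpha_j = q_j' * w
+ *   w       = w - alpha_j * q_j - beta_{j-1} * q_{j-1}
+ *   beta_j  = ||w||_2,  q_{j+1} = w / beta_j
+ *
+ * with optional full reorthogonalization against all previous q_i.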
+ * + * @param A Matrix. + * @param iter Pointer to current Lanczos iteration. On exit, the + * variable is set equal to the final Lanczos iteration. + * @param maxIter Maximum Lanczos iteration. This function will + * perform a maximum of maxIter-*iter iterations. + * @param shift Matrix shift. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm (i.e. entry in beta_host) is + * less than tol. + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param alpha_host (Output, host memory, maxIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, maxIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Input/output, device memory, + * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as + * columns of a column-major matrix with dimensions + * n x (maxIter+1). + * @param work_dev (Output, device memory, maxIter entries) + * Workspace. Not needed if full reorthogonalization is disabled. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int performLanczosIteration(const Matrix *A, + IndexType_ *iter, + IndexType_ maxIter, + ValueType_ shift, + ValueType_ tol, + bool reorthogonalize, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful variables + const ValueType_ one = 1; + const ValueType_ negOne = -1; + const ValueType_ zero = 0; + + IndexType_ n = A->n; + + // ------------------------------------------------------- + // Compute second Lanczos vector + // ------------------------------------------------------- + if (*iter <= 0) { + *iter = 1; + + // Apply matrix + if (shift != 0) + CHECK_CUDA(cudaMemcpyAsync( + lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); + + // Orthogonalize Lanczos vector + Cublas::dot(n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host); + Cublas::axpy(n, -alpha_host[0], lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1); + beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, 1, n), 1); + + // Check if Lanczos has converged + if (beta_host[0] <= tol) return 0; + + // Normalize Lanczos vector + Cublas::scal(n, 1 / beta_host[0], lanczosVecs_dev + IDX(0, 1, n), 1); + } + + // ------------------------------------------------------- + // Compute remaining Lanczos vectors + // ------------------------------------------------------- + + while (*iter < maxIter) { + ++(*iter); + + // Apply matrix + if (shift != 0) + CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + + // Full reorthogonalization + // "Twice is enough" algorithm per Kahan and Parlett + if (reorthogonalize) { + Cublas::gemv(true, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1); + Cublas::gemv(false, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + CHECK_CUDA(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + 
sizeof(ValueType_), + cudaMemcpyDeviceToHost)); + Cublas::gemv(true, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1); + Cublas::gemv(false, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + } + + // Orthogonalization with 3-term recurrence relation + else { + Cublas::dot(n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1)); + Cublas::axpy(n, + -alpha_host[*iter - 1], + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + Cublas::axpy(n, + -beta_host[*iter - 2], + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1); + } + + // Compute residual + beta_host[*iter - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, *iter, n), 1); + + // Check if Lanczos has converged + if (beta_host[*iter - 1] <= tol) break; + // Normalize Lanczos vector + Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); + } + + CHECK_CUDA(cudaDeviceSynchronize()); + + return 0; +} + +/// Find Householder transform for 3-dimensional system +/** Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * + * @param v (Input/output, host memory, 3 entries) Input + * 3-dimensional vector. On exit, the vector is set to the + * Householder vector. + * @param Pv (Output, host memory, 1 entry) First entry of P*v + * (here v is the input vector). Either equal to ||v||_2 or + * -||v||_2. + * @param P (Output, host memory, 9 entries) Householder transform + * matrix. Matrix dimensions are 3 x 3. + */ +template +static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) +{ + // Compute norm of vector + *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + // Choose whether to reflect to e_1 or -e_1 + // This choice avoids catastrophic cancellation + if (v[0] >= 0) *Pv = -(*Pv); + v[0] -= *Pv; + + // Normalize Householder vector + ValueType_ normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + if (normHouseholder != 0) { + v[0] /= normHouseholder; + v[1] /= normHouseholder; + v[2] /= normHouseholder; + } else { + v[0] = 0; + v[1] = 0; + v[2] = 0; + } + + // Construct Householder matrix + IndexType_ i, j; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; +} + +/// Apply 3-dimensional Householder transform to 4 x 4 matrix +/** The Householder transform is pre-applied to the top three rows + * of the matrix and post-applied to the left three columns. The + * 4 x 4 matrix is intended to contain the bulge that is produced + * in the Francis QR algorithm. + * + * @param v (Input, host memory, 3 entries) Householder vector. + * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
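+ *
+ * (Illustrative note, an exposition aid: with P = I - 2*v*v' acting on
+ * the first three coordinates, this computes A <- P*A*P; rows 0..2 are
+ * reflected by the pre-application and columns 0..2 by the
+ * post-application.)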
+ */ +template +static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) +{ + // Loop indices + IndexType_ i, j; + // Dot product between Householder vector and matrix row/column + ValueType_ vDotA; + + // Pre-apply Householder transform + for (j = 0; j < 4; ++j) { + vDotA = 0; + for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + } + + // Post-apply Householder transform + for (i = 0; i < 4; ++i) { + vDotA = 0; + for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + } +} + +/// Perform one step of Francis QR algorithm +/** Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * + * @param n Matrix dimension. + * @param shift1 QR algorithm shift. + * @param shift2 QR algorithm shift. + * @param alpha (Input/output, host memory, n entries) Diagonal + * entries of tridiagonal matrix. + * @param beta (Input/output, host memory, n-1 entries) + * Off-diagonal entries of tridiagonal matrix. + * @param V (Input/output, host memory, n*n entries) Orthonormal + * transforms from previous steps of QR algorithm. Matrix + * dimensions are n x n. On exit, the orthonormal transform from + * this Francis QR step is post-applied to the matrix. + * @param work (Output, host memory, 3*n entries) Workspace. + * @return Zero if successful. Otherwise non-zero. + */ +template +static int francisQRIteration(IndexType_ n, + ValueType_ shift1, + ValueType_ shift2, + ValueType_ *alpha, + ValueType_ *beta, + ValueType_ *V, + ValueType_ *work) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Temporary storage of 4x4 bulge and Householder vector + ValueType_ bulge[16]; + + // Householder vector + ValueType_ householder[3]; + // Householder matrix + ValueType_ householderMatrix[3 * 3]; + + // Shifts are roots of the polynomial p(x)=x^2+b*x+c + ValueType_ b = -shift1 - shift2; + ValueType_ c = shift1 * shift2; + + // Loop indices + IndexType_ i, j, pos; + // Temporary variable + ValueType_ temp; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute initial Householder transform + householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; + householder[1] = beta[0] * (alpha[0] + alpha[1] + b); + householder[2] = beta[0] * beta[1]; + findHouseholder3(householder, &temp, householderMatrix); + + // Apply initial Householder transform to create bulge + memset(bulge, 0, 16 * sizeof(ValueType_)); + for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 3; ++i) { + bulge[IDX(i + 1, i, 4)] = beta[i]; + bulge[IDX(i, i + 1, 4)] = beta[i]; + } + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(ValueType_)); + + // Chase bulge to bottom-right of matrix with Householder transforms + for (pos = 0; pos < n - 4; ++pos) { + // Move to next position + alpha[pos] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = beta[pos + 3]; + bulge[IDX(0, 3, 4)] 
= 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = beta[pos + 3]; + bulge[IDX(3, 3, 4)] = alpha[pos + 4]; + + // Apply Householder transform + findHouseholder3(householder, beta + pos, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); + } + + // Apply penultimate Householder transform + // Values in the last row and column are zero + alpha[n - 4] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = bulge[IDX(3, 0, 4)]; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + bulge[IDX(3, 0, 4)] = 0; + bulge[IDX(3, 1, 4)] = 0; + bulge[IDX(3, 2, 4)] = 0; + bulge[IDX(0, 3, 4)] = 0; + bulge[IDX(1, 3, 4)] = 0; + bulge[IDX(2, 3, 4)] = 0; + bulge[IDX(3, 3, 4)] = 0; + findHouseholder3(householder, beta + n - 4, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); + + // Apply final Householder transform + // Values in the last two rows and columns are zero + alpha[n - 3] = bulge[IDX(0, 0, 4)]; + householder[0] = bulge[IDX(1, 0, 4)]; + householder[1] = bulge[IDX(2, 0, 4)]; + householder[2] = 0; + for (j = 0; j < 3; ++j) + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); + + // Bulge has been eliminated + alpha[n - 2] = bulge[IDX(0, 0, 4)]; + alpha[n - 1] = bulge[IDX(1, 1, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; + + return 0; +} + +/// Perform implicit restart of Lanczos algorithm +/** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * + * @param n Matrix dimension. + * @param iter Current Lanczos iteration. + * @param iter_new Lanczos iteration after restart. + * @param shiftUpper Pointer to upper bound for unwanted + * region. Value is ignored if less than *shiftLower. If a + * stronger upper bound has been found, the value is updated on + * exit. + * @param shiftLower Pointer to lower bound for unwanted + * region. Value is ignored if greater than *shiftUpper. If a + * stronger lower bound has been found, the value is updated on + * exit. + * @param alpha_host (Input/output, host memory, iter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Input/output, host memory, iter entries) + * Off-diagonal entries of Lanczos system. + * @param V_host (Output, host memory, iter*iter entries) + * Orthonormal transform used to obtain restarted system. Matrix + * dimensions are iter x iter. + * @param work_host (Output, host memory, 4*iter entries) + * Workspace. + * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (iter+1). + * @param work_dev (Output, device memory, (n+iter)*iter entries) + * Workspace. 
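+ *
+ * Illustrative note (an exposition aid, not original nvgraph text): the
+ * implicit shifts are Chebyshev nodes mapped onto the unwanted interval
+ * [shiftLower, shiftUpper],
+ *
+ *   s_i = (shiftUpper + shiftLower)/2
+ *         + (shiftUpper - shiftLower)/2 * cos((i + 0.5)*pi/restartSteps),
+ *
+ * and each Francis QR step consumes two shifts.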
+ */ +template +static int lanczosRestart(IndexType_ n, + IndexType_ iter, + IndexType_ iter_new, + ValueType_ *shiftUpper, + ValueType_ *shiftLower, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ V_host, + ValueType_ *__restrict__ work_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + bool smallest_eig) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ zero = 0; + const ValueType_ one = 1; + + // Loop index + IndexType_ i; + + // Number of implicit restart steps + // Assumed to be even since each call to Francis algorithm is + // equivalent to two calls of QR algorithm + IndexType_ restartSteps = iter - iter_new; + + // Ritz values from Lanczos method + ValueType_ *ritzVals_host = work_host + 3 * iter; + // Shifts for implicit restart + ValueType_ *shifts_host; + + // Orthonormal matrix for similarity transform + ValueType_ *V_dev = work_dev + n * iter; + + // ------------------------------------------------------- + // Implementation + // ------------------------------------------------------- + + // Compute Ritz values + memcpy(ritzVals_host, alpha_host, iter * sizeof(ValueType_)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(ValueType_)); + Lapack::sterf(iter, ritzVals_host, work_host); + + // Debug: Print largest eigenvalues + // for (int i = iter-iter_new; i < iter; ++i) + // std::cout <<*(ritzVals_host+i)<< " "; + // std::cout < *shiftUpper) { + *shiftUpper = ritzVals_host[iter - 1]; + *shiftLower = ritzVals_host[iter_new]; + } else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = min(*shiftLower, ritzVals_host[iter_new]); + } + } else { + if (*shiftLower > *shiftUpper) { + *shiftUpper = ritzVals_host[iter - iter_new - 1]; + *shiftLower = ritzVals_host[0]; + } else { + *shiftUpper = max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = min(*shiftLower, ritzVals_host[0]); + } + } + + // Calculate Chebyshev nodes as shifts + shifts_host = ritzVals_host; + for (i = 0; i < restartSteps; ++i) { + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); + shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); + } + + // Apply Francis QR algorithm to implicitly restart Lanczos + for (i = 0; i < restartSteps; i += 2) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + WARNING("error in implicitly shifted QR algorithm"); + + // Obtain new residual + CHECK_CUDA( + cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + Cublas::gemv(false, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1); + + // Obtain new Lanczos vectors + Cublas::gemm( + false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); + + CHECK_CUDA(cudaMemcpyAsync( + lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + + // Normalize residual to obtain new Lanczos vector + CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * 
sizeof(ValueType_), + cudaMemcpyDeviceToDevice)); + beta_host[iter_new - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); + Cublas::scal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); + + return 0; +} + +} // namespace + +// ========================================================= +// Eigensolver +// ========================================================= + +/// Compute smallest eigenvectors of symmetric matrix +/** Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param effIter On exit, pointer to final size of Lanczos system. + * @param totalIter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param shift On exit, pointer to matrix shift (estimate for + * largest eigenvalue). + * @param alpha_host (Output, host memory, restartIter entries) + * Diagonal entries of Lanczos system. + * @param beta_host (Output, host memory, restartIter entries) + * Off-diagonal entries of Lanczos system. + * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1) + * entries) Lanczos vectors. Vectors are stored as columns of a + * column-major matrix with dimensions n x (restartIter+1). + * @param work_dev (Output, device memory, + * (n+restartIter)*restartIter entries) Workspace. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @return NVGRAPH error flag. 
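+ *
+ * Illustrative note (an exposition aid, not original nvgraph text): the
+ * routine first runs a short Lanczos pass to estimate the largest
+ * eigenvalue lambda_max, then iterates on the shifted matrix
+ * A - lambda_max*I, whose dominant eigenvalues correspond to the
+ * smallest eigenvalues of A; the shift is removed from the computed
+ * Ritz values before they are written to eigVals_dev.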
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> *A,
+                                          IndexType_ nEigVecs,
+                                          IndexType_ maxIter,
+                                          IndexType_ restartIter,
+                                          ValueType_ tol,
+                                          bool reorthogonalize,
+                                          IndexType_ *effIter,
+                                          IndexType_ *totalIter,
+                                          ValueType_ *shift,
+                                          ValueType_ *__restrict__ alpha_host,
+                                          ValueType_ *__restrict__ beta_host,
+                                          ValueType_ *__restrict__ lanczosVecs_dev,
+                                          ValueType_ *__restrict__ work_dev,
+                                          ValueType_ *__restrict__ eigVals_dev,
+                                          ValueType_ *__restrict__ eigVecs_dev)
+{
+  // -------------------------------------------------------
+  // Variable declaration
+  // -------------------------------------------------------
+
+  // Useful constants
+  const ValueType_ one  = 1;
+  const ValueType_ zero = 0;
+
+  // Matrix dimension
+  IndexType_ n = A->n;
+
+  // Shift for implicit restart
+  ValueType_ shiftUpper;
+  ValueType_ shiftLower;
+
+  // Lanczos iteration counters
+  IndexType_ maxIter_curr = restartIter;  // Maximum size of Lanczos system
+
+  // Status flags
+  int status;
+
+  // Loop index
+  IndexType_ i;
+
+  // Host memory
+  ValueType_ *Z_host;     // Eigenvectors in Lanczos basis
+  ValueType_ *work_host;  // Workspace
+
+  // -------------------------------------------------------
+  // Check that LAPACK is enabled
+  // -------------------------------------------------------
+  // Lapack<ValueType_>::check_lapack_enabled();
+
+  // -------------------------------------------------------
+  // Check that parameters are valid
+  // -------------------------------------------------------
+  if (A->m != A->n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // -------------------------------------------------------
+  // Variable initialization
+  // -------------------------------------------------------
+
+  // Host memory
+  std::vector<ValueType_> Z_host_v(restartIter * restartIter);
+  std::vector<ValueType_> work_host_v(4 * restartIter);
+
+  Z_host    = Z_host_v.data();
+  work_host = work_host_v.data();
+
+  // Initialize cuBLAS
+  Cublas::set_pointer_mode_host();
+
+  // -------------------------------------------------------
+  // Compute largest eigenvalue to determine shift
+  // -------------------------------------------------------
+
+  // Random number generator
+  curandGenerator_t randGen;
+  // Initialize random number generator
+  CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10));
+
+  // FIXME: This is hard coded, which is good for unit testing...
+  //        but should really be a parameter so it could be
+  //        "random" for real runs and "fixed" for tests
+  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 1234567 /*time(NULL)*/));
+  // CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL)));
+  // Initialize initial Lanczos vector
+  CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one));
+  ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1);
+  Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1);
+
+  // Estimate number of Lanczos iterations
+  //   See bounds in Kuczynski and Wozniakowski (1992).
+  //   const ValueType_ relError = 0.25;  // Relative error
+  //   const ValueType_ failProb = 1e-4;  // Probability of failure
+  //   maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1;
+  //   maxIter_curr = min(maxIter_curr, restartIter);
+
+  // Obtain tridiagonal matrix with Lanczos
+  *effIter = 0;
+  *shift   = 0;
+  status   = performLanczosIteration<IndexType_, ValueType_>(A,
+                                                             effIter,
+                                                             maxIter_curr,
+                                                             *shift,
+                                                             0.0,
+                                                             reorthogonalize,
+                                                             alpha_host,
+                                                             beta_host,
+                                                             lanczosVecs_dev,
+                                                             work_dev);
+  if (status) WARNING("error in Lanczos iteration");
+
+  // Determine largest eigenvalue
+
+  Lapack<ValueType_>::sterf(*effIter, alpha_host, beta_host);
+  *shift = -alpha_host[*effIter - 1];
+  // std::cout << *shift <<std::endl;
+
+  // -------------------------------------------------------
+  // Compute eigenvectors of shifted matrix
+  // -------------------------------------------------------
+
+  // Obtain tridiagonal matrix with Lanczos
+  *effIter = 0;
+  // maxIter_curr = min(maxIter, restartIter);
+  status = performLanczosIteration<IndexType_, ValueType_>(A,
+                                                           effIter,
+                                                           maxIter_curr,
+                                                           *shift,
+                                                           0,
+                                                           reorthogonalize,
+                                                           alpha_host,
+                                                           beta_host,
+                                                           lanczosVecs_dev,
+                                                           work_dev);
+  if (status) WARNING("error in Lanczos iteration");
+  *totalIter += *effIter;
+
+  // Apply Lanczos method until convergence
+  shiftLower = 1;
+  shiftUpper = -1;
+  while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) {
+    // Determine number of restart steps
+    //   Number of steps must be even due to Francis algorithm
+    IndexType_ iter_new = nEigVecs + 1;
+    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
+      iter_new = restartIter - (maxIter - *totalIter);
+    if ((restartIter - iter_new) % 2) iter_new -= 1;
+    if (iter_new == *effIter) break;
+
+    // Implicit restart of Lanczos method
+    status = lanczosRestart<IndexType_, ValueType_>(n,
+                                                    *effIter,
+                                                    iter_new,
+                                                    &shiftUpper,
+                                                    &shiftLower,
+                                                    alpha_host,
+                                                    beta_host,
+                                                    Z_host,
+                                                    work_host,
+                                                    lanczosVecs_dev,
+                                                    work_dev,
+                                                    true);
+    if (status) WARNING("error in Lanczos implicit restart");
+    *effIter = iter_new;
+
+    // Check for convergence
+    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
+
+    // Proceed with Lanczos method
+    // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter);
+    status = performLanczosIteration<IndexType_, ValueType_>(A,
+                                                             effIter,
+                                                             maxIter_curr,
+                                                             *shift,
+                                                             tol * fabs(shiftLower),
+                                                             reorthogonalize,
+                                                             alpha_host,
+                                                             beta_host,
+                                                             lanczosVecs_dev,
+                                                             work_dev);
+    if (status) WARNING("error in Lanczos iteration");
+    *totalIter += *effIter - iter_new;
+  }
+
+  // Warning if Lanczos has failed to converge
+  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
+    WARNING("implicitly restarted Lanczos failed to converge");
+  }
+
+  // Solve tridiagonal system
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_));
+  Lapack<ValueType_>::steqr('I',
+                            *effIter,
+                            work_host + 2 * (*effIter),
+                            work_host + 3 * (*effIter),
+                            Z_host,
+                            *effIter,
+                            work_host);
+
+  // Obtain desired eigenvalues by applying shift
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+
+  // Copy results to device memory
+  CHECK_CUDA(cudaMemcpy(eigVals_dev,
+                        work_host + 2 * (*effIter),
+                        nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+  // for (int i = 0; i < nEigVecs; ++i)
+  //{
+  //  std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl;
+  //}
+  CHECK_CUDA(cudaMemcpy(
+    work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice));
+
+  // Convert eigenvectors from Lanczos basis to standard basis
+  Cublas::gemm(false,
+               false,
+               n,
+               nEigVecs,
+               *effIter,
+               &one,
+               lanczosVecs_dev,
+               n,
+               work_dev,
+               *effIter,
+               &zero,
+               eigVecs_dev,
+               n);
+
+  // Clean up and exit
+  CHECK_CURAND(curandDestroyGenerator(randGen));
+  return NVGRAPH_OK;
+}
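+
+// Editorial usage sketch (illustrative, not part of the original patch):
+// driving the low-level overload above directly, with caller-managed
+// workspaces sized as the parameter docs describe. The convenience wrapper
+// below does the equivalent; here `A`, `n`, `nEigVecs`, `maxIter`,
+// `restartIter`, `tol`, `eigVals_dev` and `eigVecs_dev` are assumed to be
+// set up already.
+//
+//   int effIter, totalIter = 0;
+//   float shift;
+//   std::vector<float> alpha_host(restartIter), beta_host(restartIter);
+//   Vector<float> lanczosVecs_dev(n * (restartIter + 1), 0 /*stream*/);
+//   Vector<float> work_dev((n + restartIter) * restartIter, 0 /*stream*/);
+//   NVGRAPH_ERROR err = computeSmallestEigenvectors(
+//     &A, nEigVecs, maxIter, restartIter, tol, false /*reorthogonalize*/,
+//     &effIter, &totalIter, &shift, alpha_host.data(), beta_host.data(),
+//     lanczosVecs_dev.raw(), work_dev.raw(), eigVals_dev, eigVecs_dev);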
+
+/// Compute smallest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are least
+ *  positive. If matrix is positive definite or positive
+ *  semidefinite, the computed eigenvalues are smallest in
+ *  magnitude.
+ *
+ *  The largest eigenvalue is estimated by performing several
+ *  Lanczos iterations. An implicitly restarted Lanczos method is
+ *  then applied to A+s*I, where s is the negative of the largest
+ *  eigenvalue.
+ *
+ *  CNMEM must be initialized before calling this function.
+ *
+ *  @param A Matrix.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter Maximum number of Lanczos steps. Does not include
+ *    Lanczos steps used to estimate largest eigenvalue.
+ *  @param restartIter Maximum size of Lanczos system before
+ *    performing an implicit restart. Should be at least 4.
+ *  @param tol Convergence tolerance. Lanczos iteration will
+ *    terminate when the residual norm is less than tol*theta, where
+ *    theta is an estimate for the smallest unwanted eigenvalue
+ *    (i.e. the (nEigVecs+1)th smallest eigenvalue).
+ *  @param reorthogonalize Whether to reorthogonalize Lanczos
+ *    vectors.
+ *  @param iter On exit, pointer to total number of Lanczos
+ *    iterations performed. Does not include Lanczos steps used to
+ *    estimate largest eigenvalue.
+ *  @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *    Smallest eigenvalues of matrix.
+ *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *    Eigenvectors corresponding to smallest eigenvalues of
+ *    matrix. Vectors are stored as columns of a column-major matrix
+ *    with dimensions n x nEigVecs.
+ *  @return NVGRAPH error flag.
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
+                                          IndexType_ nEigVecs,
+                                          IndexType_ maxIter,
+                                          IndexType_ restartIter,
+                                          ValueType_ tol,
+                                          bool reorthogonalize,
+                                          IndexType_ &iter,
+                                          ValueType_ *__restrict__ eigVals_dev,
+                                          ValueType_ *__restrict__ eigVecs_dev)
+{
+  // CUDA stream
+  //   TODO: handle non-zero streams
+  cudaStream_t stream = 0;
+
+  // Matrix dimension
+  IndexType_ n = A.n;
+
+  // Check that parameters are valid
+  if (A.m != A.n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // Allocate memory
+  std::vector<ValueType_> alpha_host_v(restartIter);
+  std::vector<ValueType_> beta_host_v(restartIter);
+
+  ValueType_ *alpha_host = alpha_host_v.data();
+  ValueType_ *beta_host  = beta_host_v.data();
+
+  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
+  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
+
+  // Perform Lanczos method
+  IndexType_ effIter;
+  ValueType_ shift;
+  NVGRAPH_ERROR status = computeSmallestEigenvectors(&A,
+                                                     nEigVecs,
+                                                     maxIter,
+                                                     restartIter,
+                                                     tol,
+                                                     reorthogonalize,
+                                                     &effIter,
+                                                     &iter,
+                                                     &shift,
+                                                     alpha_host,
+                                                     beta_host,
+                                                     lanczosVecs_dev.raw(),
+                                                     work_dev.raw(),
+                                                     eigVals_dev,
+                                                     eigVecs_dev);
+
+  // Clean up and return
+  return status;
+}
+
+// =========================================================
+// Eigensolver
+// =========================================================
+
+/// Compute 
largest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are most
+ *  positive. If matrix is positive definite or positive
+ *  semidefinite, the computed eigenvalues are largest in
+ *  magnitude.
+ *
+ *  An implicitly restarted Lanczos method is applied to the
+ *  matrix directly; no spectral shift is needed for the largest
+ *  eigenvalues.
+ *
+ *  @param A Matrix.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter Maximum number of Lanczos steps.
+ *  @param restartIter Maximum size of Lanczos system before
+ *    performing an implicit restart. Should be at least 4.
+ *  @param tol Convergence tolerance. Lanczos iteration will
+ *    terminate when the residual norm is less than tol*theta, where
+ *    theta is an estimate for the largest unwanted eigenvalue
+ *    (i.e. the (nEigVecs+1)th largest eigenvalue).
+ *  @param reorthogonalize Whether to reorthogonalize Lanczos
+ *    vectors.
+ *  @param effIter On exit, pointer to final size of Lanczos system.
+ *  @param totalIter On exit, pointer to total number of Lanczos
+ *    iterations performed.
+ *  @param alpha_host (Output, host memory, restartIter entries)
+ *    Diagonal entries of Lanczos system.
+ *  @param beta_host (Output, host memory, restartIter entries)
+ *    Off-diagonal entries of Lanczos system.
+ *  @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
+ *    entries) Lanczos vectors. Vectors are stored as columns of a
+ *    column-major matrix with dimensions n x (restartIter+1).
+ *  @param work_dev (Output, device memory,
+ *    (n+restartIter)*restartIter entries) Workspace.
+ *  @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *    Largest eigenvalues of matrix.
+ *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *    Eigenvectors corresponding to largest eigenvalues of
+ *    matrix. Vectors are stored as columns of a column-major matrix
+ *    with dimensions n x nEigVecs.
+ *  @return NVGRAPH error flag.
+ */ +template +NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ *effIter, + IndexType_ *totalIter, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev) +{ + // ------------------------------------------------------- + // Variable declaration + // ------------------------------------------------------- + + // Useful constants + const ValueType_ one = 1; + const ValueType_ zero = 0; + + // Matrix dimension + IndexType_ n = A->n; + + // Lanczos iteration counters + IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + + // Status flags + int status; + + // Loop index + IndexType_ i; + + // Host memory + ValueType_ *Z_host; // Eigenvectors in Lanczos basis + ValueType_ *work_host; // Workspace + + // ------------------------------------------------------- + // Check that LAPACK is enabled + // ------------------------------------------------------- + // Lapack::check_lapack_enabled(); + + // ------------------------------------------------------- + // Check that parameters are valid + // ------------------------------------------------------- + if (A->m != A->n) { + WARNING("invalid parameter (matrix is not square)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (nEigVecs < 1) { + WARNING("invalid parameter (nEigVecs<1)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (restartIter < 1) { + WARNING("invalid parameter (restartIter<4)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (tol < 0) { + WARNING("invalid parameter (tol<0)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (nEigVecs > n) { + WARNING("invalid parameters (nEigVecs>n)"); + return NVGRAPH_ERR_BAD_PARAMETERS; + } + if (maxIter < nEigVecs) { + WARNING("invalid parameters (maxIter Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); + + Z_host = Z_host_v.data(); + work_host = work_host_v.data(); + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // ------------------------------------------------------- + // Compute largest eigenvalue + // ------------------------------------------------------- + + // Random number generator + curandGenerator_t randGen; + // Initialize random number generator + CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456)); + // Initialize initial Lanczos vector + CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); + Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); + + // Estimate number of Lanczos iterations + // See bounds in Kuczynski and Wozniakowski (1992). 
+ // const ValueType_ relError = 0.25; // Relative error + // const ValueType_ failProb = 1e-4; // Probability of failure + // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; + // maxIter_curr = min(maxIter_curr, restartIter); + + // Obtain tridiagonal matrix with Lanczos + *effIter = 0; + ValueType_ shift_val = 0.0; + ValueType_ *shift = &shift_val; + // maxIter_curr = min(maxIter, restartIter); + status = performLanczosIteration(A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter; + + // Apply Lanczos method until convergence + ValueType_ shiftLower = 1; + ValueType_ shiftUpper = -1; + while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { + // Determine number of restart steps + // Number of steps must be even due to Francis algorithm + IndexType_ iter_new = nEigVecs + 1; + if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) + iter_new = restartIter - (maxIter - *totalIter); + if ((restartIter - iter_new) % 2) iter_new -= 1; + if (iter_new == *effIter) break; + + // Implicit restart of Lanczos method + status = lanczosRestart(n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); + if (status) WARNING("error in Lanczos implicit restart"); + *effIter = iter_new; + + // Check for convergence + if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; + + // Proceed with Lanczos method + // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = performLanczosIteration(A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); + if (status) WARNING("error in Lanczos iteration"); + *totalIter += *effIter - iter_new; + } + + // Warning if Lanczos has failed to converge + if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { + WARNING("implicitly restarted Lanczos failed to converge"); + } + for (int i = 0; i < restartIter; ++i) { + for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + } + // Solve tridiagonal system + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, + work_host); + + // note: We need to pick the top nEigVecs eigenvalues + // but effItter can be larger than nEigVecs + // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the + // matrix of size effIter. 
remember the array is sorted, so no offset is needed in the smallest
+  // eigenvalues case, because the first entries are already the smallest ones
+
+  IndexType_ top_eigenpairs_idx_offset = *effIter - nEigVecs;
+
+  // Debug : print nEigVecs largest eigenvalues
+  // for (int i = top_eigenpairs_idx_offset; i < *effIter; ++i)
+  //  std::cout <<*(work_host+(2*(*effIter)+i))<< " ";
+  // std::cout <<std::endl;
+
+  // Obtain desired eigenvalues by applying shift
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+
+  // Copy results to device memory, skipping the smallest eigenpairs
+  // via the offset computed above
+  CHECK_CUDA(cudaMemcpy(eigVals_dev,
+                        work_host + 2 * (*effIter) + top_eigenpairs_idx_offset,
+                        nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+
+  CHECK_CUDA(cudaMemcpy(work_dev,
+                        Z_host + (top_eigenpairs_idx_offset * (*effIter)),
+                        (*effIter) * nEigVecs * sizeof(ValueType_),
+                        cudaMemcpyHostToDevice));
+
+  // Convert eigenvectors from Lanczos basis to standard basis
+  Cublas::gemm(false,
+               false,
+               n,
+               nEigVecs,
+               *effIter,
+               &one,
+               lanczosVecs_dev,
+               n,
+               work_dev,
+               *effIter,
+               &zero,
+               eigVecs_dev,
+               n);
+
+  // Clean up and exit
+  CHECK_CURAND(curandDestroyGenerator(randGen));
+  return NVGRAPH_OK;
+}
+
+/// Compute largest eigenvectors of symmetric matrix
+/** Computes eigenvalues and eigenvectors that are most
+ *  positive. If matrix is positive definite or positive
+ *  semidefinite, the computed eigenvalues are largest in
+ *  magnitude.
+ *
+ *  An implicitly restarted Lanczos method is applied to the
+ *  matrix.
+ *
+ *  @param A Matrix.
+ *  @param nEigVecs Number of eigenvectors to compute.
+ *  @param maxIter Maximum number of Lanczos steps.
+ *  @param restartIter Maximum size of Lanczos system before
+ *    performing an implicit restart. Should be at least 4.
+ *  @param tol Convergence tolerance. Lanczos iteration will
+ *    terminate when the residual norm is less than tol*theta, where
+ *    theta is an estimate for the largest unwanted eigenvalue
+ *    (i.e. the (nEigVecs+1)th largest eigenvalue).
+ *  @param reorthogonalize Whether to reorthogonalize Lanczos
+ *    vectors.
+ *  @param iter On exit, pointer to total number of Lanczos
+ *    iterations performed.
+ *  @param eigVals_dev (Output, device memory, nEigVecs entries)
+ *    Largest eigenvalues of matrix.
+ *  @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
+ *    Eigenvectors corresponding to largest eigenvalues of
+ *    matrix. Vectors are stored as columns of a column-major matrix
+ *    with dimensions n x nEigVecs.
+ *  @return NVGRAPH error flag.
+ */
+template <typename IndexType_, typename ValueType_>
+NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
+                                         IndexType_ nEigVecs,
+                                         IndexType_ maxIter,
+                                         IndexType_ restartIter,
+                                         ValueType_ tol,
+                                         bool reorthogonalize,
+                                         IndexType_ &iter,
+                                         ValueType_ *__restrict__ eigVals_dev,
+                                         ValueType_ *__restrict__ eigVecs_dev)
+{
+  // CUDA stream
+  //   TODO: handle non-zero streams
+  cudaStream_t stream = 0;
+
+  // Matrix dimension
+  IndexType_ n = A.n;
+
+  // Check that parameters are valid
+  if (A.m != A.n) {
+    WARNING("invalid parameter (matrix is not square)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs < 1) {
+    WARNING("invalid parameter (nEigVecs<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (restartIter < 1) {
+    WARNING("invalid parameter (restartIter<1)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (tol < 0) {
+    WARNING("invalid parameter (tol<0)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (nEigVecs > n) {
+    WARNING("invalid parameters (nEigVecs>n)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+  if (maxIter < nEigVecs) {
+    WARNING("invalid parameters (maxIter<nEigVecs)");
+    return NVGRAPH_ERR_BAD_PARAMETERS;
+  }
+
+  // Allocate memory
+  std::vector<ValueType_> alpha_host_v(restartIter);
+  std::vector<ValueType_> beta_host_v(restartIter);
+
+  ValueType_ *alpha_host = alpha_host_v.data();
+  ValueType_ *beta_host  = beta_host_v.data();
+
+  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
+  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
+
+  // Perform Lanczos method
+  IndexType_ effIter;
+  NVGRAPH_ERROR status = computeLargestEigenvectors(&A,
+                                                    nEigVecs,
+                                                    maxIter,
+                                                    restartIter,
+                                                    tol,
+                                                    reorthogonalize,
+                                                    &effIter,
+                                                    &iter,
+                                                    alpha_host,
+                                                    beta_host,
+                                                    lanczosVecs_dev.raw(),
+                                                    work_dev.raw(),
+                                                    eigVals_dev,
+                                                    eigVecs_dev);
+
+  // Clean up and return
+  return status;
+}
+
+// =========================================================
+// Explicit instantiation
+// =========================================================
+
+template NVGRAPH_ERROR computeSmallestEigenvectors<int, float>(const Matrix<int, float> &A,
+                                                               int nEigVecs,
+                                                               int maxIter,
+                                                               int restartIter,
+                                                               float tol,
+                                                               bool reorthogonalize,
+                                                               int &iter,
+                                                               float *__restrict__ eigVals_dev,
+                                                               float *__restrict__ eigVecs_dev);
+template NVGRAPH_ERROR computeSmallestEigenvectors<int, double>(const Matrix<int, double> &A,
+                                                                int nEigVecs,
+                                                                int maxIter,
+                                                                int restartIter,
+                                                                double tol,
+                                                                bool reorthogonalize,
+                                                                int &iter,
+                                                                double *__restrict__ eigVals_dev,
+                                                                double *__restrict__ eigVecs_dev);
+
+template NVGRAPH_ERROR computeLargestEigenvectors<int, float>(const Matrix<int, float> &A,
+                                                              int nEigVecs,
+                                                              int maxIter,
+                                                              int restartIter,
+                                                              float tol,
+                                                              bool reorthogonalize,
+                                                              int &iter,
+                                                              float *__restrict__ eigVals_dev,
+                                                              float *__restrict__ eigVecs_dev);
+template NVGRAPH_ERROR computeLargestEigenvectors<int, double>(const Matrix<int, double> &A,
+                                                               int nEigVecs,
+                                                               int maxIter,
+                                                               int restartIter,
+                                                               double tol,
+                                                               bool reorthogonalize,
+                                                               int &iter,
+                                                               double *__restrict__ eigVals_dev,
+                                                               double *__restrict__ eigVecs_dev);
+
+}  // namespace nvgraph
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
new file mode 100644
index 0000000000..bd90f3093a
--- /dev/null
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +//#ifdef NVGRAPH_PARTITION + +#include "include/modularity_maximization.hxx" + +#include +#include + +#include +#include +#include +#include +#include + +#include "include/debug_macros.h" +#include "include/kmeans.hxx" +#include "include/lanczos.hxx" +#include "include/nvgraph_cublas.hxx" +#include "include/nvgraph_error.hxx" +#include "include/nvgraph_vector.hxx" +#include "include/sm_utils.h" +#include "include/spectral_matrix.hxx" + +//#define COLLECT_TIME_STATISTICS 1 +//#undef COLLECT_TIME_STATISTICS + +#ifdef COLLECT_TIME_STATISTICS +#include +#include +#include +#include +#include "cuda_profiler_api.h" +#endif + +#ifdef COLLECT_TIME_STATISTICS +static double timer(void) +{ + struct timeval tv; + cudaDeviceSynchronize(); + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} +#endif + +namespace nvgraph { + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) +{ + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + cudaCheckError(); + + return cudaSuccess; +} + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. 
+ */ +template +NVGRAPH_ERROR modularity_maximization( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nClusters, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t *__restrict__ clusters, + weight_t *eigVals, + weight_t *eigVecs, + int &iters_lanczos, + int &iters_kmeans) +{ + cudaStream_t stream = 0; + const weight_t zero{0.0}; + const weight_t one{1.0}; + + edge_t i; + edge_t n = graph.number_of_vertices; + + // k-means residual + weight_t residual_kmeans; + + // Compute eigenvectors of Modularity Matrix + // Initialize Modularity Matrix + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + ModularityMatrix B(A, graph.number_of_edges); + + // Compute smallest eigenvalues and eigenvectors + CHECK_NVGRAPH(computeLargestEigenvectors(B, + nEigVecs, + maxIter_lanczos, + restartIter_lanczos, + tol_lanczos, + false, + iters_lanczos, + eigVals, + eigVecs)); + + // eigVals.dump(0, nEigVecs); + // eigVecs.dump(0, nEigVecs); + // eigVecs.dump(n, nEigVecs); + // eigVecs.dump(2*n, nEigVecs); + // Whiten eigenvector matrix + for (i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + cudaCheckError(); + mean /= n; + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + cudaCheckError(); + std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + cudaCheckError(); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs * n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, + false, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t *)NULL, + nEigVecs, + work.raw(), + nEigVecs); + CHECK_CUDA(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); + } + + // WARNING: notice that at this point the matrix has already been transposed, so we are scaling + // columns + scale_obs(nEigVecs, n, eigVecs); + cudaCheckError(); + + // eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, + nEigVecs, + nClusters, + tol_kmeans, + maxIter_kmeans, + eigVecs, + clusters, + residual_kmeans, + iters_kmeans)); + + return NVGRAPH_OK; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const IndexType_ i; + + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(ValueType_)1.0 : (ValueType_)0.0; + } +}; +} // namespace + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in CSR format + * @param nClusters Number of clusters. + * @param parts (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ +template +NVGRAPH_ERROR analyzeModularity( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nClusters, + const vertex_t *__restrict__ parts, + weight_t &modularity) +{ + cudaStream_t stream = 0; + edge_t i; + edge_t n = graph.number_of_vertices; + weight_t partModularity, partSize; + + // Device memory + Vector part_i(n, stream); + Vector Bx(n, stream); + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // Initialize Modularity + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + ModularityMatrix B(A, graph.number_of_edges); + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + cudaCheckError(); + + // Compute size of ith partition + Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + partSize = round(partSize); + if (partSize < 0.5) { + WARNING("empty partition"); + continue; + } + + // Compute modularity + B.mv(1, part_i.raw(), 0, Bx.raw()); + Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity); + + // Record results + modularity += partModularity; + // std::cout<< "partModularity " <( + cugraph::experimental::GraphCSRView const &graph, + int nClusters, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int *__restrict__ parts, + float *eigVals, + float *eigVecs, + int &iters_lanczos, + int &iters_kmeans); +template NVGRAPH_ERROR modularity_maximization( + cugraph::experimental::GraphCSRView const &graph, + int nClusters, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int *__restrict__ parts, + double *eigVals, + double *eigVecs, + int &iters_lanczos, + int &iters_kmeans); +template NVGRAPH_ERROR analyzeModularity( + cugraph::experimental::GraphCSRView const &graph, + int nClusters, + const int *__restrict__ parts, + float &modularity); +template NVGRAPH_ERROR analyzeModularity( + cugraph::experimental::GraphCSRView const &graph, + int nClusters, + const int *__restrict__ parts, + double &modularity); + +} // namespace nvgraph +//#endif //NVGRAPH_PARTITION diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 747ce510da..e4b9f50790 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,69 +14,411 @@ * limitations under the License. 
*/ -#pragma once - -namespace raft { - - /// Spectral graph partition - /** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return error flag. - */ - template typename GraphView> - int partition(GraphView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t * __restrict__ parts, - weight_t *eigVals, - weight_t *eig_vects); - - /// Compute cost function for partition - /** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return error flag. - */ - template typename GraphView> - int analyzePartition(GraphView const &graph, - vertex_t nParts, - vertex_t const* __restrict__ parts, - weight_t& edgeCut, weight_t & cost); +#include "include/partition.hxx" +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvgraph { + +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? 
obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) +{ + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) +{ + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + cudaCheckError(); + + return cudaSuccess; +} + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param parts (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return NVGRAPH error flag. 
+ */ +template +NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t *__restrict__ parts, + weight_t *eigVals, + weight_t *eigVecs) +{ + cudaStream_t stream = 0; + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + int iters_lanczos; + int iters_kmeans; + + edge_t i; + edge_t n = graph.number_of_vertices; + + // k-means residual + weight_t residual_kmeans; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + LaplacianMatrix L(A); + + // Compute smallest eigenvalues and eigenvectors + CHECK_NVGRAPH(computeSmallestEigenvectors(L, + nEigVecs, + maxIter_lanczos, + restartIter_lanczos, + tol_lanczos, + false, + iters_lanczos, + eigVals, + eigVecs)); + + // Whiten eigenvector matrix + for (i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + cudaCheckError(); + mean /= n; + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + cudaCheckError(); + std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); + thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + cudaCheckError(); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + Vector work(nEigVecs * n, stream); + Cublas::set_pointer_mode_host(); + Cublas::geam(true, + false, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t *)NULL, + nEigVecs, + work.raw(), + nEigVecs); + CHECK_CUDA(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); + } + + // Clean up + + // eigVecs.dump(0, nEigVecs*n); + // Find partition with k-means clustering + CHECK_NVGRAPH(kmeans(n, + nEigVecs, + nParts, + tol_kmeans, + maxIter_kmeans, + eigVecs, + parts, + residual_kmeans, + iters_kmeans)); + + return NVGRAPH_OK; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const IndexType_ i; + + public: + equal_to_i_op(IndexType_ _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + } +}; +} // namespace + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. 
+ * + * @param G Weighted graph in CSR format + * @param nParts Number of partitions. + * @param parts (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return NVGRAPH error flag. + */ +template +NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + const vertex_t *__restrict__ parts, + weight_t &edgeCut, + weight_t &cost) +{ + cudaStream_t stream = 0; + + edge_t i; + edge_t n = graph.number_of_vertices; + + weight_t partEdgesCut, partSize; + + // Device memory + Vector part_i(n, stream); + Vector Lx(n, stream); + + // Initialize cuBLAS + Cublas::set_pointer_mode_host(); + + // Initialize Laplacian + CsrMatrix A(false, + false, + graph.number_of_vertices, + graph.number_of_vertices, + graph.number_of_edges, + 0, + graph.edge_data, + graph.offsets, + graph.indices); + LaplacianMatrix L(A); + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nParts; ++i) { + // Construct indicator vector for ith partition + thrust::for_each( + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + cudaCheckError(); + + // Compute size of ith partition + Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + partSize = round(partSize); + if (partSize < 0.5) { + WARNING("empty partition"); + continue; + } + + // Compute number of edges cut by ith partition + L.mv(1, part_i.raw(), 0, Lx.raw()); + Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); + + // Record results + cost += partEdgesCut / partSize; + edgeCut += partEdgesCut / 2; + } + + // Clean up and return + return NVGRAPH_OK; } + +// ========================================================= +// Explicit instantiation +// ========================================================= +template NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + float tol_lanczos, + int maxIter_kmeans, + float tol_kmeans, + int *__restrict__ parts, + float *eigVals, + float *eigVecs); + +template NVGRAPH_ERROR partition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + int nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + double tol_lanczos, + int maxIter_kmeans, + double tol_kmeans, + int *__restrict__ parts, + double *eigVals, + double *eigVecs); + +template NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + const int *__restrict__ parts, + float &edgeCut, + float &cost); +template NVGRAPH_ERROR analyzePartition( + cugraph::experimental::GraphCSRView const &graph, + int nParts, + const int *__restrict__ parts, + double &edgeCut, + double &cost); + +} // namespace nvgraph diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp new file mode 100644 index 0000000000..c77bb8e5a0 --- /dev/null +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -0,0 +1,1185 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// #include +// #include +// #include +// #include +// #include + +#include + +// CUDA block size +#define BLOCK_SIZE 1024 + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + +namespace raft { +namespace matrix { + void check_size(size_t sz) + { + if (sz > INT_MAX) FatalError("Vector larger than INT_MAX", ERR_BAD_PARAMETERS); + } + template + void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec); + *res = thrust::reduce(dev_ptr, dev_ptr + n); + cudaCheckError(); + } + + template + void fill_raw_vec(ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec); + thrust::fill(dev_ptr, dev_ptr + n, value); + cudaCheckError(); + } + + template + void dump_raw_vec(ValueType_* vec, size_t n, int offset, cudaStream_t stream) + { +#ifdef DEBUG + thrust::device_ptr dev_ptr(vec); + COUT().precision(15); + COUT() << "sample size = " << n << ", offset = " << offset << std::endl; + thrust::copy( + dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(COUT(), " ")); + cudaCheckError(); + COUT() << std::endl; +#endif + } + + template + __global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) + { + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) { + if (vec[r] != 0.0) + flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) + else + flags[r] = 0; + } + } + template + __global__ void dmv0_kernel(const ValueType_* __restrict__ D, + const ValueType_* __restrict__ x, + ValueType_* __restrict__ y, + int n) + { + // y=D*x + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] = D[i] * x[i]; + } + template + __global__ void dmv1_kernel(const ValueType_* __restrict__ D, + const ValueType_* __restrict__ x, + ValueType_* __restrict__ y, + int n) + { + // y+=D*x + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] += D[i] * x[i]; + } + template + void copy_vec(ValueType_* vec1, size_t n, ValueType_* res, cudaStream_t stream) + { + thrust::device_ptr dev_ptr(vec1); + thrust::device_ptr res_ptr(res); +#ifdef DEBUG + // COUT() << "copy "<< n << " elements" << std::endl; +#endif + thrust::copy_n(dev_ptr, n, res_ptr); + cudaCheckError(); + // dump_raw_vec (res, n, 0); + } + + template + void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) + { + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; + check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); + flag_zeroes_kernel<<>>(num_vertices, vec, flags); + cudaCheckError(); + } + + template + void dmv(size_t num_vertices, + ValueType_ alpha, + ValueType_* D, + ValueType_* x, + ValueType_ beta, + ValueType_* y, + cudaStream_t stream) + { + int items_per_thread = 4; + int num_threads = 128; + int max_grid_size = 4096; 
+ check_size(num_vertices); + int n = static_cast(num_vertices); + int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); + if (alpha == 1.0 && beta == 0.0) + dmv0_kernel<<>>(D, x, y, n); + else if (alpha == 1.0 && beta == 1.0) + dmv1_kernel<<>>(D, x, y, n); + else + FatalError("Not implemented case of y = D*x", ERR_BAD_PARAMETERS); + + cudaCheckError(); + } + + template + void set_connectivity(size_t n, + IndexType_ root, + ValueType_ self_loop_val, + ValueType_ unreachable_val, + ValueType_* res, + cudaStream_t stream) + { + fill_raw_vec(res, n, unreachable_val); + cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); + cudaCheckError(); + } + + + /*! A Vector contains a device vector of size |E| and type T + */ + template + class Vector { + public: + typedef ValueType_ ValueType; + + protected: + rmm::device_vector values; + + public: + /*! Construct an empty \p Vector. + */ + Vector(void) {} + ~Vector(void) {} + /*! Construct a \p Vector of size vertices. + * + * \param vertices The size of the Vector + */ + Vector(size_t vertices, cudaStream_t stream = 0) + : values(vertices) {} + + size_t get_size() const { return values.size(); } + size_t bytes() const { return values.size()*sizeof(ValueType);} + ValueType const *raw() const { return values.data().get(); } + ValueType *raw() { return values.data().get(); } + + void allocate(size_t n, cudaStream_t stream = 0) + { + values.resize(n); + } + + void fill(ValueType val, cudaStream_t stream = 0) + { + fill_raw_vec(this->raw(), this->get_size(), val, stream); + } + + void copy(Vector &vec1, cudaStream_t stream = 0) + { + if (this->get_size() == 0 && vec1.get_size()>0) { + allocate(vec1.get_size(), stream); + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + } else if (this->get_size() == vec1.get_size()) + copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); + else if (this->get_size() > vec1.get_size()) { + copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); + } else { + FatalError("Cannot copy a vector into a smaller one", ERR_BAD_PARAMETERS); + } + } + + ValueType nrm1(cudaStream_t stream = 0) { + ValueType res = 0; + nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); + return res; + } + }; // class Vector + + /// Abstract matrix class + /** Derived classes must implement matrix-vector products. + */ + template + class Matrix { + public: + /// Number of rows + const IndexType_ m; + /// Number of columns + const IndexType_ n; + /// CUDA stream + cudaStream_t s; + + /// Constructor + /** @param _m Number of rows. + * @param _n Number of columns. + */ + Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} + + /// Destructor + virtual ~Matrix() {} + + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s) = 0; + virtual void getCUDAStream(cudaStream_t *_s) = 0; + + /// Matrix-vector product + /** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output + * vector. 
+ */ + virtual void mv(ValueType_ alpha, + const ValueType_ * __restrict__ x, + ValueType_ beta, + ValueType_ * __restrict__ y) const = 0; + + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const = 0; + virtual void reorder(IndexType_ *p) const = 0; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M) = 0; + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const = 0; + }; + + /// Dense matrix class + template + class DenseMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Matrix entries, stored column-major in device memory + const ValueType_ * A; + /// Leading dimension of matrix entry array + const IndexType_ lda; + + public: + /// Constructor + DenseMatrix(bool _trans, + IndexType_ _m, IndexType_ _n, + const ValueType_ * _A, IndexType_ _lda); + + /// Destructor + virtual ~DenseMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Sparse matrix class in CSR format + template + class CsrMatrix : public Matrix { + + private: + /// Whether to transpose matrix + const bool trans; + /// Whether matrix is stored in symmetric format + const bool sym; + /// Number of non-zero entries + const IndexType_ nnz; + /// Matrix properties + const cusparseMatDescr_t descrA; + /// Matrix entry values (device memory) + /*const*/ ValueType_ * csrValA; + /// Pointer to first entry in each row (device memory) + const IndexType_ * csrRowPtrA; + /// Column index of each matrix entry (device memory) + const IndexType_ * csrColIndA; + /// Analysis info (pointer to opaque CUSPARSE struct) + cusparseSolveAnalysisInfo_t info_l; + cusparseSolveAnalysisInfo_t info_u; + /// factored flag (originally set to false, then reset to true after factorization), + /// notice we only want to factor once + bool factored; + + public: + /// Constructor + CsrMatrix(bool _trans, bool _sym, + IndexType_ _m, IndexType_ _n, IndexType_ _nnz, + const cusparseMatDescr_t _descrA, + /*const*/ ValueType_ * _csrValA, + const IndexType_ * _csrRowPtrA, + const IndexType_ * _csrColIndA); + + /// Destructor + virtual ~CsrMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set 
of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Incomplete Cholesky (setup, factor and solve) + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Graph Laplacian matrix + template + class LaplacianMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + LaplacianMatrix(/*const*/ Matrix & _A); + + /// Destructor + virtual ~LaplacianMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + + /// Modularity matrix + template + class ModularityMatrix + : public Matrix { + + private: + /// Adjacency matrix + /*const*/ Matrix * A; + /// Degree of each vertex + Vector D; + IndexType_ nnz; + ValueType_ edge_sum; + + /// Preconditioning matrix + Matrix * M; + + public: + /// Constructor + ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); + + /// Destructor + virtual ~ModularityMatrix(); + + /// Get and Set CUDA stream + virtual void setCUDAStream(cudaStream_t _s); + virtual void getCUDAStream(cudaStream_t *_s); + + /// Matrix-vector product + virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, + ValueType_ beta, ValueType_ * __restrict__ y) const; + /// Matrix-set of k vectors product + virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Scale a set of k vectors by a diagonal + virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; + + /// Color and Reorder + virtual void color(IndexType_ *c, IndexType_ *p) const; + virtual void reorder(IndexType_ *p) const; + + /// Solve preconditioned system M x = f for a set of k vectors + virtual void prec_setup(Matrix * _M); + virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; + + //Get the sum of all edges + virtual ValueType_ getEdgeSum() const; + }; + +// ============================================= 
+// CUDA kernels +// ============================================= + +namespace { + +/// Apply diagonal matrix to vector +template +static __global__ void diagmv(IndexType_ n, + ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ *__restrict__ y) +{ + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + y[i] += alpha * D[i] * x[i]; + i += blockDim.x * gridDim.x; + } +} + +/// Apply diagonal matrix to a set of dense vectors (tall matrix) +template +static __global__ void diagmm(IndexType_ n, + IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) +{ + IndexType_ i, j, index; + + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < k; j += blockDim.y * gridDim.y) { + for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { + index = i + j * n; + if (beta_is_zero) { + y[index] = alpha * D[i] * x[index]; + } else { + y[index] = alpha * D[i] * x[index] + beta * y[index]; + } + } + } +} +} // namespace + +// ============================================= +// Dense matrix class +// ============================================= + +/// Constructor for dense matrix class +/** @param _trans Whether to transpose matrix. + * @param _m Number of rows. + * @param _n Number of columns. + * @param _A (Input, device memory, _m*_n entries) Matrix + * entries, stored column-major. + * @param _lda Leading dimension of _A. + */ +template +DenseMatrix::DenseMatrix( + bool _trans, IndexType_ _m, IndexType_ _n, const ValueType_ *_A, IndexType_ _lda) + : Matrix(_m, _n), trans(_trans), A(_A), lda(_lda) +{ + Cublas::set_pointer_mode_host(); + if (_lda < _m) FatalError("invalid dense matrix parameter (lda +DenseMatrix::~DenseMatrix() +{ +} + +/// Get and Set CUDA stream +template +void DenseMatrix::setCUDAStream(cudaStream_t _s) +{ + this->s = _s; + // printf("DenseMatrix setCUDAStream stream=%p\n",this->s); + Cublas::setStream(_s); +} +template +void DenseMatrix::getCUDAStream(cudaStream_t *_s) +{ + *_s = this->s; + // CHECK_CUBLAS(cublasGetStream(cublasHandle, _s)); +} + +/// Matrix-vector product for dense matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. 
+ */ +template +void DenseMatrix::mv(ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + Cublas::gemv(this->trans, this->m, this->n, &alpha, this->A, this->lda, x, 1, &beta, y, 1); +} + +template +void DenseMatrix::mm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + Cublas::gemm( + this->trans, false, this->m, k, this->n, &alpha, A, lda, x, this->m, &beta, y, this->n); +} + +/// Color and Reorder +template +void DenseMatrix::color(IndexType_ *c, IndexType_ *p) const +{ +} + +template +void DenseMatrix::reorder(IndexType_ *p) const +{ +} + +/// Incomplete Cholesky (setup, factor and solve) +template +void DenseMatrix::prec_setup(Matrix *_M) +{ + printf("ERROR: DenseMatrix prec_setup dispacthed\n"); + // exit(1); +} + +template +void DenseMatrix::prec_solve(IndexType_ k, + ValueType_ alpha, + ValueType_ *__restrict__ fx, + ValueType_ *__restrict__ t) const +{ + printf("ERROR: DenseMatrix prec_solve dispacthed\n"); + // exit(1); +} + +template +ValueType_ DenseMatrix::getEdgeSum() const +{ + return 0.0; +} + +// ============================================= +// CSR matrix class +// ============================================= + +/// Constructor for CSR matrix class +/** @param _transA Whether to transpose matrix. + * @param _m Number of rows. + * @param _n Number of columns. + * @param _nnz Number of non-zero entries. + * @param _descrA Matrix properties. + * @param _csrValA (Input, device memory, _nnz entries) Matrix + * entry values. + * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer + * to first entry in each row. + * @param _csrColIndA (Input, device memory, _nnz entries) Column + * index of each matrix entry. 
+ */
+template
+CsrMatrix::CsrMatrix(bool _trans,
+                     bool _sym,
+                     IndexType_ _m,
+                     IndexType_ _n,
+                     IndexType_ _nnz,
+                     const cusparseMatDescr_t _descrA,
+                     /*const*/ ValueType_ *_csrValA,
+                     const IndexType_ *_csrRowPtrA,
+                     const IndexType_ *_csrColIndA)
+  : Matrix(_m, _n),
+    trans(_trans),
+    sym(_sym),
+    nnz(_nnz),
+    descrA(_descrA),
+    csrValA(_csrValA),
+    csrRowPtrA(_csrRowPtrA),
+    csrColIndA(_csrColIndA)
+{
+  if (nnz < 0) FatalError("invalid CSR matrix parameter (nnz<0)", NVGRAPH_ERR_BAD_PARAMETERS);
+  Cusparse::set_pointer_mode_host();
+}
+
+/// Destructor for CSR matrix class
+template
+CsrMatrix::~CsrMatrix()
+{
+}
+
+/// Get and Set CUDA stream
+template
+void CsrMatrix::setCUDAStream(cudaStream_t _s)
+{
+  this->s = _s;
+  // printf("CsrMatrix setCUDAStream stream=%p\n",this->s);
+  Cusparse::setStream(_s);
+}
+template
+void CsrMatrix::getCUDAStream(cudaStream_t *_s)
+{
+  *_s = this->s;
+  // CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s));
+}
+template
+void CsrMatrix::mm(IndexType_ k,
+                   ValueType_ alpha,
+                   const ValueType_ *__restrict__ x,
+                   ValueType_ beta,
+                   ValueType_ *__restrict__ y) const
+{
+  // CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha,
+  // descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m));
+  Cusparse::csrmm(this->trans,
+                  this->sym,
+                  this->m,
+                  k,
+                  this->n,
+                  this->nnz,
+                  &alpha,
+                  this->csrValA,
+                  this->csrRowPtrA,
+                  this->csrColIndA,
+                  x,
+                  this->n,
+                  &beta,
+                  y,
+                  this->m);
+}
+
+/// Color and Reorder
+template
+void CsrMatrix::color(IndexType_ *c, IndexType_ *p) const
+{
+}
+
+template
+void CsrMatrix::reorder(IndexType_ *p) const
+{
+}
+
+/// Incomplete Cholesky (setup, factor and solve)
+template
+void CsrMatrix::prec_setup(Matrix *_M)
+{
+  // printf("CsrMatrix prec_setup dispatched\n");
+  if (!factored) {
+    // analyse lower triangular factor
+    CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l));
+    CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER));
+    CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT));
+    CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           this->m,
+                                           nnz,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           info_l));
+    // analyse upper triangular factor
+    CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u));
+    CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER));
+    CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT));
+    CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(),
+                                           CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           this->m,
+                                           nnz,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           info_u));
+    // perform csrilu0 (should be slightly faster than csric0)
+    CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(),
+                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                    this->m,
+                                    descrA,
+                                    csrValA,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    info_l));
+    // set factored flag to true
+    factored = true;
+  }
+}
+
+template
+void CsrMatrix::prec_solve(IndexType_ k,
+                           ValueType_ alpha,
+                           ValueType_ *__restrict__ fx,
+                           ValueType_ *__restrict__ t) const
+{
+  // printf("CsrMatrix prec_solve dispatched (stream %p)\n",this->s);
+
+  // preconditioning Mx=f (where M = L*U, therefore x=U\(L\f))
+  // solve lower triangular factor
+  CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER));
+  CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT));
+  CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(),
+                                      CUSPARSE_OPERATION_NON_TRANSPOSE,
+
this->m, + k, + alpha, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + info_l, + fx, + this->m, + t, + this->m)); + // solve upper triangular factor + CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); + CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); + CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + this->m, + k, + alpha, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + info_u, + t, + this->m, + fx, + this->m)); +} + +/// Matrix-vector product for CSR matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. + */ +template +void CsrMatrix::mv(ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + // TODO: consider using merge-path csrmv + Cusparse::csrmv(this->trans, + this->sym, + this->m, + this->n, + this->nnz, + &alpha, + this->csrValA, + this->csrRowPtrA, + this->csrColIndA, + x, + &beta, + y); +} + +template +ValueType_ CsrMatrix::getEdgeSum() const +{ + return 0.0; +} + +// ============================================= +// Laplacian matrix class +// ============================================= + +/// Constructor for Laplacian matrix class +/** @param A Adjacency matrix + */ +template +LaplacianMatrix::LaplacianMatrix( + /*const*/ Matrix &_A) + : Matrix(_A.m, _A.n), A(&_A) +{ + // Check that adjacency matrix is square + if (_A.m != _A.n) + FatalError("cannot construct Laplacian matrix from non-square adjacency matrix", + NVGRAPH_ERR_BAD_PARAMETERS); + // set CUDA stream + this->s = NULL; + // Construct degree matrix + D.allocate(_A.m, this->s); + Vector ones(this->n, this->s); + ones.fill(1.0); + _A.mv(1, ones.raw(), 0, D.raw()); + + // Set preconditioning matrix pointer to NULL + M = NULL; +} + +/// Destructor for Laplacian matrix class +template +LaplacianMatrix::~LaplacianMatrix() +{ +} + +/// Get and Set CUDA stream +template +void LaplacianMatrix::setCUDAStream(cudaStream_t _s) +{ + this->s = _s; + // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); + A->setCUDAStream(_s); + if (M != NULL) { M->setCUDAStream(_s); } +} +template +void LaplacianMatrix::getCUDAStream(cudaStream_t *_s) +{ + *_s = this->s; + // A->getCUDAStream(_s); +} + +/// Matrix-vector product for Laplacian matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. 
+ */ +template +void LaplacianMatrix::mv(ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + // Scale result vector + if (beta == 0) + CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) + else if (beta != 1) + thrust::transform(thrust::device_pointer_cast(y), + thrust::device_pointer_cast(y + this->n), + thrust::make_constant_iterator(beta), + thrust::device_pointer_cast(y), + thrust::multiplies()); + + // Apply diagonal matrix + dim3 gridDim, blockDim; + gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + diagmv<<s>>>(this->n, alpha, D.raw(), x, y); + cudaCheckError(); + + // Apply adjacency matrix + A->mv(-alpha, x, 1, y); +} +/// Matrix-vector product for Laplacian matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n*k entries) nxk dense matrix. + * @param beta Scalar. + * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. + */ +template +void LaplacianMatrix::mm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + // Apply diagonal matrix + ValueType_ one = (ValueType_)1.0; + this->dm(k, alpha, x, beta, y); + + // Apply adjacency matrix + A->mm(k, -alpha, x, one, y); +} + +template +void LaplacianMatrix::dm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + IndexType_ t = k * (this->n); + dim3 gridDim, blockDim; + + // setup launch parameters + gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + + // Apply diagonal matrix + if (beta == 0.0) { + // set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner + // case) + CHECK_CUDA(cudaMemset(y, 0, t * sizeof(ValueType_))); + diagmm + <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); + } else { + diagmm + <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); + } + cudaCheckError(); +} + +/// Color and Reorder +template +void LaplacianMatrix::color(IndexType_ *c, IndexType_ *p) const +{ +} + +template +void LaplacianMatrix::reorder(IndexType_ *p) const +{ +} + +/// Solve preconditioned system M x = f for a set of k vectors +template +void LaplacianMatrix::prec_setup(Matrix *_M) +{ + // save the pointer to preconditioner M + M = _M; + if (M != NULL) { + // setup the preconditioning matrix M + M->prec_setup(NULL); + } +} + +template +void LaplacianMatrix::prec_solve(IndexType_ k, + ValueType_ alpha, + ValueType_ *__restrict__ fx, + ValueType_ *__restrict__ t) const +{ + if (M != NULL) { + // preconditioning + M->prec_solve(k, alpha, fx, t); + } +} + +template +ValueType_ LaplacianMatrix::getEdgeSum() const +{ + return 0.0; +} +// ============================================= +// Modularity matrix class +// ============================================= + +/// Constructor for Modularity matrix class +/** @param A Adjacency matrix + */ +template +ModularityMatrix::ModularityMatrix( + /*const*/ Matrix &_A, IndexType_ _nnz) + : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz) +{ + // Check that adjacency matrix is square + if (_A.m != _A.n) + FatalError("cannot construct Modularity matrix from non-square adjacency matrix", + NVGRAPH_ERR_BAD_PARAMETERS); + + // set CUDA stream + 
this->s = NULL; + // Construct degree matrix + D.allocate(_A.m, this->s); + Vector ones(this->n, this->s); + ones.fill(1.0); + _A.mv(1, ones.raw(), 0, D.raw()); + // D.dump(0,this->n); + edge_sum = D.nrm1(); + + // Set preconditioning matrix pointer to NULL + M = NULL; +} + +/// Destructor for Modularity matrix class +template +ModularityMatrix::~ModularityMatrix() +{ +} + +/// Get and Set CUDA stream +template +void ModularityMatrix::setCUDAStream(cudaStream_t _s) +{ + this->s = _s; + // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); + A->setCUDAStream(_s); + if (M != NULL) { M->setCUDAStream(_s); } +} + +template +void ModularityMatrix::getCUDAStream(cudaStream_t *_s) +{ + *_s = this->s; + // A->getCUDAStream(_s); +} + +/// Matrix-vector product for Modularity matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n entries) Vector. + * @param beta Scalar. + * @param y (Input/output, device memory, m entries) Output vector. + */ +template +void ModularityMatrix::mv(ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + // Scale result vector + if (alpha != 1 || beta != 0) + FatalError("This isn't implemented for Modularity Matrix currently", + NVGRAPH_ERR_NOT_IMPLEMENTED); + + // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, + // double *result)); + // y = A*x + A->mv(alpha, x, 0, y); + ValueType_ dot_res; + // gamma = d'*x + Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + // y = y -(gamma/edge_sum)*d + Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); +} +/// Matrix-vector product for Modularity matrix class +/** y is overwritten with alpha*A*x+beta*y. + * + * @param alpha Scalar. + * @param x (Input, device memory, n*k entries) nxk dense matrix. + * @param beta Scalar. + * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
+ */ +template +void ModularityMatrix::mm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +template +void ModularityMatrix::dm(IndexType_ k, + ValueType_ alpha, + const ValueType_ *__restrict__ x, + ValueType_ beta, + ValueType_ *__restrict__ y) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +/// Color and Reorder +template +void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +template +void ModularityMatrix::reorder(IndexType_ *p) const +{ + FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED); +} + +/// Solve preconditioned system M x = f for a set of k vectors +template +void ModularityMatrix::prec_setup(Matrix *_M) +{ + // save the pointer to preconditioner M + M = _M; + if (M != NULL) { + // setup the preconditioning matrix M + M->prec_setup(NULL); + } +} + +template +void ModularityMatrix::prec_solve(IndexType_ k, + ValueType_ alpha, + ValueType_ *__restrict__ fx, + ValueType_ *__restrict__ t) const +{ + if (M != NULL) { + FatalError("This isn't implemented for Modularity Matrix currently", + NVGRAPH_ERR_NOT_IMPLEMENTED); + } +} + +template +ValueType_ ModularityMatrix::getEdgeSum() const +{ + return edge_sum; +} + +} // namespace matrix +} // namespace raft From cc4be83737c558bcd0e6edadfdac1bdeb9d1f9fc Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 13:15:16 -0500 Subject: [PATCH 04/88] Clean-up of matrix definitions and dummy error handling. 
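
The new error_temp.hpp macros are intentionally empty placeholders until the
real RAFT error handling is brought in. For reviewers, a minimal sketch of the
semantics the call sites below assume (illustrative only; the final exception
type, macro names, and message format may well differ):

    // Sketch of the intended placeholder semantics; not part of this patch.
    #include <sstream>
    #include <stdexcept>

    // RAFT_EXPECT(cond, reason): verify a precondition, fail loudly otherwise.
    #define RAFT_EXPECT(cond, reason)                                       \
      do {                                                                  \
        if (!(cond)) {                                                      \
          std::stringstream ss;                                             \
          ss << "RAFT failure at " << __FILE__ << ":" << __LINE__ << ": "   \
             << (reason);                                                   \
          throw std::logic_error(ss.str());                                 \
        }                                                                   \
      } while (0)

    // RAFT_FAIL(reason): unconditional failure.
    #define RAFT_FAIL(reason) RAFT_EXPECT(false, reason)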
---
 cpp/include/raft/spectral/error_temp.hpp      |  15 ++
 cpp/include/raft/spectral/spectral_matrix.hpp | 243 ++++--------------
 2 files changed, 65 insertions(+), 193 deletions(-)
 create mode 100644 cpp/include/raft/spectral/error_temp.hpp

diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp
new file mode 100644
index 0000000000..0cd58b1769
--- /dev/null
+++ b/cpp/include/raft/spectral/error_temp.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#define STRINGIFY_DETAIL(x) #x
+#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x)
+
+
+#define RAFT_EXPECT(cond, reason)
+
+#define RAFT_TRY(error_expression)
+
+#define RAFT_FAIL(reason)
+
+#define CUDA_TRY(call)
+
+#define CUDA_CHECK_LAST()
diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp
index c77bb8e5a0..86ef0ec89c 100644
--- a/cpp/include/raft/spectral/spectral_matrix.hpp
+++ b/cpp/include/raft/spectral/spectral_matrix.hpp
@@ -22,6 +22,21 @@
 // #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#ifdef DEBUG
+#include
+#include
+#endif
+
+#include "error_temp.hpp"  // TODO: replace w/ actual error handling to be brought in soon

 // CUDA block size
 #define BLOCK_SIZE 1024
@@ -33,7 +48,7 @@ namespace raft {
 namespace matrix {
   void check_size(size_t sz)
   {
-    if (sz > INT_MAX) FatalError("Vector larger than INT_MAX", ERR_BAD_PARAMETERS);
+    RAFT_EXPECT(sz <= INT_MAX, "Vector larger than INT_MAX");
   }
   template
   void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream)
@@ -56,12 +71,12 @@ namespace matrix {
   {
 #ifdef DEBUG
     thrust::device_ptr dev_ptr(vec);
-    COUT().precision(15);
-    COUT() << "sample size = " << n << ", offset = " << offset << std::endl;
+    std::cout.precision(15);
+    std::cout << "sample size = " << n << ", offset = " << offset << std::endl;
     thrust::copy(
       dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " "));
     cudaCheckError();
-    COUT() << std::endl;
+    std::cout << std::endl;
 #endif
   }
@@ -131,18 +146,18 @@ namespace matrix {
     ValueType_* y,
     cudaStream_t stream)
   {
+    RAFT_EXPECT((alpha == 1.0) && ((beta == 0.0) || (beta == 1.0)), "Not implemented case of y = D*x");
+
     int items_per_thread = 4;
     int num_threads = 128;
     int max_grid_size = 4096;
     check_size(num_vertices);
     int n = static_cast(num_vertices);
     int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1);
-    if (alpha == 1.0 && beta == 0.0)
+    if (beta == 0.0)
       dmv0_kernel<<>>(D, x, y, n);
-    else if (alpha == 1.0 && beta == 1.0)
+    else if (beta == 1.0)
       dmv1_kernel<<>>(D, x, y, n);
-    else
-      FatalError("Not implemented case of y = D*x", ERR_BAD_PARAMETERS);
     cudaCheckError();
   }
@@ -190,7 +205,7 @@ namespace matrix {
     void allocate(size_t n, cudaStream_t stream = 0)
     {
-      values.resize(n);
+      values.resize(n);  // TODO: delegate to outer allocator!
}

     void fill(ValueType val, cudaStream_t stream = 0)
     {
@@ -200,16 +215,15 @@ namespace matrix {

     void copy(Vector &vec1, cudaStream_t stream = 0)
     {
+      RAFT_EXPECT((get_size() == 0 && vec1.get_size() > 0) || (get_size() >= vec1.get_size()), "Cannot copy a vector into a smaller one");
       if (this->get_size() == 0 && vec1.get_size()>0) {
         allocate(vec1.get_size(), stream);
         copy_vec(vec1.raw(), this->get_size(), this->raw(), stream);
       } else if (this->get_size() == vec1.get_size())
         copy_vec(vec1.raw(), this->get_size(), this->raw(), stream);
-      else if (this->get_size() > vec1.get_size()) {
+      else // if (this->get_size() > vec1.get_size()) {
         copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream);
-      } else {
-        FatalError("Cannot copy a vector into a smaller one", ERR_BAD_PARAMETERS);
-      }
+      //}
     }

     ValueType nrm1(cudaStream_t stream = 0) {
@@ -273,49 +287,6 @@ namespace matrix {
     virtual ValueType_ getEdgeSum() const = 0;
   };

-  /// Dense matrix class
-  template
-  class DenseMatrix : public Matrix {
-
-  private:
-    /// Whether to transpose matrix
-    const bool trans;
-    /// Matrix entries, stored column-major in device memory
-    const ValueType_ * A;
-    /// Leading dimension of matrix entry array
-    const IndexType_ lda;
-
-  public:
-    /// Constructor
-    DenseMatrix(bool _trans,
-                IndexType_ _m, IndexType_ _n,
-                const ValueType_ * _A, IndexType_ _lda);
-
-    /// Destructor
-    virtual ~DenseMatrix();
-
-    /// Get and Set CUDA stream
-    virtual void setCUDAStream(cudaStream_t _s);
-    virtual void getCUDAStream(cudaStream_t *_s);
-
-    /// Matrix-vector product
-    virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x,
-                    ValueType_ beta, ValueType_ * __restrict__ y) const;
-    /// Matrix-set of k vectors product
-    virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const;
-
-    /// Color and Reorder
-    virtual void color(IndexType_ *c, IndexType_ *p) const;
-    virtual void reorder(IndexType_ *p) const;
-
-    /// Incomplete Cholesky (setup, factor and solve)
-    virtual void prec_setup(Matrix * _M);
-    virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const;
-
-    //Get the sum of all edges
-    virtual ValueType_ getEdgeSum() const;
-  };
-
   /// Sparse matrix class in CSR format
   template
   class CsrMatrix : public Matrix {
@@ -516,111 +487,6 @@ static __global__ void diagmm(IndexType_ n,
 }
 }  // namespace

-// =============================================
-// Dense matrix class
-// =============================================
-
-/// Constructor for dense matrix class
-/** @param _trans Whether to transpose matrix.
- * @param _m Number of rows.
- * @param _n Number of columns.
- * @param _A (Input, device memory, _m*_n entries) Matrix
- * entries, stored column-major.
- * @param _lda Leading dimension of _A.
- */ -template -DenseMatrix::DenseMatrix( - bool _trans, IndexType_ _m, IndexType_ _n, const ValueType_ *_A, IndexType_ _lda) - : Matrix(_m, _n), trans(_trans), A(_A), lda(_lda) -{ - Cublas::set_pointer_mode_host(); - if (_lda < _m) FatalError("invalid dense matrix parameter (lda -DenseMatrix::~DenseMatrix() -{ -} - -/// Get and Set CUDA stream -template -void DenseMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("DenseMatrix setCUDAStream stream=%p\n",this->s); - Cublas::setStream(_s); -} -template -void DenseMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUBLAS(cublasGetStream(cublasHandle, _s)); -} - -/// Matrix-vector product for dense matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void DenseMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemv(this->trans, this->m, this->n, &alpha, this->A, this->lda, x, 1, &beta, y, 1); -} - -template -void DenseMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemm( - this->trans, false, this->m, k, this->n, &alpha, A, lda, x, this->m, &beta, y, this->n); -} - -/// Color and Reorder -template -void DenseMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void DenseMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void DenseMatrix::prec_setup(Matrix *_M) -{ - printf("ERROR: DenseMatrix prec_setup dispacthed\n"); - // exit(1); -} - -template -void DenseMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - printf("ERROR: DenseMatrix prec_solve dispacthed\n"); - // exit(1); -} - -template -ValueType_ DenseMatrix::getEdgeSum() const -{ - return 0.0; -} - // ============================================= // CSR matrix class // ============================================= @@ -657,7 +523,7 @@ CsrMatrix::CsrMatrix(bool _trans, csrRowPtrA(_csrRowPtrA), csrColIndA(_csrColIndA) { - if (nnz < 0) FatalError("invalid CSR matrix parameter (nnz<0)", NVGRAPH_ERR_BAD_PARAMETERS); + RAFT_EXPECT(nnz >= 0, "invalid CSR matrix parameter (nnz<0)"); Cusparse::set_pointer_mode_host(); } @@ -857,19 +723,17 @@ LaplacianMatrix::LaplacianMatrix( : Matrix(_A.m, _A.n), A(&_A) { // Check that adjacency matrix is square - if (_A.m != _A.n) - FatalError("cannot construct Laplacian matrix from non-square adjacency matrix", - NVGRAPH_ERR_BAD_PARAMETERS); + RAFT_EXPECT(_A.m == _A.n, "cannot construct Laplacian matrix from non-square adjacency matrix"); // set CUDA stream - this->s = NULL; + this->s = nullptr; // Construct degree matrix D.allocate(_A.m, this->s); Vector ones(this->n, this->s); ones.fill(1.0); _A.mv(1, ones.raw(), 0, D.raw()); - // Set preconditioning matrix pointer to NULL - M = NULL; + // Set preconditioning matrix pointer to nullptr + M = nullptr; } /// Destructor for Laplacian matrix class @@ -885,7 +749,7 @@ void LaplacianMatrix::setCUDAStream(cudaStream_t _s) this->s = _s; // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); A->setCUDAStream(_s); - if (M != NULL) { M->setCUDAStream(_s); } + if (M != nullptr) { M->setCUDAStream(_s); } } template void 
LaplacianMatrix::getCUDAStream(cudaStream_t *_s)
@@ -1004,9 +868,9 @@ void LaplacianMatrix::prec_setup(Matrix *_M)
 {
   // save the pointer to preconditioner M
   M = _M;
-  if (M != NULL) {
+  if (M != nullptr) {
     // setup the preconditioning matrix M
-    M->prec_setup(NULL);
+    M->prec_setup(nullptr);
   }
 }

@@ -1016,7 +880,7 @@ void LaplacianMatrix::prec_solve(IndexType_ k,
                                  ValueType_ *__restrict__ fx,
                                  ValueType_ *__restrict__ t) const
 {
-  if (M != NULL) {
+  if (M != nullptr) {
     // preconditioning
     M->prec_solve(k, alpha, fx, t);
   }
@@ -1040,12 +904,10 @@ ModularityMatrix::ModularityMatrix(
   : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz)
 {
   // Check that adjacency matrix is square
-  if (_A.m != _A.n)
-    FatalError("cannot construct Modularity matrix from non-square adjacency matrix",
-               NVGRAPH_ERR_BAD_PARAMETERS);
+  RAFT_EXPECT(_A.m == _A.n, "cannot construct Modularity matrix from non-square adjacency matrix");

   // set CUDA stream
-  this->s = NULL;
+  this->s = nullptr;
   // Construct degree matrix
   D.allocate(_A.m, this->s);
   Vector ones(this->n, this->s);
@@ -1054,8 +916,8 @@ ModularityMatrix::ModularityMatrix(
   // D.dump(0,this->n);
   edge_sum = D.nrm1();

-  // Set preconditioning matrix pointer to NULL
-  M = NULL;
+  // Set preconditioning matrix pointer to nullptr
+  M = nullptr;
 }

 /// Destructor for Modularity matrix class
@@ -1071,7 +933,7 @@ void ModularityMatrix::setCUDAStream(cudaStream_t _s)
   this->s = _s;
   // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s);
   A->setCUDAStream(_s);
-  if (M != NULL) { M->setCUDAStream(_s); }
+  if (M != nullptr) { M->setCUDAStream(_s); }
 }

 template
@@ -1096,9 +958,7 @@ void ModularityMatrix::mv(ValueType_ alpha,
                           ValueType_ *__restrict__ y) const
 {
   // Scale result vector
-  if (alpha != 1 || beta != 0)
-    FatalError("This isn't implemented for Modularity Matrix currently",
-               NVGRAPH_ERR_NOT_IMPLEMENTED);
+  RAFT_EXPECT(alpha == 1 && beta == 0, "Modularity Matrix mv currently supports only alpha == 1, beta == 0");

   // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy,
   // double *result));
@@ -1125,7 +985,7 @@ void ModularityMatrix::mm(IndexType_ k,
                           ValueType_ beta,
                           ValueType_ *__restrict__ y) const
 {
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
+  RAFT_FAIL("Functionality not currently supported in Modularity Matrix.");
 }

 template
@@ -1135,20 +995,20 @@ void ModularityMatrix::dm(IndexType_ k,
                           ValueType_ beta,
                           ValueType_ *__restrict__ y) const
 {
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
+  RAFT_FAIL("Functionality not currently supported in Modularity Matrix.");
 }

 /// Color and Reorder
 template
 void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const
 {
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
+  RAFT_FAIL("Functionality not currently supported in Modularity Matrix.");
 }

 template
 void ModularityMatrix::reorder(IndexType_ *p) const
 {
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
+  RAFT_FAIL("Functionality not currently supported in Modularity Matrix.");
 }

 /// Solve preconditioned system M x = f for a set of k vectors
@@ -1157,9 +1017,9 @@ void ModularityMatrix::prec_setup(Matrix *_M)
 {
   // save the pointer to preconditioner M
   M = _M;
-  if (M != NULL) {
+  if (M != nullptr) {
     // setup the preconditioning matrix M
-    M->prec_setup(NULL);
+    M->prec_setup(nullptr);
   }
 }

@@ -1169,10 +1029,7 @@ void ModularityMatrix::prec_solve(IndexType_ k,
                                   ValueType_ *__restrict__ fx,
                                   ValueType_ *__restrict__ t) const
 {
-  if (M != NULL) {
-    FatalError("This isn't implemented for Modularity Matrix currently",
-               NVGRAPH_ERR_NOT_IMPLEMENTED);
-  }
+  RAFT_EXPECT(M == nullptr, "Functionality not currently
supported in Modularity Matrix."); } template From f3e8862d30491386bdf2c3f0814ac42584a35a5d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 13:20:23 -0500 Subject: [PATCH 05/88] Clean-up of some error checking. --- cpp/include/raft/spectral/spectral_matrix.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp index 86ef0ec89c..eda7af120c 100644 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -55,7 +55,7 @@ namespace matrix { { thrust::device_ptr dev_ptr(vec); *res = thrust::reduce(dev_ptr, dev_ptr + n); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -63,7 +63,7 @@ namespace matrix { { thrust::device_ptr dev_ptr(vec); thrust::fill(dev_ptr, dev_ptr + n, value); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -75,7 +75,7 @@ namespace matrix { std::cout << "sample size = " << n << ", offset = " << offset << std::endl; thrust::copy( dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); - cudaCheckError(); + CUDA_CHECK_LAST(); std::cout << std::endl; #endif } @@ -120,7 +120,7 @@ namespace matrix { // COUT() << "copy "<< n << " elements" << std::endl; #endif thrust::copy_n(dev_ptr, n, res_ptr); - cudaCheckError(); + CUDA_CHECK_LAST(); // dump_raw_vec (res, n, 0); } @@ -134,7 +134,7 @@ namespace matrix { int n = static_cast(num_vertices); int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); flag_zeroes_kernel<<>>(num_vertices, vec, flags); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -159,7 +159,7 @@ namespace matrix { else if (beta == 1.0) dmv1_kernel<<>>(D, x, y, n); - cudaCheckError(); + CUDA_CHECK_LAST(); } template @@ -172,7 +172,7 @@ namespace matrix { { fill_raw_vec(res, n, unreachable_val); cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); - cudaCheckError(); + CUDA_CHECK_LAST(); } @@ -791,7 +791,7 @@ void LaplacianMatrix::mv(ValueType_ alpha, blockDim.y = 1; blockDim.z = 1; diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - cudaCheckError(); + CUDA_CHECK_LAST(); // Apply adjacency matrix A->mv(-alpha, x, 1, y); @@ -848,7 +848,7 @@ void LaplacianMatrix::dm(IndexType_ k, diagmm <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); } - cudaCheckError(); + CUDA_CHECK_LAST(); } /// Color and Reorder From 56ddbb5d08c51960a7cbdb3328fcf8f9d1a53699 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 3 Jun 2020 16:39:51 -0500 Subject: [PATCH 06/88] Pulling GraphCSRView from cugraph. --- cpp/include/raft/graph.hpp | 575 ++++++++++++++++++ cpp/include/raft/spectral/spectral_matrix.hpp | 2 + 2 files changed, 577 insertions(+) create mode 100644 cpp/include/raft/graph.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp new file mode 100644 index 0000000000..d7b1a2838a --- /dev/null +++ b/cpp/include/raft/graph.hpp @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; + +struct GraphProperties { + bool directed{false}; + bool weighted{false}; + bool multigraph{false}; + bool bipartite{false}; + bool tree{false}; + PropType has_negative_edges{PropType::PROP_UNDEF}; + GraphProperties() = default; +}; + +enum class DegreeDirection { + IN_PLUS_OUT = 0, ///> Compute sum of in and out degree + IN, ///> Compute in degree + OUT, ///> Compute out degree + DEGREE_DIRECTION_COUNT +}; + +/** + * @brief Base class graphs, all but vertices and edges + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphViewBase { + public: + WT *edge_data; ///< edge weight + Comm comm; + + GraphProperties prop; + + VT number_of_vertices; + ET number_of_edges; + + /** + * @brief Fill the identifiers array with the vertex identifiers. + * + * @param[out] identifier Pointer to device memory to store the vertex + * identifiers + */ + void get_vertex_identifiers(VT *identifiers) const; + void set_communicator(Comm &comm_) { comm = comm_; } + + GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : edge_data(edge_data_), + comm(), + prop(), + number_of_vertices(number_of_vertices_), + number_of_edges(number_of_edges_) + { + } + bool has_data(void) const { return edge_data != nullptr; } +}; + +/** + * @brief A graph stored in COO (COOrdinate) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCOOView : public GraphViewBase { + public: + VT *src_indices{nullptr}; ///< rowInd + VT *dst_indices{nullptr}; ///< colInd + + /** + * @brief Computes degree(in, out, in+out) of all the nodes of a Graph + * + * @throws cugraph::logic_error when an error occurs. + * + * @param[out] degree Device array of size V (V is number of vertices) initialized + * to zeros. Will contain the computed degree of every vertex. + * @param[in] direction IN_PLUS_OUT, IN or OUT + */ + void degree(ET *degree, DegreeDirection direction) const; + + /** + * @brief Default constructor + */ + GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing an edge list in a Graph. + * + * GraphCOOView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge. Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. 
+ * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCOOView( + VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + src_indices(src_indices_), + dst_indices(dst_indices_) + { + } +}; + +/** + * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed + * Sparse Column) format + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCompressedSparseBaseView : public GraphViewBase { + public: + ET *offsets{nullptr}; ///< CSR offsets + VT *indices{nullptr}; ///< CSR indices + + /** + * @brief Fill the identifiers in the array with the source vertex + * identifiers + * + * @param[out] src_indices Pointer to device memory to store the + * source vertex identifiers + */ + void get_source_indices(VT *src_indices) const; + + /** + * @brief Computes degree(in, out, in+out) of all the nodes of a Graph + * + * @throws cugraph::logic_error when an error occurs. + * + * @param[out] degree Device array of size V (V is number of vertices) initialized + * to zeros. Will contain the computed degree of every vertex. + * @param[in] x Integer value indicating type of degree calculation + * 0 : in+out degree + * 1 : in-degree + * 2 : out-degree + */ + void degree(ET *degree, DegreeDirection direction) const; + + /** + * @brief Wrap existing arrays representing adjacency lists in a Graph. + * GraphCSRView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCompressedSparseBaseView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + offsets{offsets_}, + indices{indices_} + { + } +}; + +/** + * @brief A graph stored in CSR (Compressed Sparse Row) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSRView : public GraphCompressedSparseBaseView { + public: + /** + * @brief Default constructor + */ + GraphCSRView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing adjacency lists in a Graph. + * GraphCSRView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. 
+ * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSRView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBaseView( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } +}; + +/** + * @brief A graph stored in CSC (Compressed Sparse Column) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSCView : public GraphCompressedSparseBaseView { + public: + /** + * @brief Default constructor + */ + GraphCSCView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + + /** + * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. + * GraphCSCView does not own the memory used to represent this graph. This + * function does not allocate memory. + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSCView( + ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + : GraphCompressedSparseBaseView( + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + { + } +}; + +/** + * @brief TODO : Change this Take ownership of the provided graph arrays in COO format + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge. Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ +template +struct GraphCOOContents { + VT number_of_vertices; + ET number_of_edges; + std::unique_ptr src_indices; + std::unique_ptr dst_indices; + std::unique_ptr edge_data; +}; + +/** + * @brief A constructed graph stored in COO (COOrdinate) format. + * + * This class will src_indices and dst_indicies (until moved) + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCOO { + VT number_of_vertices_; + ET number_of_edges_; + rmm::device_buffer src_indices_{}; ///< rowInd + rmm::device_buffer dst_indices_{}; ///< colInd + rmm::device_buffer edge_data_{}; ///< CSR data + + public: + /** + * @brief Take ownership of the provided graph arrays in COO format + * + * @param source_indices This array of size E (number of edges) contains the index of the + * source for each edge. 
Indices must be in the range [0, V-1]. + * @param destination_indices This array of size E (number of edges) contains the index of the + * destination for each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array size E (number of edges) contains the weight for each + * edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCOO(VT number_of_vertices, + ET number_of_edges, + bool has_data = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : number_of_vertices_(number_of_vertices), + number_of_edges_(number_of_edges), + src_indices_(sizeof(VT) * number_of_edges, stream, mr), + dst_indices_(sizeof(VT) * number_of_edges, stream, mr), + edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + { + } + + GraphCOO(GraphCOOView const &graph, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : number_of_vertices_(graph.number_of_vertices), + number_of_edges_(graph.number_of_edges), + src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), stream, mr), + dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), stream, mr) + { + if (graph.has_data()) { + edge_data_ = + rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; + } + } + + VT number_of_vertices(void) { return number_of_vertices_; } + ET number_of_edges(void) { return number_of_edges_; } + VT *src_indices(void) { return static_cast(src_indices_.data()); } + VT *dst_indices(void) { return static_cast(dst_indices_.data()); } + WT *edge_data(void) { return static_cast(edge_data_.data()); } + + GraphCOOContents release() noexcept + { + VT number_of_vertices = number_of_vertices_; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; + return GraphCOOContents{ + number_of_vertices, + number_of_edges, + std::make_unique(std::move(src_indices_)), + std::make_unique(std::move(dst_indices_)), + std::make_unique(std::move(edge_data_))}; + } + + GraphCOOView view(void) noexcept + { + return GraphCOOView( + src_indices(), dst_indices(), edge_data(), number_of_vertices_, number_of_edges_); + } + + bool has_data(void) { return nullptr != edge_data_.data(); } +}; + +template +struct GraphSparseContents { + VT number_of_vertices; + ET number_of_edges; + std::unique_ptr offsets; + std::unique_ptr indices; + std::unique_ptr edge_data; +}; + +/** + * @brief Base class for constructted graphs stored in CSR (Compressed Sparse Row) format or + * CSC (Compressed Sparse Column) format + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCompressedSparseBase { + VT number_of_vertices_{0}; + ET number_of_edges_{0}; + rmm::device_buffer offsets_{}; ///< CSR offsets + rmm::device_buffer indices_{}; ///< CSR indices + rmm::device_buffer edge_data_{}; ///< CSR data + + bool has_data_{false}; + + public: + /** + * @brief Take ownership of the provided graph arrays in CSR/CSC format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. 
Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCompressedSparseBase(VT number_of_vertices, + ET number_of_edges, + bool has_data, + cudaStream_t stream, + rmm::mr::device_memory_resource *mr) + : number_of_vertices_(number_of_vertices), + number_of_edges_(number_of_edges), + offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), + indices_(sizeof(VT) * number_of_edges, stream, mr), + edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + { + } + + GraphCompressedSparseBase(GraphSparseContents &&contents) + : number_of_vertices_(contents.number_of_vertices), + number_of_edges_(contents.number_of_edges), + offsets_(std::move(*contents.offsets.release())), + indices_(std::move(*contents.indices.release())), + edge_data_(std::move(*contents.edge_data.release())) + { + } + + VT number_of_vertices(void) { return number_of_vertices_; } + ET number_of_edges(void) { return number_of_edges_; } + ET *offsets(void) { return static_cast(offsets_.data()); } + VT *indices(void) { return static_cast(indices_.data()); } + WT *edge_data(void) { return static_cast(edge_data_.data()); } + + GraphSparseContents release() noexcept + { + VT number_of_vertices = number_of_vertices_; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; + return GraphSparseContents{ + number_of_vertices, + number_of_edges, + std::make_unique(std::move(offsets_)), + std::make_unique(std::move(indices_)), + std::make_unique(std::move(edge_data_))}; + } + + bool has_data(void) { return nullptr != edge_data_.data(); } +}; + +/** + * @brief A constructed graph stored in CSR (Compressed Sparse Row) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSR : public GraphCompressedSparseBase { + public: + /** + * @brief Default constructor + */ + GraphCSR() : GraphCompressedSparseBase() {} + + /** + * @brief Take ownership of the provided graph arrays in CSR format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. 
+ * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSR(VT number_of_vertices_, + ET number_of_edges_, + bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : GraphCompressedSparseBase( + number_of_vertices_, number_of_edges_, has_data_, stream, mr) + { + } + + GraphCSR(GraphSparseContents &&contents) + : GraphCompressedSparseBase(std::move(contents)) + { + } + + GraphCSRView view(void) noexcept + { + return GraphCSRView(GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); + } +}; + +/** + * @brief A constructed graph stored in CSC (Compressed Sparse Column) format. + * + * @tparam VT Type of vertex id + * @tparam ET Type of edge id + * @tparam WT Type of weight + */ +template +class GraphCSC : public GraphCompressedSparseBase { + public: + /** + * @brief Default constructor + */ + GraphCSC() : GraphCompressedSparseBase() {} + + /** + * @brief Take ownership of the provided graph arrays in CSR format + * + * @param offsets This array of size V+1 (V is number of vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * edges). + * @param indices This array of size E contains the index of the destination for + * each edge. Indices must be in the range [0, V-1]. + * @param edge_data This array of size E (number of edges) contains the weight for + * each edge. This array can be null in which case the graph is considered unweighted. + * @param number_of_vertices The number of vertices in the graph + * @param number_of_edges The number of edges in the graph + */ + GraphCSC(VT number_of_vertices_, + ET number_of_edges_, + bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + : GraphCompressedSparseBase( + number_of_vertices_, number_of_edges_, has_data_, stream, mr) + { + } + + GraphCSC(GraphSparseContents &&contents) + : GraphCompressedSparseBase(contents) + { + } + + GraphCSCView view(void) noexcept + { + return GraphCSCView(GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp index eda7af120c..b9186329d3 100644 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ b/cpp/include/raft/spectral/spectral_matrix.hpp @@ -21,6 +21,8 @@ // #include // #include +#include + #include #include #include From aa058ef3aef9ae214445ba24caaf77875ca9fe88 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 13:04:57 -0500 Subject: [PATCH 07/88] Matrix replacements for nvgraph Matrix types. 
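
sparse_matrix_t is a thin, non-owning view over CSR device arrays (or over a
GraphCSRView); its mv() is still a stub here and is meant to be wired to the
cusparse csrmv primitive in a follow-up. A hypothetical usage sketch, assuming
the device pointers below point at valid CSR arrays and that the index/value
template arguments shown are the intended ones (names illustrative only):

    // Illustrative only: mv() is a TODO in this patch, so this shows the
    // intended call shape rather than a working SpMV.
    #include <cuda_runtime.h>
    #include <thrust/execution_policy.h>
    #include <raft/spectral/matrix_wrappers.hpp>

    void spmv_sketch(int const* d_row_offsets, int const* d_col_indices,
                     float const* d_values, int nnz, int nrows,
                     float const* d_x, float* d_y, cudaStream_t stream)
    {
      // Wrap pre-existing device arrays; the view does not own them.
      raft::matrix::sparse_matrix_t<int, float> A{
        d_row_offsets, d_col_indices, d_values, nnz, nrows};

      // y = 1*A*x + 0*y, once the cusparse csrmv path is implemented.
      A.mv(1.0f, d_x, 0.0f, d_y, thrust::device, stream);
    }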
--- cpp/include/raft/graph.hpp | 8 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 76 +++++++++++++++++++ 2 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/spectral/matrix_wrappers.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp index d7b1a2838a..8e72572764 100644 --- a/cpp/include/raft/graph.hpp +++ b/cpp/include/raft/graph.hpp @@ -22,8 +22,8 @@ #include #include -namespace cugraph { -namespace experimental { +namespace raft { +namespace matrix { enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; @@ -571,5 +571,5 @@ class GraphCSC : public GraphCompressedSparseBase { } }; -} // namespace experimental -} // namespace cugraph +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp new file mode 100644 index 0000000000..be6b58a8cf --- /dev/null +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace raft{ +namespace matrix { + +using size_type = int; // for now; TODO: move it in appropriate header + +template +struct sparse_matrix_t { + sparse_matrix_t(index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nnz, + index_type const nrows) : + row_offsets_(row_offsets), + col_indices_(col_indices), + values_(values), + nrows_(nrows), + nnz_(nnz) + { + } + + sparse_matrix_t(const GraphCSRView& csr_view): + row_offsets_(csr_view.offsets_), + col_indices_(csr_view.indices_), + values_(csr_view.edge_data_), + nrows_(csr_view.number_of_vertices_), + nnz_(csr_view.number_of_edges_) + { + } + + + virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types + + // y = alpha*A*x + beta*y + // + template + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, + value_type* __restrict__ y, + exe_policy_t&& policy, + cudaStream_t stream = nullptr) const + { + //TODO: call cusparse::csrmv + } + + //private: // maybe not, keep this ASAP ("as simple as possible"); hence, aggregate + + index_type const* row_offsets_; + index_type const* col_indices_; + value_type const* values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... + index_type const nrows_; + index_type const nnz_; +}; + +} // namespace matrix +} // namespace raft From 61c669c2c88a03b276fc6c87bf2a11c5e21af938 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 17:30:29 -0500 Subject: [PATCH 08/88] Refactoring of Lanczos algorithms (except AllocatableVector). More error control placeholders. More matrix wrappers. 
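
The bulk of the lanczos.hpp changes are mechanical: each nvgraph-style
early-return validation block, e.g.

    if (nEigVecs < 1) {
      WARNING("invalid parameter (nEigVecs<1)");
      return NVGRAPH_ERR_BAD_PARAMETERS;
    }

collapses into a single assertion-style check against the temporary
error macros,

    RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n,
                "Invalid number of eigenvectors.");

and the NVGRAPH_ERROR return type becomes a plain int error flag.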
--- cpp/include/raft/spectral/error_temp.hpp | 14 + cpp/include/raft/spectral/lanczos.hpp | 461 +++++++----------- cpp/include/raft/spectral/matrix_wrappers.hpp | 50 +- 3 files changed, 221 insertions(+), 304 deletions(-) diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 0cd58b1769..f8dabf994b 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -13,3 +13,17 @@ #define CUDA_TRY(call) #define CUDA_CHECK_LAST() + +#ifdef DEBUG +#define COUT() (std::cout) +#define CERR() (std::cerr) +#define WARNING(message) \ + do { \ + std::stringstream ss; \ + ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ + CERR() << ss.str() << std::endl; \ + } while (0) +#else // DEBUG +#define WARNING(message) +#endif + diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index ad49be1c05..2cc9f002d1 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,25 +14,20 @@ * limitations under the License. */ -//#ifdef NVGRAPH_PARTITION +#pragma once +//for cmath: #define _USE_MATH_DEFINES -#include -#include "include/lanczos.hxx" -#include -#include +#include #include #include #include -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_lapack.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/nvgraph_vector_kernels.hxx" +#include +#include + // ========================================================= // Useful macros // ========================================================= @@ -40,10 +35,12 @@ // Get index of matrix entry #define IDX(i, j, lda) ((i) + (j) * (lda)) -namespace nvgraph { +namespace raft { namespace { +using namespace matrix; + // ========================================================= // Helper functions // ========================================================= @@ -75,16 +72,16 @@ namespace { * @return Zero if successful. Otherwise non-zero. 
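 *
 * In exact arithmetic each step realizes the classical three-term
 * Lanczos recurrence for B = A + shift*I (the shift is folded in by
 * the mv call below):
 *
 *   B*q_j = beta_(j-1)*q_(j-1) + alpha_j*q_j + beta_j*q_(j+1)
 *
 * alpha_host/beta_host collect the diagonal and subdiagonal of the
 * resulting tridiagonal system; lanczosVecs_dev stores the q_j.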
*/ template -static int performLanczosIteration(const Matrix *A, - IndexType_ *iter, - IndexType_ maxIter, - ValueType_ shift, - ValueType_ tol, - bool reorthogonalize, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) +int performLanczosIteration(sparse_matrix_t const* A, + IndexType_ *iter, + IndexType_ maxIter, + ValueType_ shift, + ValueType_ tol, + bool reorthogonalize, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration @@ -95,7 +92,7 @@ static int performLanczosIteration(const Matrix *A, const ValueType_ negOne = -1; const ValueType_ zero = 0; - IndexType_ n = A->n; + IndexType_ n = A->nrows; // ------------------------------------------------------- // Compute second Lanczos vector @@ -105,7 +102,7 @@ static int performLanczosIteration(const Matrix *A, // Apply matrix if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync( + CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); @@ -130,7 +127,7 @@ static int performLanczosIteration(const Matrix *A, // Apply matrix if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); @@ -161,7 +158,7 @@ static int performLanczosIteration(const Matrix *A, &one, lanczosVecs_dev + IDX(0, *iter, n), 1); - CHECK_CUDA(cudaMemcpyAsync(alpha_host + (*iter - 1), + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -220,7 +217,7 @@ static int performLanczosIteration(const Matrix *A, Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); } - CHECK_CUDA(cudaDeviceSynchronize()); + CUDA_TRY(cudaDeviceSynchronize()); return 0; } @@ -558,7 +555,7 @@ static int lanczosRestart(IndexType_ n, WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CHECK_CUDA( + CUDA_TRY( cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; @@ -578,11 +575,11 @@ static int lanczosRestart(IndexType_ n, Cublas::gemm( false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); - CHECK_CUDA(cudaMemcpyAsync( + CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); // Normalize residual to obtain new Lanczos vector - CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); @@ -592,7 +589,7 @@ static int lanczosRestart(IndexType_ n, return 0; } -} // namespace +} // anonym. namespace // ========================================================= // Eigensolver @@ -642,24 +639,25 @@ static int lanczosRestart(IndexType_ n, * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
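 * @param seed Seed for the random-number generator used to draw the
 *   initial Lanczos vector (threaded through from the caller so unit
 *   tests can pin it; see the FIXME near the curand setup).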
*/ template -NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix *A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *shift, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeSmallestEigenvectors(sparse_matrix_t const* A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ *effIter, + IndexType_ *totalIter, + ValueType_ *shift, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed) { // ------------------------------------------------------- // Variable declaration @@ -670,7 +668,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->n; + IndexType_ n = A->nrows; // Shift for implicit restart ValueType_ shiftUpper; @@ -697,34 +695,12 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - if (A->m != A->n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + // ------------------------------------------------------- // Variable initialization @@ -750,15 +726,15 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * // Random number generator curandGenerator_t randGen; // Initialize random number generator - CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); // FIXME: This is hard coded, which is good for unit testing... 
// but should really be a parameter so it could be // "random" for real runs and "fixed" for tests - CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 1234567 /*time(NULL)*/)); - // CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); + CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); + // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector - CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); @@ -877,7 +853,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CHECK_CUDA(cudaMemcpy(eigVals_dev, + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); @@ -885,7 +861,7 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; //} - CHECK_CUDA(cudaMemcpy( + CUDA_TRY(cudaMemcpy( work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis @@ -904,8 +880,8 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * n); // Clean up and exit - CHECK_CURAND(curandDestroyGenerator(randGen)); - return NVGRAPH_OK; + CUDA_TRY(curandDestroyGenerator(randGen)); + return 0; } /// Compute smallest eigenvectors of symmetric matrix @@ -942,55 +918,30 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix * * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
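 * @param seed Seed for the random-number generator used to draw the
 *   initial Lanczos vector (fixed default keeps unit tests
 *   reproducible).
 * @param stream CUDA stream used for the workspace allocations.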
*/ template -NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeSmallestEigenvectors(sparse_matrix_t const& A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ &iter, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed = 1234567, + cudaStream_t stream = 0) { - // CUDA stream - // TODO: handle non-zero streams - cudaStream_t stream = 0; - // Matrix dimension - IndexType_ n = A.n; + IndexType_ n = A.nrows; // Check that parameters are valid - if (A.m != A.n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -999,27 +950,29 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & ValueType_ *alpha_host = alpha_host_v.data(); ValueType_ *beta_host = beta_host_v.data(); - Vector lanczosVecs_dev(n * (restartIter + 1), stream); - Vector work_dev((n + restartIter) * restartIter, stream); + //TODO: replace and fix allocation via RAFT handle + AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); + AllocatableVector work_dev((n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; ValueType_ shift; - NVGRAPH_ERROR status = computeSmallestEigenvectors(&A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev); + int status = computeSmallestEigenvectors(&A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -1068,23 +1021,24 @@ NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix & * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. 
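 * @param seed Seed for the random-number generator used to draw the
 *   initial Lanczos vector.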
*/ template -NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeLargestEigenvectors(sparse_matrix_t const* A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ *effIter, + IndexType_ *totalIter, + ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed) { // ------------------------------------------------------- // Variable declaration @@ -1095,7 +1049,7 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->n; + IndexType_ n = A->nrows; // Lanczos iteration counters IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system @@ -1118,34 +1072,11 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - if (A->m != A->n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // ------------------------------------------------------- // Variable initialization @@ -1171,10 +1102,10 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // Random number generator curandGenerator_t randGen; // Initialize random number generator - CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); - CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456)); + CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector - CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); @@ -1296,13 +1227,13 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A // Copy results to device memory // skip smallest eigenvalue if needed - CHECK_CUDA(cudaMemcpy(eigVals_dev, + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * 
(*effIter) + top_eigenparis_idx_offset, nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed - CHECK_CUDA(cudaMemcpy(work_dev, + CUDA_TRY(cudaMemcpy(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); @@ -1323,8 +1254,8 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A n); // Clean up and exit - CHECK_CURAND(curandDestroyGenerator(randGen)); - return NVGRAPH_OK; + CUDA_TRY(curandDestroyGenerator(randGen)); + return 0; } /// Compute largest eigenvectors of symmetric matrix @@ -1361,55 +1292,30 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix *A * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. - * @return NVGRAPH error flag. + * @return error flag. */ template -NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev) +int computeLargestEigenvectors(sparse_matrix_t const& A, + IndexType_ nEigVecs, + IndexType_ maxIter, + IndexType_ restartIter, + ValueType_ tol, + bool reorthogonalize, + IndexType_ &iter, + ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, + unsigned long long seed = 123456, + cudaStream_t stream = 0) { - // CUDA stream - // TODO: handle non-zero streams - cudaStream_t stream = 0; - // Matrix dimension - IndexType_ n = A.n; + IndexType_ n = A.nrows; // Check that parameters are valid - if (A.m != A.n) { - WARNING("invalid parameter (matrix is not square)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs < 1) { - WARNING("invalid parameter (nEigVecs<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (restartIter < 1) { - WARNING("invalid parameter (restartIter<4)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (nEigVecs > n) { - WARNING("invalid parameters (nEigVecs>n)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxIter < nEigVecs) { - WARNING("invalid parameters (maxIter 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECT(tol > 0, "Invalid tolerance."); + RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -1418,70 +1324,31 @@ NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A ValueType_ *alpha_host = alpha_host_v.data(); ValueType_ *beta_host = beta_host_v.data(); - Vector lanczosVecs_dev(n * (restartIter + 1), stream); - Vector work_dev((n + restartIter) * restartIter, stream); + //TODO: replace and fix allocation via RAFT handle + AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); + AllocatableVector work_dev((n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; - NVGRAPH_ERROR status = computeLargestEigenvectors(&A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev); + int status = computeLargestEigenvectors(&A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + 
alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - float tol, - bool reorthogonalize, - int &iter, - float *__restrict__ eigVals_dev, - float *__restrict__ eigVecs_dev); -template NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - double tol, - bool reorthogonalize, - int &iter, - double *__restrict__ eigVals_dev, - double *__restrict__ eigVecs_dev); - -template NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - float tol, - bool reorthogonalize, - int &iter, - float *__restrict__ eigVals_dev, - float *__restrict__ eigVecs_dev); -template NVGRAPH_ERROR computeLargestEigenvectors(const Matrix &A, - int nEigVecs, - int maxIter, - int restartIter, - double tol, - bool reorthogonalize, - int &iter, - double *__restrict__ eigVals_dev, - double *__restrict__ eigVecs_dev); - -} // namespace nvgraph +} // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index be6b58a8cf..9dc75fdd77 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -22,6 +22,20 @@ namespace raft{ namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header + +// Vector "view"-like aggregate for linear algebra purposes +// +template +struct vector_t { + value_type* buffer_; + size_type size_; + + vector_t(value_type* buffer, size_type sz): + buffer_(buffer), + size_(sz) + { + } +}; template struct sparse_matrix_t { @@ -38,7 +52,7 @@ struct sparse_matrix_t { { } - sparse_matrix_t(const GraphCSRView& csr_view): + sparse_matrix_t(GraphCSRView const& csr_view): row_offsets_(csr_view.offsets_), col_indices_(csr_view.indices_), values_(csr_view.edge_data_), @@ -52,18 +66,15 @@ struct sparse_matrix_t { // y = alpha*A*x + beta*y // - template - void mv(value_type alpha, + virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y, - exe_policy_t&& policy, - cudaStream_t stream = nullptr) const + value_type* __restrict__ y) const { //TODO: call cusparse::csrmv } - //private: // maybe not, keep this ASAP ("as simple as possible"); hence, aggregate + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate index_type const* row_offsets_; index_type const* col_indices_; @@ -72,5 +83,30 @@ struct sparse_matrix_t { index_type const nnz_; }; +template +struct laplacian_matrix_t : sparse_matrix_t { + laplacian_matrix_t(index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + vector_t&& diagonal, + index_type const nnz, + index_type const nrows) : + sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), + diagonal_(diagonal) + { + } + + laplacian_matrix_t(GraphCSRView const& csr_view, vector_t&& diagonal): + sparse_matrix_t(csr_view), + diagonal_(diagonal) + { + } + + vector_t diagonal_; +}; + +template +using modularity_matrix_t = laplacian_matrix_t; // for now; TODO: if it turns out modularity matrix actually behaves differently than Laplacian matrix, this should be 
made a separate class; + } // namespace matrix } // namespace raft From 9d71cfc76dd9c09adfa1516578d721164f95dbc9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 4 Jun 2020 17:42:21 -0500 Subject: [PATCH 09/88] LAPACK dependencies. --- cpp/include/raft/spectral/lapack.hpp | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 cpp/include/raft/spectral/lapack.hpp diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp new file mode 100644 index 0000000000..430a7d5144 --- /dev/null +++ b/cpp/include/raft/spectral/lapack.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { + +template +class Lapack +{ +private: + Lapack(); + ~Lapack(); +public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); + + // special QR for lanczos + static void sterf(int n, T * d, T * e); + static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. + //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q + static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void qrf (int n, T *H, T *Q, T *R); + + //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); + +}; + + + +} // namespace raft From 22c9f49ddba2bb2307ee7919cb5a607bc4288037 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 16:40:57 -0500 Subject: [PATCH 10/88] Added allocation functionality via raft handle. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 126 +++++++++++++++--- 1 file changed, 111 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 9dc75fdd77..5949895178 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -16,7 +16,9 @@ #pragma once #include -#include +#include // ? 
+#include + namespace raft{ namespace matrix { @@ -26,15 +28,67 @@ using size_type = int; // for now; TODO: move it in appropriate header // Vector "view"-like aggregate for linear algebra purposes // template -struct vector_t { +struct vector_view_t { value_type* buffer_; size_type size_; - vector_t(value_type* buffer, size_type sz): + vector_view_t(value_type* buffer, size_type sz): buffer_(buffer), size_(sz) { } + + vector_view_t(vector_view_t&& other): + buffer_(other.buffer_), + size_(other.size_) + { + other.buffer_ = nullptr; + other.size_ = 0; + } + + vector_view_t& operator = (vector_view_t&& other) + { + buffer_ = other.buffer_; + size_ = other.size_; + + other.buffer_ = nullptr; + other.size_ = 0; + } +}; + +// allocatable vector, using raft handle allocator +// +template +class vector_t { + handle_t const& handle_; + value_type* buffer_; + size_type size_; + cudaStream_t stream_; +public: + + vector_t(handle_t const& raft_handle, size_type sz, cudaStream_t stream = 0): + handle_(raft_handle), + buffer_(static_cast(raft_handle.get_device_allocator()->allocate(sz*sizeof(value_type), stream))), + size_(sz), + stream_(stream) + { + } + + virtual ~vector_t(void) + { + handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); + } + + size_type size(void) const + { + return size_; + } + +protected: + value_type* buffer(void) + { + return buffer_; + } }; template @@ -84,29 +138,71 @@ struct sparse_matrix_t { }; template -struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(index_type const* row_offsets, +struct laplacian_matrix_t : sparse_matrix_t, vector_t { + laplacian_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, index_type const* col_indices, value_type const* values, - vector_t&& diagonal, + index_type const nrows, index_type const nnz, - index_type const nrows) : - sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), - diagonal_(diagonal) + cudaStream_t stream = 0) : + sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), + vector_t(raft_handle, nrows, stream) { + auto* v = vector_t::buffer(); } - laplacian_matrix_t(GraphCSRView const& csr_view, vector_t&& diagonal): - sparse_matrix_t(csr_view), - diagonal_(diagonal) + laplacian_matrix_t(handle_t const& raft_handle, + GraphCSRView const& csr_view, + cudaStream_t stream = 0): + sparse_matrix_t(csr_view), + vector_t(raft_handle, csr_view.number_of_vertices_, stream) { } - - vector_t diagonal_; + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, + value_type* __restrict__ y) const override + { + //TODO: call cusparse::csrmv + } }; template -using modularity_matrix_t = laplacian_matrix_t; // for now; TODO: if it turns out modularity matrix actually behaves differently than Laplacian matrix, this should be made a separate class; +struct modularity_matrix_t: laplacian_matrix_t +{ + modularity_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz, + cudaStream_t stream = 0) : + laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) + { + auto* v = vector_t::buffer(); + } + + modularity_matrix_t(handle_t const& raft_handle, + GraphCSRView const& csr_view, + cudaStream_t stream = 0): + laplacian_matrix_t(raft_handle, csr_view, stream) + { + } + + // y = alpha*A*x + beta*y + // + void mv(value_type alpha, + value_type const* __restrict__ x, + value_type beta, 
+ value_type* __restrict__ y) const override + { + //TODO: call cusparse::csrmv + } +}; } // namespace matrix } // namespace raft From 0f3617b976601f4061644a93753f1bbbef21f3c9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 17:24:10 -0500 Subject: [PATCH 11/88] Fixed allocator dependencies in Lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 15 +++++++++------ cpp/include/raft/spectral/matrix_wrappers.hpp | 19 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 2cc9f002d1..f83652c157 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -27,6 +27,7 @@ #include #include +#include // ========================================================= // Useful macros @@ -921,7 +922,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A * @return error flag. */ template -int computeSmallestEigenvectors(sparse_matrix_t const& A, +int computeSmallestEigenvectors(handle_t handle, + sparse_matrix_t const& A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -951,8 +953,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const& A ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); - AllocatableVector work_dev((n + restartIter) * restartIter, stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; @@ -1295,7 +1297,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, * @return error flag. */ template -int computeLargestEigenvectors(sparse_matrix_t const& A, +int computeLargestEigenvectors(handle_t handle, + sparse_matrix_t const& A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -1325,8 +1328,8 @@ int computeLargestEigenvectors(sparse_matrix_t const& A, ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - AllocatableVector lanczosVecs_dev(n * (restartIter + 1), stream); - AllocatableVector work_dev((n + restartIter) * restartIter, stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); + vector_t work_dev(handle, (n + restartIter) * restartIter, stream); // Perform Lanczos method IndexType_ effIter; diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 5949895178..f3fb509e12 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -74,7 +74,7 @@ class vector_t { { } - virtual ~vector_t(void) + ~vector_t(void) { handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); } @@ -84,8 +84,7 @@ class vector_t { return size_; } -protected: - value_type* buffer(void) + value_type* raw(void) { return buffer_; } @@ -138,7 +137,7 @@ struct sparse_matrix_t { }; template -struct laplacian_matrix_t : sparse_matrix_t, vector_t { +struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, @@ -147,16 +146,17 @@ struct laplacian_matrix_t : sparse_matrix_t, vector_t(row_offsets,col_indices,values,nrows,nnz), - vector_t(raft_handle, nrows, stream) + diagonal_(raft_handle, nrows, stream) { - auto* v = vector_t::buffer(); + auto* v = diagonal_.raw(); + //TODO: more work, 
here... } laplacian_matrix_t(handle_t const& raft_handle, GraphCSRView const& csr_view, cudaStream_t stream = 0): sparse_matrix_t(csr_view), - vector_t(raft_handle, csr_view.number_of_vertices_, stream) + diagonal_(raft_handle, csr_view.number_of_vertices_, stream) { } @@ -169,6 +169,8 @@ struct laplacian_matrix_t : sparse_matrix_t, vector_t diagonal_; }; template @@ -183,7 +185,8 @@ struct modularity_matrix_t: laplacian_matrix_t cudaStream_t stream = 0) : laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) { - auto* v = vector_t::buffer(); + auto* v = laplacian_matrix_t::diagonal_.raw(); + //TODO: more work, here... } modularity_matrix_t(handle_t const& raft_handle, From ee3102ba4516ceb4cc42531323800b849713d125 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 5 Jun 2020 20:58:58 -0500 Subject: [PATCH 12/88] Lapack dependencies. --- cpp/include/raft/spectral/error_temp.hpp | 2 + cpp/include/raft/spectral/lapack.hpp | 442 +++++++++++++++++++++++ 2 files changed, 444 insertions(+) diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index f8dabf994b..82beb75640 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -8,6 +8,8 @@ #define RAFT_TRY(error_expression) +//assume RAFT_FAIL() can take a std::string `reason` +// #define RAFT_FAIL(reason) #define CUDA_TRY(call) diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 430a7d5144..d86343990d 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -17,9 +17,87 @@ #pragma once #include +#include +#include + +//for now; TODO: check if/where this `define` should be; +// +#define USE_LAPACK namespace raft { +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + RAFT_FAIL(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ + } + + +extern "C" void sgeqrf_( + int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); +extern "C" void dgeqrf_( + int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, + char *trans, + int *m, + int *n, + int *k, + float *a, + int *lda, + const float *tau, + float *c, + int *ldc, + float *work, + int *lwork, + int *info); +extern "C" void dormqr_(char *side, + char *trans, + int *m, + int *n, + int *k, + double *a, + int *lda, + const double *tau, + double *c, + int *ldc, + double *work, + int *lwork, + int *info); +extern "C" int dgeev_(char *jobvl, + char *jobvr, + int *n, + double *a, + int *lda, + double *wr, + double *wi, + double *vl, + int *ldvl, + double *vr, + int *ldvr, + double *work, + int *lwork, + int *info); + +extern "C" int sgeev_(char *jobvl, + char *jobvr, + int *n, + float *a, + int *lda, + float *wr, + float *wi, + float *vl, + int *ldvl, + float *vr, + int *ldvr, + float *work, + int *lwork, + int *info); + + template class Lapack { @@ -50,8 +128,372 @@ class Lapack static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); +private: + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float *a, + int lda, + const float *b, + 
int ldb,
+ float beta,
+ float *c,
+ int ldc)
+ {
+ cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cusolverDnSgemmHost(
+ cublas_transa, cublas_transb, m, n, k, &alpha, (float *)a, lda, (float *)b, ldb, &beta, c, ldc);
+ }
+
+ static void lapack_gemm(const signed char transa,
+ const signed char transb,
+ int m,
+ int n,
+ int k,
+ double alpha,
+ const double *a,
+ int lda,
+ const double *b,
+ int ldb,
+ double beta,
+ double *c,
+ int ldc)
+ {
+ cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+ cusolverDnDgemmHost(cublas_transa,
+ cublas_transb,
+ m,
+ n,
+ k,
+ &alpha,
+ (double *)a,
+ lda,
+ (double *)b,
+ ldb,
+ &beta,
+ c,
+ ldc);
+ }
+
+
+ static void lapack_sterf(int n, float *d, float *e, int *info)
+ {
+ cusolverDnSsterfHost(n, d, e, info);
+ }
+
+ static void lapack_sterf(int n, double *d, double *e, int *info)
+ {
+ cusolverDnDsterfHost(n, d, e, info);
+ }
+
+ static void lapack_steqr(const signed char compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info)
+ {
+ cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info);
+ }
+
+ static void lapack_steqr(const signed char compz, int n, double *d, double *e, double *z, int ldz, double *work, int *info)
+ {
+ cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info);
+ }
+
+ static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info)
+ {
+ sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
+ }
+
+ static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info)
+ {
+ dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
+ }
+
+ static void lapack_ormqr(char side,
+ char trans,
+ int m,
+ int n,
+ int k,
+ float *a,
+ int lda,
+ float *tau,
+ float *c,
+ int ldc,
+ float *work,
+ int *lwork,
+ int *info)
+ {
+ sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
+ }
+
+ static void lapack_ormqr(char side,
+ char trans,
+ int m,
+ int n,
+ int k,
+ double *a,
+ int lda,
+ double *tau,
+ double *c,
+ int ldc,
+ double *work,
+ int *lwork,
+ int *info)
+ {
+ dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
+ }
+
+ static int lapack_geev_dispatch(char *jobvl,
+ char *jobvr,
+ int *n,
+ double *a,
+ int *lda,
+ double *wr,
+ double *wi,
+ double *vl,
+ int *ldvl,
+ double *vr,
+ int *ldvr,
+ double *work,
+ int *lwork,
+ int *info)
+ {
+ return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
+ }
+
+ static int lapack_geev_dispatch(char *jobvl,
+ char *jobvr,
+ int *n,
+ float *a,
+ int *lda,
+ float *wr,
+ float *wi,
+ float *vl,
+ int *ldvl,
+ float *vr,
+ int *ldvr,
+ float *work,
+ int *lwork,
+ int *info)
+ {
+ return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
+ }
+
+ // real eigenvalues
+ static
+ void lapack_geev(T *A, T *eigenvalues, int dim, int lda)
+ {
+ char job = 'N';
+ std::vector WI(dim);
+ int ldv = 1;
+ T *vl = 0;
+ int work_size = 6 * dim;
+ std::vector work(work_size);
+ int info;
+ lapack_geev_dispatch(&job,
+ &job,
+ &dim,
+ A,
+ &lda,
+ eigenvalues,
+ WI.data(),
+ vl,
+ &ldv,
+ vl,
+ &ldv,
+ work.data(),
+ &work_size,
+ &info);
+ lapackCheckError(info);
+ }
+
+ // real eigenpairs
+ static
+ void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; + std::vector WI(dim); + int work_size = 6 * dim; + T *vl = 0; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); + lapackCheckError(info); + } + + // complex eigenpairs + static + void lapack_geev(T *A, + T *eigenvalues_r, + T *eigenvalues_i, + T *eigenvectors_r, + T *eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; + int work_size = 8 * dim; + int ldvl = 1; + std::vector work(work_size); + int info; + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); + lapackCheckError(info); + } + }; +template +void Lapack::check_lapack_enabled() +{ +#ifndef USE_LAPACK + RAFT_FAIL("Error: LAPACK not enabled."); +#endif +} + +template +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T *A, + int lda, + const T *B, + int ldb, + T beta, + T *C, + int ldc) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + const char transA_char = transa ? 'T' : 'N'; + const char transB_char = transb ? 'T' : 'N'; + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + //#endif +} + +template +void Lapack::sterf(int n, T *d, T *e) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_sterf(n, d, e, &info); + lapackCheckError(info); + //#endif +} + +template +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) +{ + // check_lapack_enabled(); + //#ifdef NVGRAPH_USE_LAPACK + int info; + lapack_steqr(compz, n, d, e, z, ldz, work, &info); + lapackCheckError(info); + //#endif +} +template +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + int info; + lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); + lapackCheckError(info); +#endif +} +template +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T *a, + int lda, + T *tau, + T *c, + int ldc, + T *work, + int *lwork) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + char side = right_side ? 'R' : 'L'; + char trans = transq ? 'T' : 'N'; + int info; + lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); + lapackCheckError(info); +#endif +} + +// real eigenvalues +template +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, dim, lda); +#endif +} +// real eigenpairs +template +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); +#endif +} +// complex eigenpairs +template +void Lapack::geev(T *A, + T *eigenvalues_r, + T *eigenvalues_i, + T *eigenvectors_r, + T *eigenvectors_i, + int dim, + int lda, + int ldvr) +{ + check_lapack_enabled(); +#ifdef USE_LAPACK + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); +#endif +} } // namespace raft From b0c12aaf430b47739566de164d8a0746824c47aa Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 14:50:33 -0500 Subject: [PATCH 13/88] Added missing cusparse API. 
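
Usage sketch for the csrmv wrapper added below (hypothetical call site;
handle and matrix-descriptor setup elided, alpha/beta passed as host
pointers under the default pointer mode):

    // y = alpha*A*x + beta*y for an m x n CSR matrix with nnz nonzeros
    float alpha = 1.0f;
    float beta = 0.0f;
    CUSPARSE_CHECK(raft::sparse::cusparsecsrmv(
      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, nnz, &alpha, descr,
      csr_val, csr_row_ptr, csr_col_ind, x, &beta, y, stream));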
--- cpp/include/raft/sparse/cusparse_wrappers.h | 168 +++++++++++++++++++- 1 file changed, 166 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1c63d2348b..1853b82b07 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -146,21 +146,185 @@ inline void cusparsecoosortByRow( // NOLINT * @defgroup Gemmi cusparse gemmi operations * @{ */ +template +cusparseStatus_t cusparsegemmi( // NOLINT + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, const T* A, int lda, const T* cscValB, const int* cscColPtrB, const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); +template <> inline cusparseStatus_t cusparsegemmi( cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc) { + const int* cscRowIndB, const float* beta, float* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } +template<> inline cusparseStatus_t cusparsegemmi( cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc) { + const int* cscRowIndB, const double* beta, double* C, int ldc, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ +/** + * @defgroup Csrmv cusparse csrmv operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmv( // NOLINT + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); +} +template <> +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); +} +/** @} */ + +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +template +cusparseStatus_t cusparsecsrmm( // NOLINT + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + 
const T* beta, + T* y, + const int ldy, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +template <> +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); +} +/** @} */ + +/** + * @defgroup csr2coo cusparse CSR to COO converter methods + * @{ + */ +template +void cusparsecsr2coo( // NOLINT + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); +template <> +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ }; // namespace sparse }; // namespace raft From 001eec838d2ff5adbc49366e168fa765b07d5661 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 15:30:56 -0500 Subject: [PATCH 14/88] Added cusparsesetpointermode. --- cpp/include/raft/sparse/cusparse_wrappers.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 1853b82b07..25e7146316 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -325,6 +325,27 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ + +/** + * @defgroup setpointermode cusparse set pointer mode method + * @{ + */ +// no T dependency... +// template +// cusparseStatus_t cusparsesetpointermode( // NOLINT +// cusparseHandle_t handle, +// cusparsePointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, + cusparsePointerMode_t mode, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSetPointerMode(handle, mode); +} +/** @} */ }; // namespace sparse }; // namespace raft From 48e9d09fe418862b4e75e0fd5aa5868255771e7f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 16:04:16 -0500 Subject: [PATCH 15/88] Added setpointer mode to cusparse, cublas and clang-formatted. 
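
Context for the new set-pointer-mode helpers: cublas and cusparse read
scalar arguments (the alpha/beta above) from either host or device
memory depending on the handle's pointer mode, so callers mixing the
two need to set it explicitly. A sketch:

    // scalars passed as host pointers (the common case here):
    raft::sparse::cusparsesetpointermode(
      handle, CUSPARSE_POINTER_MODE_HOST, stream);

    // scalars read from device memory (e.g. computed by a prior kernel):
    raft::linalg::cublassetpointermode(
      handle, CUBLAS_POINTER_MODE_DEVICE, stream);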
--- cpp/include/raft/linalg/cublas_wrappers.h | 20 ++ cpp/include/raft/sparse/cusparse_wrappers.h | 203 +++++++------------- 2 files changed, 88 insertions(+), 135 deletions(-) diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index cd8a508a84..5b09a792ef 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -542,5 +542,25 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, } /** @} */ +/** + * @defgroup setpointermode cublas set pointer mode method + * @{ + */ +// no T dependency... +// template +// cublasStatus_t cublassetpointermode( // NOLINT +// cublasHandle_t handle, +// cublasPointerMode_t mode, +// cudaStream_t stream); + +// template<> +inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, + cublasPointerMode_t mode, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSetPointerMode(handle, mode); +} +/** @} */ + }; // namespace linalg }; // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 25e7146316..865f93843d 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -148,21 +148,29 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, const T* A, int lda, const T* cscValB, const int* cscColPtrB, const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, + const T* A, int lda, const T* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, - const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, float* C, int ldc, cudaStream_t stream) { +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const float* alpha, + const float* A, int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const float* beta, + float* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); } -template<> -inline cusparseStatus_t cusparsegemmi( - cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, - const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, double* C, int ldc, cudaStream_t stream) { +template <> +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const double* alpha, + const double* A, int lda, + const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, const double* beta, + double* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); @@ -174,60 +182,30 @@ inline cusparseStatus_t cusparsegemmi( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const T* alpha, - const 
cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const T* beta, - T* y, - cudaStream_t stream); + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const float* beta, - float* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, + float* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const double* beta, - double* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, const double* x, + const double* beta, double* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); -} + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); +} /** @} */ /** @@ -236,68 +214,31 @@ inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const T* alpha, - const cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const int ldx, - const T* beta, - T* y, - const int ldy, - cudaStream_t stream); + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, + const T* beta, T* y, const int ldy, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const int ldx, - const float* beta, - float* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const float* alpha, const cusparseMatDescr_t 
descr, + const float* csrVal, const int* csrRowPtr, const int* csrColInd, + const float* x, const int ldx, const float* beta, float* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const int ldx, - const double* beta, - double* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const double* alpha, const cusparseMatDescr_t descr, + const double* csrVal, const int* csrRowPtr, const int* csrColInd, + const double* x, const int ldx, const double* beta, double* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ @@ -307,25 +248,18 @@ inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, - const int n, - const int nnz, - const T* csrRowPtr, - T* cooRowInd, - cudaStream_t stream); + cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, + T* cooRowInd, cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, - const int n, - const int nnz, - const int* csrRowPtr, - int* cooRowInd, - cudaStream_t stream) -{ +inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, + const int* csrRowPtr, int* cooRowInd, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, + CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ - + /** * @defgroup setpointermode cusparse set pointer mode method * @{ @@ -340,12 +274,11 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) -{ + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSetPointerMode(handle, mode); + return cusparseSetPointerMode(handle, mode); } - /** @} */ + }; // namespace sparse }; // namespace raft From e311c3e34f975c1bea632e01a96b30d45b1f35b9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 9 Jun 2020 18:59:28 -0500 Subject: [PATCH 16/88] Added sscal to cublas wrappers. 
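The new cublasscal follows the same pattern as the existing wrappers: bind the
stream, then dispatch on the value type to cublasSscal/cublasDscal. The call
sequence it enables, and which the Lanczos changes that follow rely on, is
in-place vector normalization; a minimal sketch, assuming host pointer mode is
set and the vector is nonzero:

    #include <raft/linalg/cublas_wrappers.h>

    template <typename T>
    void normalize_inplace(cublasHandle_t h, int n, T *x, cudaStream_t s) {
      T nrm{};
      CUBLAS_CHECK(raft::linalg::cublasnrm2(h, n, x, 1, &nrm, s));  // ||x||_2
      T inv = 1 / nrm;
      CUBLAS_CHECK(raft::linalg::cublasscal(h, n, &inv, x, 1, s));  // x *= inv
    }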
--- cpp/include/raft/linalg/cublas_wrappers.h | 26 +++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 5b09a792ef..5d80b62458 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -562,5 +562,31 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
 }
 /** @} */
 
+/**
+ * @defgroup scal cublas scal calls
+ * @{
+ */
+template <typename T>
+cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
+                          int incx, cudaStream_t stream);
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const float *alpha, float *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasSscal(handle, n, alpha, x, incx);
+}
+
+template <>
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const double *alpha, double *x, int incx,
+                                 cudaStream_t stream) {
+  CUBLAS_CHECK(cublasSetStream(handle, stream));
+  return cublasDscal(handle, n, alpha, x, incx);
+}
+
+/** @} */
+
 };  // namespace linalg
 };  // namespace raft

From 022948aab6ef9c27613137a11d5c831680215354 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer 
Date: Tue, 9 Jun 2020 20:51:55 -0500
Subject: [PATCH 17/88] Updated cublas calls in Lanczos (partially).

---
 cpp/include/raft/spectral/lanczos.hpp | 246 +++++++++++++++-----------
 1 file changed, 144 insertions(+), 102 deletions(-)

diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp
index f83652c157..c375ebbb3e 100644
--- a/cpp/include/raft/spectral/lanczos.hpp
+++ b/cpp/include/raft/spectral/lanczos.hpp
@@ -25,9 +25,11 @@
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
-#include 
+
 // =========================================================
 // Useful macros
@@ -73,7 +75,8 @@ using namespace matrix;
 * @return Zero if successful. Otherwise non-zero.
*/ template -int performLanczosIteration(sparse_matrix_t const* A, +int performLanczosIteration(handle_t handle, + sparse_matrix_t const* A, IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, @@ -93,8 +96,13 @@ int performLanczosIteration(sparse_matrix_t const* A, const ValueType_ negOne = -1; const ValueType_ zero = 0; - IndexType_ n = A->nrows; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + RAFT_EXPECT( A != nullptr, "Null matrix pointer."); + IndexType_ n = A->nrows; + // ------------------------------------------------------- // Compute second Lanczos vector // ------------------------------------------------------- @@ -103,20 +111,22 @@ int performLanczosIteration(sparse_matrix_t const* A, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - Cublas::dot(n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host); - Cublas::axpy(n, -alpha_host[0], lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1); - beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, 1, n), 1); + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + + auto alpha = -alpha_host[0]; + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[0], lanczosVecs_dev + IDX(0, 1, n), 1); + alpha = 1 / beta_host[0]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -131,91 +141,115 @@ int performLanczosIteration(sparse_matrix_t const* A, CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(ValueType_), - cudaMemcpyDeviceToHost)); - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, 
n), - 1); + work_dev + (*iter - 1), + sizeof(ValueType_), + cudaMemcpyDeviceToHost, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - Cublas::dot(n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1)); - Cublas::axpy(n, - -alpha_host[*iter - 1], - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - Cublas::axpy(n, - -beta_host[*iter - 2], - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); + + auto alpha = -alpha_host[*iter - 1]; + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + alpha = -beta_host[*iter - 2]; + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - beta_host[*iter - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, *iter, n), 1); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; + // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); + alpha = 1 / beta_host[*iter - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaDeviceSynchronize()); @@ -557,10 +591,10 @@ static int lanczosRestart(IndexType_ n, // Obtain new residual CUDA_TRY( - cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice)); + cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - Cublas::gemv(false, + cublasgemv(false, n, iter, beta_host + iter_new - 1, @@ -573,19 +607,18 @@ static int lanczosRestart(IndexType_ n, 1); // Obtain new Lanczos vectors - Cublas::gemm( + cublasgemm( false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice)); - beta_host[iter_new - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); - Cublas::scal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); + cudaMemcpyDeviceToDevice, stream)); + beta_host[iter_new - 1] = cublasnrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); + cublasscal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); 
return 0; } @@ -643,7 +676,8 @@ static int lanczosRestart(IndexType_ n, * @return error flag. */ template -int computeSmallestEigenvectors(sparse_matrix_t const* A, +int computeSmallestEigenvectors(handle_t handle, + sparse_matrix_t const* A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -718,7 +752,7 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A work_host = work_host_v.data(); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + cublasset_pointer_mode_host(); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -736,8 +770,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); - Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); + ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); + cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). @@ -749,7 +783,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -773,7 +808,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Obtain tridiagonal matrix with Lanczos *effIter = 0; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -819,7 +855,8 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -866,7 +903,7 @@ int computeSmallestEigenvectors(sparse_matrix_t const* A work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - Cublas::gemm(false, + cublasgemm(false, false, n, nEigVecs, @@ -959,7 +996,8 @@ int computeSmallestEigenvectors(handle_t handle, // Perform Lanczos method IndexType_ effIter; ValueType_ shift; - int status = computeSmallestEigenvectors(&A, + int status = computeSmallestEigenvectors(handle, + &A, nEigVecs, maxIter, restartIter, @@ -1026,7 +1064,8 @@ int computeSmallestEigenvectors(handle_t handle, * @return error flag. 
*/ template -int computeLargestEigenvectors(sparse_matrix_t const* A, +int computeLargestEigenvectors(handle_t handle, + sparse_matrix_t const* A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, @@ -1095,7 +1134,7 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, work_host = work_host_v.data(); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + cublasset_pointer_mode_host(); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1108,8 +1147,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1); - Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1); + ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); + cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). @@ -1123,7 +1162,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, ValueType_ shift_val = 0.0; ValueType_ *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -1169,7 +1209,8 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(A, + status = performLanczosIteration(handle, + A, effIter, maxIter_curr, *shift, @@ -1241,7 +1282,7 @@ int computeLargestEigenvectors(sparse_matrix_t const* A, cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis - Cublas::gemm(false, + cublasgemm(false, false, n, nEigVecs, @@ -1333,7 +1374,8 @@ int computeLargestEigenvectors(handle_t handle, // Perform Lanczos method IndexType_ effIter; - int status = computeLargestEigenvectors(&A, + int status = computeLargestEigenvectors(handle, + &A, nEigVecs, maxIter, restartIter, From 43d3f78f9200cd88a94242b6729b991885ef84af Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 10 Jun 2020 12:31:26 -0500 Subject: [PATCH 18/88] Updated cublas depends. in Lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 698 ++++++++++---------------- 1 file changed, 260 insertions(+), 438 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index c375ebbb3e..938c4421ab 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -25,11 +25,10 @@ #include #include -#include #include -#include +#include #include - +#include // ========================================================= // Useful macros @@ -43,7 +42,7 @@ namespace raft { namespace { using namespace matrix; - + // ========================================================= // Helper functions // ========================================================= @@ -75,34 +74,28 @@ using namespace matrix; * @return Zero if successful. Otherwise non-zero. 
*/ template -int performLanczosIteration(handle_t handle, - sparse_matrix_t const* A, - IndexType_ *iter, - IndexType_ maxIter, - ValueType_ shift, - ValueType_ tol, - bool reorthogonalize, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) -{ +int performLanczosIteration( + handle_t handle, sparse_matrix_t const *A, + IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, + bool reorthogonalize, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ negOne = -1; - const ValueType_ zero = 0; + const ValueType_ zero = 0; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - RAFT_EXPECT( A != nullptr, "Null matrix pointer."); + RAFT_EXPECT(A != nullptr, "Null matrix pointer."); IndexType_ n = A->nrows; - + // ------------------------------------------------------- // Compute second Lanczos vector // ------------------------------------------------------- @@ -111,22 +104,29 @@ int performLanczosIteration(handle_t handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, + stream)); auto alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, + beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), + 1, stream)); } // ------------------------------------------------------- @@ -138,118 +138,65 @@ int performLanczosIteration(handle_t handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, + lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) 
{ - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(ValueType_), - cudaMemcpyDeviceToHost, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), + sizeof(ValueType_), cudaMemcpyDeviceToHost, + stream)); + + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, + alpha_host + (*iter - 1), stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, + beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; - + // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaDeviceSynchronize()); @@ -273,8 +220,7 @@ int performLanczosIteration(handle_t handle, * matrix. Matrix dimensions are 3 x 3. 
*/ template -static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) -{ +static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -284,7 +230,8 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) v[0] -= *Pv; // Normalize Householder vector - ValueType_ normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + ValueType_ normHouseholder = + std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -312,8 +259,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) -{ +static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { // Loop indices IndexType_ i, j; // Dot product between Householder vector and matrix row/column @@ -353,14 +299,10 @@ static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(IndexType_ n, - ValueType_ shift1, - ValueType_ shift2, - ValueType_ *alpha, - ValueType_ *beta, - ValueType_ *V, - ValueType_ *work) -{ +static int francisQRIteration(IndexType_ n, ValueType_ shift1, + ValueType_ shift2, ValueType_ *alpha, + ValueType_ *beta, ValueType_ *V, + ValueType_ *work) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -390,7 +332,8 @@ static int francisQRIteration(IndexType_ n, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); + findHouseholder3(householder, &temp, + householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(ValueType_)); @@ -400,13 +343,14 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, + 0, work, n); memcpy(V, work, 3 * n * sizeof(ValueType_)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; @@ -421,16 +365,17 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); + findHouseholder3(householder, beta + pos, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = 
bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; @@ -443,30 +388,32 @@ static int francisQRIteration(IndexType_ n, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); + findHouseholder3(householder, beta + n - 4, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); + findHouseholder3(householder, beta + n - 3, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } @@ -501,26 +448,23 @@ static int francisQRIteration(IndexType_ n, * Workspace. */ template -static int lanczosRestart(IndexType_ n, - IndexType_ iter, - IndexType_ iter_new, - ValueType_ *shiftUpper, - ValueType_ *shiftLower, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ V_host, - ValueType_ *__restrict__ work_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - bool smallest_eig) -{ +static int lanczosRestart( + handle_t handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, + ValueType_ *shiftUpper, ValueType_ *shiftLower, + ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, + ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, + ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, + bool smallest_eig) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants const ValueType_ zero = 0; - const ValueType_ one = 1; + const ValueType_ one = 1; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // Loop index IndexType_ i; @@ -578,52 +522,54 @@ static int lanczosRestart(IndexType_ n, // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = + cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration( - iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + if 
(francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, + beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY( - cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - cublasgemv(false, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1); + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), + cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = + beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, + n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), 1, stream)); // Obtain new Lanczos vectors - cublasgemm( - false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n); + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, + &one, lanczosVecs_dev, n, V_dev, iter, &zero, + work_dev, n, stream)); - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + n * iter_new * sizeof(ValueType_), + cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice, stream)); - beta_host[iter_new - 1] = cublasnrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1); - cublasscal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, + beta_host + iter_new - 1, stream)); + + auto h_beta = 1 / beta_host[iter_new - 1]; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, + lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } -} // anonym. namespace +} // namespace // ========================================================= // Eigensolver @@ -676,30 +622,20 @@ static int lanczosRestart(IndexType_ n, * @return error flag. 
*/ template -int computeSmallestEigenvectors(handle_t handle, - sparse_matrix_t const* A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *shift, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeSmallestEigenvectors( + handle_t handle, sparse_matrix_t const *A, + IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, + ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, + IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; // Matrix dimension @@ -730,12 +666,14 @@ int computeSmallestEigenvectors(handle_t handle, // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); - + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -748,11 +686,12 @@ int computeSmallestEigenvectors(handle_t handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - cublasset_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -769,9 +708,13 @@ int computeSmallestEigenvectors(handle_t handle, CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); // Initialize initial Lanczos vector - CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); - cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); + CUDA_TRY( + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + ValueType_ normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
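/* Editorial sketch, not part of the diff: why a shift is computed first.
 * performLanczosIteration applies the shifted operator (see the
 * A->mv(1, x, shift, y) calls, i.e. y = A*x + shift*x), so the method
 * actually runs on A + shift*I. Picking the shift from the dominant
 * eigenvalue found in this first pass turns the smallest eigenvalues of A
 * into the largest-magnitude eigenvalues of the shifted operator, which
 * Lanczos resolves first; the later `work_host[...] -= *shift` maps the
 * Ritz values back to eigenvalues of A. */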
@@ -782,18 +725,10 @@ int computeSmallestEigenvectors(handle_t handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0.0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + *shift = 0; + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -808,17 +743,9 @@ int computeSmallestEigenvectors(handle_t handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -835,18 +762,9 @@ int computeSmallestEigenvectors(handle_t handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - true); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -855,17 +773,9 @@ int computeSmallestEigenvectors(handle_t handle, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -876,14 +786,12 @@ int computeSmallestEigenvectors(handle_t handle, } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(ValueType_)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(ValueType_)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // Obtain desired eigenvalues by applying shift @@ -891,31 +799,20 @@ int computeSmallestEigenvectors(handle_t handle, for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter), - nEigVecs * sizeof(ValueType_), - cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // for (int i = 0; i < nEigVecs; ++i) //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; 
//}
-  CUDA_TRY(cudaMemcpy(
-    work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice));
+  CUDA_TRY(cudaMemcpy(work_dev, Z_host,
+                      (*effIter) * nEigVecs * sizeof(ValueType_),
+                      cudaMemcpyHostToDevice));
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  cublasgemm(false,
-             false,
-             n,
-             nEigVecs,
-             *effIter,
-             &one,
-             lanczosVecs_dev,
-             n,
-             work_dev,
-             *effIter,
-             &zero,
-             eigVecs_dev,
-             n);
+  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
+                          *effIter, &one, lanczosVecs_dev, n, work_dev,
+                          *effIter, &zero, eigVecs_dev, n, stream));
 
   // Clean up and exit
   CUDA_TRY(curandDestroyGenerator(randGen));
@@ -959,24 +856,17 @@ int computeSmallestEigenvectors(handle_t handle,
 * @return error flag.
 */
 template <typename IndexType_, typename ValueType_>
-int computeSmallestEigenvectors(handle_t handle,
-                                sparse_matrix_t<IndexType_, ValueType_> const& A,
-                                IndexType_ nEigVecs,
-                                IndexType_ maxIter,
-                                IndexType_ restartIter,
-                                ValueType_ tol,
-                                bool reorthogonalize,
-                                IndexType_ &iter,
-                                ValueType_ *__restrict__ eigVals_dev,
-                                ValueType_ *__restrict__ eigVecs_dev,
-                                unsigned long long seed = 1234567,
-                                cudaStream_t stream = 0)
-{
+int computeSmallestEigenvectors(
+  handle_t handle, sparse_matrix_t<IndexType_, ValueType_> const &A,
+  IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter,
+  ValueType_ tol, bool reorthogonalize, IndexType_ &iter,
+  ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev,
+  unsigned long long seed = 1234567) {
   // Matrix dimension
   IndexType_ n = A.nrows;
 
   // Check that parameters are valid
-  RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors.");
+  RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECT(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECT(tol > 0, "Invalid tolerance.");
   RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -987,32 +877,20 @@ int computeSmallestEigenvectors(handle_t handle,
   std::vector<ValueType_> beta_host_v(restartIter);
 
   ValueType_ *alpha_host = alpha_host_v.data();
-  ValueType_ *beta_host = beta_host_v.data();
+  ValueType_ *beta_host = beta_host_v.data();
+
+  auto stream = handle.get_stream();
 
   //TODO: replace and fix allocation via RAFT handle
   vector_t<ValueType_> lanczosVecs_dev(handle, n * (restartIter + 1), stream);
-  vector_t<ValueType_> work_dev(handle, (n + restartIter) * restartIter, stream);
+  vector_t<ValueType_> work_dev(handle, (n + restartIter) * restartIter,
+                                stream);
 
   // Perform Lanczos method
   IndexType_ effIter;
   ValueType_ shift;
-  int status = computeSmallestEigenvectors(handle,
-                                           &A,
-                                           nEigVecs,
-                                           maxIter,
-                                           restartIter,
-                                           tol,
-                                           reorthogonalize,
-                                           &effIter,
-                                           &iter,
-                                           &shift,
-                                           alpha_host,
-                                           beta_host,
-                                           lanczosVecs_dev.raw(),
-                                           work_dev.raw(),
-                                           eigVals_dev,
-                                           eigVecs_dev,
-                                           seed);
+  int status = computeSmallestEigenvectors(
+    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
+    &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
+    eigVals_dev, eigVecs_dev, seed);
 
   // Clean up and return
   return status;
@@ -1064,29 +942,20 @@ int computeSmallestEigenvectors(handle_t handle,
 * @return error flag.
*/ template -int computeLargestEigenvectors(handle_t handle, - sparse_matrix_t const* A, - IndexType_ nEigVecs, - IndexType_ maxIter, - IndexType_ restartIter, - ValueType_ tol, - bool reorthogonalize, - IndexType_ *effIter, - IndexType_ *totalIter, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeLargestEigenvectors( + handle_t handle, sparse_matrix_t const *A, + IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, + ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, + IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, + ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, + ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, + ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; // Matrix dimension @@ -1113,12 +982,15 @@ int computeLargestEigenvectors(handle_t handle, // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors."); + RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); RAFT_EXPECT(tol > 0, "Invalid tolerance."); RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // ------------------------------------------------------- // Variable initialization // ------------------------------------------------------- @@ -1130,11 +1002,12 @@ int computeLargestEigenvectors(handle_t handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - cublasset_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // Compute largest eigenvalue @@ -1146,9 +1019,13 @@ int computeLargestEigenvectors(handle_t handle, CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); // Initialize initial Lanczos vector - CUDA_TRY(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); - ValueType_ normQ1 = cublasnrm2(n, lanczosVecs_dev, 1); - cublasscal(n, 1 / normQ1, lanczosVecs_dev, 1); + CUDA_TRY( + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + ValueType_ normQ1; + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); + + auto h_val = 1 / normQ1; + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
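/* Editorial note, based on cuRAND's documented requirement that
 * curandGenerateNormal/curandGenerateNormalDouble produce an even number of
 * values for pseudo-random generators: that is why the initial Lanczos vector
 * is filled with `n + n % 2` samples above. lanczosVecs_dev holds
 * n * (restartIter + 1) >= n + 1 entries, so the possible extra sample is
 * harmless. */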
@@ -1158,21 +1035,13 @@ int computeLargestEigenvectors(handle_t handle, // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; ValueType_ shift_val = 0.0; - ValueType_ *shift = &shift_val; + ValueType_ *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1189,18 +1058,9 @@ int computeLargestEigenvectors(handle_t handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - false); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1209,17 +1069,9 @@ int computeLargestEigenvectors(handle_t handle, // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1232,14 +1084,12 @@ int computeLargestEigenvectors(handle_t handle, for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(ValueType_)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(ValueType_)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1266,35 +1116,24 @@ int computeLargestEigenvectors(handle_t handle, // Obtain desired eigenvalues by applying shift for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < top_eigenparis_idx_offset; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(ValueType_), - cudaMemcpyHostToDevice)); + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed - CUDA_TRY(cudaMemcpy(work_dev, - Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) 
* nEigVecs * sizeof(ValueType_),
-                      cudaMemcpyHostToDevice));
+  CUDA_TRY(cudaMemcpy(
+    work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)),
+    (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice));
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  cublasgemm(false,
-             false,
-             n,
-             nEigVecs,
-             *effIter,
-             &one,
-             lanczosVecs_dev,
-             n,
-             work_dev,
-             *effIter,
-             &zero,
-             eigVecs_dev,
-             n);
+  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
+                          *effIter, &one, lanczosVecs_dev, n, work_dev,
+                          *effIter, &zero, eigVecs_dev, n, stream));
 
   // Clean up and exit
   CUDA_TRY(curandDestroyGenerator(randGen));
@@ -1339,23 +1178,18 @@ int computeLargestEigenvectors(handle_t handle,
 */
 template <typename IndexType_, typename ValueType_>
 int computeLargestEigenvectors(handle_t handle,
-                               sparse_matrix_t<IndexType_, ValueType_> const& A,
-                               IndexType_ nEigVecs,
-                               IndexType_ maxIter,
-                               IndexType_ restartIter,
-                               ValueType_ tol,
-                               bool reorthogonalize,
-                               IndexType_ &iter,
+                               sparse_matrix_t<IndexType_, ValueType_> const &A,
+                               IndexType_ nEigVecs, IndexType_ maxIter,
+                               IndexType_ restartIter, ValueType_ tol,
+                               bool reorthogonalize, IndexType_ &iter,
                                ValueType_ *__restrict__ eigVals_dev,
                                ValueType_ *__restrict__ eigVecs_dev,
-                               unsigned long long seed = 123456,
-                               cudaStream_t stream = 0)
-{
+                               unsigned long long seed = 123456) {
   // Matrix dimension
   IndexType_ n = A.nrows;
 
   // Check that parameters are valid
-  RAFT_EXPECT(nEigVecs > 0 && nEigVecs<=n, "Invalid number of eigenvectors.");
+  RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECT(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECT(tol > 0, "Invalid tolerance.");
   RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1366,34 +1200,22 @@ int computeLargestEigenvectors(handle_t handle,
   std::vector<ValueType_> beta_host_v(restartIter);
 
   ValueType_ *alpha_host = alpha_host_v.data();
-  ValueType_ *beta_host = beta_host_v.data();
+  ValueType_ *beta_host = beta_host_v.data();
+
+  auto stream = handle.get_stream();
 
   //TODO: replace and fix allocation via RAFT handle
   vector_t<ValueType_> lanczosVecs_dev(handle, n * (restartIter + 1), stream);
-  vector_t<ValueType_> work_dev(handle, (n + restartIter) * restartIter, stream);
+  vector_t<ValueType_> work_dev(handle, (n + restartIter) * restartIter,
+                                stream);
 
   // Perform Lanczos method
   IndexType_ effIter;
-  int status = computeLargestEigenvectors(handle,
-                                          &A,
-                                          nEigVecs,
-                                          maxIter,
-                                          restartIter,
-                                          tol,
-                                          reorthogonalize,
-                                          &effIter,
-                                          &iter,
-                                          alpha_host,
-                                          beta_host,
-                                          lanczosVecs_dev.raw(),
-                                          work_dev.raw(),
-                                          eigVals_dev,
-                                          eigVecs_dev,
-                                          seed);
+  int status = computeLargestEigenvectors(
+    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
+    &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
+    eigVals_dev, eigVecs_dev, seed);
 
   // Clean up and return
   return status;
 }
-
 }  // namespace raft

From e0d6cf674d375bf1ff097bc0d9c5c33ecac7177c Mon Sep 17 00:00:00 2001
From: Andrei Schaffer 
Date: Wed, 10 Jun 2020 15:24:13 -0500
Subject: [PATCH 19/88] Kmeans updates.

---
 cpp/include/raft/spectral/kmeans.hpp | 198 ++++++++++++++-------------
 1 file changed, 100 insertions(+), 98 deletions(-)

diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index 691df3e5ce..69ebbada91 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
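/* Editorial note: the k-means changes below adopt the same convention as the
 * Lanczos rework, a leading raft handle from which each routine derives its
 * resources,
 *
 *   auto cublas_h = handle.get_cublas_handle();
 *   auto stream   = handle.get_stream();
 *
 * with asynchronous CUDA work pinned to that stream. */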
@@ -14,14 +14,9 @@ * limitations under the License. */ -//#ifdef NVGRAPH_PARTITION -//#ifdef DEBUG - -#include "include/kmeans.hxx" - -#include -#include -#include +#include +#include +#include #include #include @@ -32,13 +27,10 @@ #include #include -#include "include/atomics.hxx" -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" - -using namespace nvgraph; +#include +#include +#include +#include // ========================================================= // Useful macros @@ -342,7 +334,8 @@ static __global__ void divideCentroids(IndexType_ d, * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(IndexType_ n, +static int chooseNewCentroid(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ rand, @@ -359,9 +352,12 @@ static int chooseNewCentroid(IndexType_ n, // Observation vector that is chosen as new centroid IndexType_ obsIndex; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Compute cumulative sum of distances - inclusive_scan( - device_pointer_cast(dists), device_pointer_cast(dists + n), device_pointer_cast(distsCumSum)); + thrust::inclusive_scan( + thrust::device_pointer_cast(dists),thrust::device_pointer_cast(dists + n),thrust::device_pointer_cast(distsCumSum)); cudaCheckError(); CHECK_CUDA( cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -370,16 +366,16 @@ static int chooseNewCentroid(IndexType_ n, // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) obsIndex = - (lower_bound( - device_pointer_cast(distsCumSum), device_pointer_cast(distsCumSum + n), distsSum * rand) - - device_pointer_cast(distsCumSum)); + (thrust::lower_bound( + thrust::device_pointer_cast(distsCumSum),thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + thrust::device_pointer_cast(distsCumSum)); cudaCheckError(); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); // Record new centroid position CHECK_CUDA(cudaMemcpyAsync( - centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); return 0; } @@ -406,14 +402,16 @@ static int chooseNewCentroid(IndexType_ n, * @return Zero if successful. Otherwise non-zero. 
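 * Seeding outline (k-means++): the first centroid is drawn uniformly
 * (dists is pre-filled with ones), and every subsequent centroid is
 * drawn with probability proportional to the squared distance from
 * each observation to its nearest centroid chosen so far; the
 * minDistances2 kernel folds the newest centroid into that running
 * minimum between draws.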
*/ template -static int initializeCentroids(IndexType_ n, +static int initializeCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ dists) + ValueType_* __restrict__ dists, + unsigned long long seed = 123456) { // ------------------------------------------------------- // Variable declarations @@ -426,9 +424,12 @@ static int initializeCentroids(IndexType_ n, dim3 blockDim_warp, gridDim_warp, gridDim_block; // Random number generator - thrust::default_random_engine rng(123456); + thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // ------------------------------------------------------- // Implementation // ------------------------------------------------------- @@ -445,40 +446,40 @@ static int initializeCentroids(IndexType_ n, gridDim_block.z = 1; // Assign observation vectors to code 0 - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); // Choose first centroid thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); cudaCheckError(); - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, dists); + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, dists); cudaCheckError() // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_))); - computeDistances<<>>( + CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); cudaCheckError(); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); cudaCheckError(); } // Compute cluster sizes - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); - computeClusterSizes<<>>(n, k, codes, clusterSizes); + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + computeClusterSizes<<>>(n, k, codes, clusterSizes); cudaCheckError(); return 0; @@ -508,7 +509,8 @@ static int initializeCentroids(IndexType_ n, * @return Zero if successful. Otherwise non-zero. 
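 * Cost note: this step evaluates the full n x k matrix of squared
 * distances (hence the n*k workspace), takes an argmin per
 * observation, and reduces the per-observation minima into the
 * residual sum of squares, so each call is O(n*k*d) work.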
*/ template -static int assignCentroids(IndexType_ n, +static int assignCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, @@ -521,26 +523,29 @@ static int assignCentroids(IndexType_ n, // CUDA grid dimensions dim3 blockDim, gridDim; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Compute distance between centroids and observation vectors - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_))); + CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = min(k, 65535); gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - computeDistances<<>>(n, d, k, obs, centroids, dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); cudaCheckError(); // Find centroid closest to each observation vector - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); + CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); gridDim.y = 1; gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); + minDistances<<>>(n, k, dists, codes, clusterSizes); cudaCheckError(); // Compute residual sum of squares @@ -572,7 +577,8 @@ static int assignCentroids(IndexType_ n, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(IndexType_ n, +static int updateCentroids(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, @@ -582,7 +588,6 @@ static int updateCentroids(IndexType_ n, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int) { - using namespace thrust; // ------------------------------------------------------- // Variable declarations @@ -592,40 +597,55 @@ static int updateCentroids(IndexType_ n, const ValueType_ one = 1; const ValueType_ zero = 0; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // CUDA grid dimensions dim3 blockDim, gridDim; // Device memory - device_ptr obs_copy(work); - device_ptr codes_copy(work_int); - device_ptr rows(work_int + d * n); + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - Cublas::geam( - true, false, n, d, &one, obs, d, &zero, (ValueType_*)NULL, n, raw_pointer_cast(obs_copy), n); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (ValueType_*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry - sequence(rows, rows + d * n); + thrust::sequence(rows, rows + d * n); cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); + thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); cudaCheckError(); - gather(rows, rows + d * n, device_pointer_cast(codes), codes_copy); + thrust::gather(rows, rows + d * n,thrust::device_pointer_cast(codes), codes_copy); cudaCheckError(); // Row associated with each observation matrix entry - sequence(rows, rows + d * n); + thrust::sequence(rows, rows + d * n); cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); + thrust::transform(rows, 
rows + d * n, make_constant_iterator(n), rows, divides()); cudaCheckError(); // Sort and reduce to add observation vectors in same cluster - stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); + thrust::stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); cudaCheckError(); - reduce_by_key(rows, + thrust::reduce_by_key(rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored - device_pointer_cast(centroids)); + thrust::device_pointer_cast(centroids)); cudaCheckError(); // Divide sums by cluster size to get centroid matrix @@ -635,7 +655,7 @@ static int updateCentroids(IndexType_ n, gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); gridDim.z = 1; - divideCentroids<<>>(d, k, clusterSizes, centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); cudaCheckError(); return 0; @@ -643,7 +663,7 @@ static int updateCentroids(IndexType_ n, } // namespace -namespace nvgraph { +namespace raft { // ========================================================= // k-means algorithm @@ -682,7 +702,8 @@ namespace nvgraph { * @return NVGRAPH error flag. */ template -NVGRAPH_ERROR kmeans(IndexType_ n, +NVGRAPH_ERROR kmeans(handle_t handle, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, @@ -694,7 +715,8 @@ NVGRAPH_ERROR kmeans(IndexType_ n, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int, ValueType_* residual_host, - IndexType_* iters_host) + IndexType_* iters_host, + unsigned long long seed = 123456) { // ------------------------------------------------------- // Variable declarations @@ -707,7 +729,7 @@ NVGRAPH_ERROR kmeans(IndexType_ n, ValueType_ residualPrev = 0; // Random number generator - thrust::default_random_engine rng(123456); + thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); // ------------------------------------------------------- @@ -736,11 +758,14 @@ NVGRAPH_ERROR kmeans(IndexType_ n, return NVGRAPH_ERR_BAD_PARAMETERS; } + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + // Trivial cases if (k == 1) { - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); - CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice)); - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim, gridDim; blockDim.x = WARP_SIZE; @@ -749,8 +774,8 @@ NVGRAPH_ERROR kmeans(IndexType_ n, gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); gridDim.y = 1; gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, work); + CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, work); cudaCheckError(); *residual_host = thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); @@ -763,15 +788,16 @@ NVGRAPH_ERROR kmeans(IndexType_ n, thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); 
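  // Degenerate case n <= k: each observation becomes its own cluster
  // (codes = 0..n-1 with unit cluster sizes), any remaining clusters
  // stay empty, the centroids are the observations themselves, and
  // the residual is exactly zero.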
cudaCheckError(); - if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_))); + if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_), stream)); CHECK_CUDA( - cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); + cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return NVGRAPH_OK; } // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // k-means++ algorithm @@ -784,7 +810,7 @@ NVGRAPH_ERROR kmeans(IndexType_ n, // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation @@ -815,17 +841,17 @@ NVGRAPH_ERROR kmeans(IndexType_ n, } // Check for convergence - if (fabs(residualPrev - (*residual_host)) / n < tol) { + if (std::fabs(residualPrev - (*residual_host)) / n < tol) { ++iter; break; } } // Warning if k-means has failed to converge - if (fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; - return NVGRAPH_OK; + return 0; } /// Find clusters with k-means algorithm @@ -908,28 +934,4 @@ NVGRAPH_ERROR kmeans(IndexType_ n, &iters); } -// ========================================================= -// Explicit instantiations -// ========================================================= - -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - float tol, - int maxiter, - const float* __restrict__ obs, - int* __restrict__ codes, - float& residual, - int& iters); -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - double tol, - int maxiter, - const double* __restrict__ obs, - int* __restrict__ codes, - double& residual, - int& iters); -} // namespace nvgraph -//#endif //NVGRAPH_PARTITION -//#endif //debug +} // namespace raft From cca40eab5927d5662dda06f00f70281c0d88580a Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 10 Jun 2020 17:18:40 -0500 Subject: [PATCH 20/88] Update on kmeans and cleanup. --- cpp/include/raft/spectral/kmeans.hpp | 478 ++++---- cpp/include/raft/spectral/lanczos.hpp | 10 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 191 ++- cpp/include/raft/spectral/spectral_matrix.hpp | 1044 ----------------- 4 files changed, 287 insertions(+), 1436 deletions(-) delete mode 100644 cpp/include/raft/spectral/spectral_matrix.hpp diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 69ebbada91..a9b1c1f049 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -71,13 +71,9 @@ namespace { * initialized to zero. 
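 * Thread mapping: x-threads sweep the vector entries and each warp
 * reduces its partial sums with shfl_down, y-blocks index centroids
 * and z-blocks index observations; every warp then atomically
 * accumulates its contribution into dists(i,j), which is why the
 * output buffer must be zero-initialized:
 *   dists(i,j) = sum_l (obs(l,i) - centroids(l,j))^2.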
*/ template -static __global__ void computeDistances(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists) -{ +static __global__ void computeDistances( + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists) { // Loop index IndexType_ i; @@ -115,7 +111,8 @@ static __global__ void computeDistances(IndexType_ n, dist_private += utils::shfl_down(dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) + atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -150,12 +147,10 @@ static __global__ void computeDistances(IndexType_ n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(IndexType_ n, - IndexType_ k, +static __global__ void minDistances(IndexType_ n, IndexType_ k, ValueType_* __restrict__ dists, IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ + IndexType_* __restrict__ clusterSizes) { // Loop index IndexType_ i, j; @@ -174,8 +169,8 @@ static __global__ void minDistances(IndexType_ n, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -212,8 +207,7 @@ static __global__ void minDistances2(IndexType_ n, ValueType_* __restrict__ dists_old, const ValueType_* __restrict__ dists_new, IndexType_* __restrict__ codes_old, - IndexType_ code_new) -{ + IndexType_ code_new) { // Loop index IndexType_ i; @@ -250,11 +244,9 @@ static __global__ void minDistances2(IndexType_ n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes(IndexType_ n, - IndexType_ k, - const IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ +static __global__ void computeClusterSizes( + IndexType_ n, IndexType_ k, const IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes) { IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -282,11 +274,9 @@ static __global__ void computeClusterSizes(IndexType_ n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(IndexType_ d, - IndexType_ k, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids) -{ +static __global__ void divideCentroids( + IndexType_ d, IndexType_ k, const IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids) { // Global indices IndexType_ gidx, gidy; @@ -300,7 +290,7 @@ static __global__ void divideCentroids(IndexType_ d, clusterSize_private = clusterSizes[gidy]; // Add vector entries to centroid matrix - // Vector entris are determined by global x-index + // vector entris are determined by global x-index gidx = threadIdx.x + blockIdx.x * blockDim.x; while (gidx < d) { centroids[IDX(gidx, gidy, d)] /= clusterSize_private; @@ -333,16 +323,13 @@ static __global__ void divideCentroids(IndexType_ d, * coordinates. * @return Zero if successful. Otherwise non-zero. 
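 * Worked example (illustrative numbers only): with squared distances
 * dists = (1, 4, 4) the inclusive scan yields (1, 5, 9); for
 * rand = 0.5 the threshold is 0.5 * 9 = 4.5 and lower_bound selects
 * index 1, so observations far from every current centroid are
 * sampled more often.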
*/ -template +template static int chooseNewCentroid(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ rand, + ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ rand, const ValueType_* __restrict__ obs, ValueType_* __restrict__ dists, - ValueType_* __restrict__ centroid) -{ + ValueType_* __restrict__ centroid) { using namespace thrust; // Cumulative sum of distances @@ -356,26 +343,28 @@ static int chooseNewCentroid(handle_t handle, auto stream = handle.get_stream(); // Compute cumulative sum of distances - thrust::inclusive_scan( - thrust::device_pointer_cast(dists),thrust::device_pointer_cast(dists + n),thrust::device_pointer_cast(distsCumSum)); - cudaCheckError(); - CHECK_CUDA( - cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); + thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + thrust::device_pointer_cast(distsCumSum)); + CUDA_CHECK_LAST(); + CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), + cudaMemcpyDeviceToHost)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) - obsIndex = - (thrust::lower_bound( - thrust::device_pointer_cast(distsCumSum),thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - thrust::device_pointer_cast(distsCumSum)); - cudaCheckError(); + obsIndex = (thrust::lower_bound( + thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + thrust::device_pointer_cast(distsCumSum)); + CUDA_CHECK_LAST(); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); // Record new centroid position - CHECK_CUDA(cudaMemcpyAsync( - centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), + d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + stream)); return 0; } @@ -401,18 +390,13 @@ static int chooseNewCentroid(handle_t handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
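 * @param seed Random number generator seed; runs with the same seed
 * reproduce the same k-means++ starting centroids, while different
 * seeds draw independent initializations.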
*/ -template -static int initializeCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ centroids, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ dists, - unsigned long long seed = 123456) -{ +template +static int initializeCentroids( + handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -438,49 +422,54 @@ static int initializeCentroids(handle_t handle, blockDim_warp.x = WARP_SIZE; blockDim_warp.y = 1; blockDim_warp.z = BSIZE_DIV_WSIZE; - gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim_warp.y = 1; - gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim_warp.y = 1; + gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); gridDim_block.y = 1; gridDim_block.z = 1; // Assign observation vectors to code 0 - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); // Choose first centroid - thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - cudaCheckError(); - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids)) + thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), 1); + CUDA_CHECK_LAST(); + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + computeDistances<<>>( + n, d, 1, obs, centroids, dists); cudaCheckError() // Choose remaining centroids - for (i = 1; i < k; ++i) - { + for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - cudaCheckError(); + CUDA_CHECK_LAST(); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - cudaCheckError(); + minDistances2<<>>(n, dists, dists + n, + codes, i); + CUDA_CHECK_LAST(); } // Compute cluster sizes - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); - computeClusterSizes<<>>(n, k, codes, clusterSizes); - cudaCheckError(); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * 
sizeof(IndexType_), stream)); + computeClusterSizes<<>>(n, k, codes, + clusterSizes); + CUDA_CHECK_LAST(); return 0; } @@ -508,18 +497,15 @@ static int initializeCentroids(handle_t handle, * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, +template +static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) -{ + ValueType_* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -527,30 +513,33 @@ static int assignCentroids(handle_t handle, auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - computeDistances<<>>(n, d, k, obs, centroids, dists); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min(k, 65535); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + computeDistances<<>>(n, d, k, obs, centroids, + dists); + CUDA_CHECK_LAST(); // Find centroid closest to each observation vector - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - cudaCheckError(); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, + clusterSizes); + CUDA_CHECK_LAST(); // Compute residual sum of squares *residual_host = - thrust::reduce(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n)); return 0; } @@ -576,25 +565,21 @@ static int assignCentroids(handle_t handle, * Workspace. * @return Zero if successful. Otherwise non-zero. 
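 * Update outline: the observation matrix is transposed with geam,
 * each entry is keyed by its observation's cluster (entry index
 * mod n) and by its coordinate row (entry index / n); a stable sort
 * on the cluster key groups entries per cluster while preserving the
 * row ordering, so reduce_by_key on the row key emits the d
 * per-cluster sums straight into the centroid matrix, which
 * divideCentroids then scales by the cluster sizes.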
*/ -template -static int updateCentroids(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, +template +static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const IndexType_* __restrict__ codes, const IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int) -{ - + IndexType_* __restrict__ work_int) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; + const ValueType_ one = 1; const ValueType_ zero = 0; auto cublas_h = handle.get_cublas_handle(); @@ -609,54 +594,48 @@ static int updateCentroids(handle_t handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (ValueType_*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, + d, &zero, (ValueType_*)NULL, n, + thrust::raw_pointer_cast(obs_copy), n, stream)); // Cluster assigned to each observation matrix entry - thrust::sequence(rows, rows + d * n); - cudaCheckError(); - thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); - cudaCheckError(); - thrust::gather(rows, rows + d * n,thrust::device_pointer_cast(codes), codes_copy); - cudaCheckError(); + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CUDA_CHECK_LAST(); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + make_constant_iterator(n), rows, + modulus()); + CUDA_CHECK_LAST(); + thrust::gather(thrust_exec_policy, rows, rows + d * n, + thrust::device_pointer_cast(codes), codes_copy); + CUDA_CHECK_LAST(); // Row associated with each observation matrix entry - thrust::sequence(rows, rows + d * n); - cudaCheckError(); - thrust::transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); - cudaCheckError(); + thrust::sequence(thrust_exec_policy, rows, rows + d * n); + CUDA_CHECK_LAST(); + thrust::transform(thrust_exec_policy, rows, rows + d * n, + make_constant_iterator(n), rows, + divides()); + CUDA_CHECK_LAST(); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - cudaCheckError(); - thrust::reduce_by_key(rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - thrust::device_pointer_cast(centroids)); - cudaCheckError(); + thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + make_zip_iterator(make_tuple(obs_copy, rows))); + CUDA_CHECK_LAST(); + thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + codes_copy, // Output to codes_copy is ignored + thrust::device_pointer_cast(centroids)); + CUDA_CHECK_LAST(); // Divide sums by cluster size to get centroid matrix blockDim.x = WARP_SIZE; blockDim.y = BLOCK_SIZE / WARP_SIZE; blockDim.z = 1; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim.z = 1; - divideCentroids<<>>(d, k, clusterSizes, centroids); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / 
BSIZE_DIV_WSIZE, 65535); + gridDim.z = 1; + divideCentroids<<>>(d, k, clusterSizes, + centroids); + CUDA_CHECK_LAST(); return 0; } @@ -699,25 +678,16 @@ namespace raft { * vectors and centroids). * @param iters_host (Output, host memory, 1 entry) Number of * k-means iterations. - * @return NVGRAPH error flag. + * @return error flag. */ -template -NVGRAPH_ERROR kmeans(handle_t handle, - IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int, - ValueType_* residual_host, - IndexType_* iters_host, - unsigned long long seed = 123456) -{ +template +int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, + const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, + IndexType_* __restrict__ clusterSizes, + ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, + IndexType_* __restrict__ work_int, ValueType_* residual_host, + IndexType_* iters_host, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -736,63 +706,50 @@ NVGRAPH_ERROR kmeans(handle_t handle, // Initialization // ------------------------------------------------------- - // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Trivial cases if (k == 1) { - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); - CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), + cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim, gridDim; blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = 1; - gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); - cudaCheckError(); + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); + gridDim.y = 1; + gridDim.z = + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + computeDistances<<>>(n, d, 1, obs, centroids, + work); + CUDA_CHECK_LAST(); *residual_host = - 
thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - cudaCheckError(); - return NVGRAPH_OK; + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work + n)); + CUDA_CHECK_LAST(); + return 0; } if (n <= k) { - thrust::sequence(thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - cudaCheckError(); - thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); - cudaCheckError(); - - if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_), stream)); - CHECK_CUDA( - cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::device_pointer_cast(codes + n)); + CUDA_CHECK_LAST(); + thrust::fill_n(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), n, 1); + CUDA_CHECK_LAST(); + + if (n < k) + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, + (k - n) * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), + cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; - return NVGRAPH_OK; + return 0; } // Initialize cuBLAS @@ -804,40 +761,47 @@ NVGRAPH_ERROR kmeans(handle_t handle, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(n, d, k, obs, centroids, codes, clusterSizes, work)) + if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); + IndexType_ emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid( - n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, + uniformDist(rng), obs, work, + centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) 
WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - cudaCheckError(); + emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); + CUDA_CHECK_LAST(); } // Check for convergence @@ -848,7 +812,8 @@ NVGRAPH_ERROR kmeans(handle_t handle, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) + WARNING("k-means failed to converge"); *iters_host = iter; return 0; @@ -875,63 +840,34 @@ NVGRAPH_ERROR kmeans(handle_t handle, * @param residual On exit, residual sum of squares (sum of squares * of distances between observation vectors and centroids). * @param On exit, number of k-means iterations. - * @return NVGRAPH error flag + * @return error flag */ -template -NVGRAPH_ERROR kmeans(IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - ValueType_& residual, - IndexType_& iters) -{ +template +int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, + const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, + ValueType_& residual, IndexType_& iters, + unsigned long long seed = 123456) { + using namespace matrix; + // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } + RAFT_EXPECT(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECT(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECT(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECT(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECT(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory - // TODO: handle non-zero CUDA streams - cudaStream_t stream = 0; - Vector clusterSizes(k, stream); - Vector centroids(d * k, stream); - Vector work(n * max(k, d), stream); - Vector work_int(2 * d * n, stream); + vector_t clusterSizes(handle, k); + vector_t centroids(handle, d * k); + vector_t work(handle, n * max(k, d)); + vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans(n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters); + return kmeans( + handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, + clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, + &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 938c4421ab..5a334c2c1a 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -880,9 +880,8 @@ int computeSmallestEigenvectors( ValueType_ *beta_host = 
beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, - stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method IndexType_ effIter; @@ -1203,9 +1202,8 @@ int computeLargestEigenvectors(handle_t handle, ValueType_ *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1), stream); - vector_t work_dev(handle, (n + restartIter) * restartIter, - stream); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method IndexType_ effIter; diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index f3fb509e12..d8e497fe0f 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,15 +15,14 @@ */ #pragma once +#include // ? #include -#include // ? #include - -namespace raft{ +namespace raft { namespace matrix { -using size_type = int; // for now; TODO: move it in appropriate header +using size_type = int; // for now; TODO: move it in appropriate header // Vector "view"-like aggregate for linear algebra purposes // @@ -32,22 +31,16 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz): - buffer_(buffer), - size_(sz) - { - } + vector_view_t(value_type* buffer, size_type sz) + : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other): - buffer_(other.buffer_), - size_(other.size_) - { + vector_view_t(vector_view_t&& other) + : buffer_(other.buffer_), size_(other.size_) { other.buffer_ = nullptr; other.size_ = 0; } - vector_view_t& operator = (vector_view_t&& other) - { + vector_view_t& operator=(vector_view_t&& other) { buffer_ = other.buffer_; size_ = other.size_; @@ -64,109 +57,86 @@ class vector_t { value_type* buffer_; size_type size_; cudaStream_t stream_; -public: - - vector_t(handle_t const& raft_handle, size_type sz, cudaStream_t stream = 0): - handle_(raft_handle), - buffer_(static_cast(raft_handle.get_device_allocator()->allocate(sz*sizeof(value_type), stream))), - size_(sz), - stream_(stream) - { - } - ~vector_t(void) - { + public: + vector_t(handle_t const& raft_handle, size_type sz) + : handle_(raft_handle), + buffer_( + static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), + size_(sz), + stream_(raft_handle.get_stream()) {} + + ~vector_t(void) { handle_.get_device_allocator()->deallocate(buffer_, size_, stream_); } - size_type size(void) const - { - return size_; - } - - value_type* raw(void) - { - return buffer_; - } + size_type size(void) const { return size_; } + + value_type* raw(void) { return buffer_; } }; - + template struct sparse_matrix_t { - sparse_matrix_t(index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nnz, - index_type const nrows) : - row_offsets_(row_offsets), - col_indices_(col_indices), - values_(values), - nrows_(nrows), - nnz_(nnz) - { - } + sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, + value_type const* values, index_type const nnz, + index_type const nrows) + : row_offsets_(row_offsets), + col_indices_(col_indices), + 
values_(values), + nrows_(nrows), + nnz_(nnz) {} + + sparse_matrix_t( + GraphCSRView const& csr_view) + : row_offsets_(csr_view.offsets_), + col_indices_(csr_view.indices_), + values_(csr_view.edge_data_), + nrows_(csr_view.number_of_vertices_), + nnz_(csr_view.number_of_edges_) {} + + virtual ~sparse_matrix_t(void) = + default; // virtual because used as base for following matrix types - sparse_matrix_t(GraphCSRView const& csr_view): - row_offsets_(csr_view.offsets_), - col_indices_(csr_view.indices_), - values_(csr_view.edge_data_), - nrows_(csr_view.number_of_vertices_), - nnz_(csr_view.number_of_edges_) - { - } - - - virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types - // y = alpha*A*x + beta*y // - virtual void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const - { + virtual void mv(value_type alpha, value_type const* __restrict__ x, + value_type beta, value_type* __restrict__ y) const { //TODO: call cusparse::csrmv } - + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate - + index_type const* row_offsets_; index_type const* col_indices_; - value_type const* values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... + value_type const* + values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... index_type const nrows_; index_type const nnz_; }; template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz, - cudaStream_t stream = 0) : - sparse_matrix_t(row_offsets,col_indices,values,nrows,nnz), - diagonal_(raft_handle, nrows, stream) - { + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : sparse_matrix_t(row_offsets, col_indices, values, + nrows, nnz), + diagonal_(raft_handle, nrows) { auto* v = diagonal_.raw(); //TODO: more work, here... 
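    // Illustrative sketch only (not part of this patch, and it
    // assumes mv() is already wired to cusparse csrmv): the degree
    // diagonal is D = A * 1, which could be computed along the
    // lines of
    //   vector_t<value_type> ones{raft_handle, nrows};
    //   thrust::fill_n(thrust::device, ones.raw(), nrows, value_type{1});
    //   sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, v);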
} - laplacian_matrix_t(handle_t const& raft_handle, - GraphCSRView const& csr_view, - cudaStream_t stream = 0): - sparse_matrix_t(csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices_, stream) - { - } + laplacian_matrix_t( + handle_t const& raft_handle, + GraphCSRView const& csr_view) + : sparse_matrix_t(csr_view), + diagonal_(raft_handle, csr_view.number_of_vertices_) {} // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const override - { + void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + value_type* __restrict__ y) const override { //TODO: call cusparse::csrmv } @@ -174,38 +144,29 @@ struct laplacian_matrix_t : sparse_matrix_t { }; template -struct modularity_matrix_t: laplacian_matrix_t -{ +struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz, - cudaStream_t stream = 0) : - laplacian_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz, stream) - { + index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : laplacian_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz) { auto* v = laplacian_matrix_t::diagonal_.raw(); //TODO: more work, here... } - modularity_matrix_t(handle_t const& raft_handle, - GraphCSRView const& csr_view, - cudaStream_t stream = 0): - laplacian_matrix_t(raft_handle, csr_view, stream) - { - } - + modularity_matrix_t( + handle_t const& raft_handle, + GraphCSRView const& csr_view) + : laplacian_matrix_t(raft_handle, csr_view) {} + // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type const* __restrict__ x, - value_type beta, - value_type* __restrict__ y) const override - { + void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + value_type* __restrict__ y) const override { //TODO: call cusparse::csrmv } }; - -} // namespace matrix -} // namespace raft + +} // namespace matrix +} // namespace raft diff --git a/cpp/include/raft/spectral/spectral_matrix.hpp b/cpp/include/raft/spectral/spectral_matrix.hpp deleted file mode 100644 index b9186329d3..0000000000 --- a/cpp/include/raft/spectral/spectral_matrix.hpp +++ /dev/null @@ -1,1044 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -// #include -// #include -// #include -// #include -// #include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef DEBUG -#include -#include -#endif - -#include "error_temp.hpp" // TODO: replace w/ actual error handling to be brought in soon - -// CUDA block size -#define BLOCK_SIZE 1024 - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace raft { -namespace matrix { - void check_size(size_t sz) - { - RAFT_EXPECT( sz <= INT_MAX, "Vector larger than INT_MAX"); - } - template - void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec); - *res = thrust::reduce(dev_ptr, dev_ptr + n); - CUDA_CHECK_LAST(); - } - - template - void fill_raw_vec(ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec); - thrust::fill(dev_ptr, dev_ptr + n, value); - CUDA_CHECK_LAST(); - } - - template - void dump_raw_vec(ValueType_* vec, size_t n, int offset, cudaStream_t stream) - { -#ifdef DEBUG - thrust::device_ptr dev_ptr(vec); - std::cout< - __global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) - { - int tidx = blockDim.x * blockIdx.x + threadIdx.x; - for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) { - if (vec[r] != 0.0) - flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) - else - flags[r] = 0; - } - } - template - __global__ void dmv0_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) - { - // y=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] = D[i] * x[i]; - } - template - __global__ void dmv1_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) - { - // y+=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] += D[i] * x[i]; - } - template - void copy_vec(ValueType_* vec1, size_t n, ValueType_* res, cudaStream_t stream) - { - thrust::device_ptr dev_ptr(vec1); - thrust::device_ptr res_ptr(res); -#ifdef DEBUG - // COUT() << "copy "<< n << " elements" << std::endl; -#endif - thrust::copy_n(dev_ptr, n, res_ptr); - CUDA_CHECK_LAST(); - // dump_raw_vec (res, n, 0); - } - - template - void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) - { - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - flag_zeroes_kernel<<>>(num_vertices, vec, flags); - CUDA_CHECK_LAST(); - } - - template - void dmv(size_t num_vertices, - ValueType_ alpha, - ValueType_* D, - ValueType_* x, - ValueType_ beta, - ValueType_* y, - cudaStream_t stream) - { - RAFT_EXPECT((alpha == 1.0) && ((beta == 0.0) || (beta == 1.0)), "Not implemented case of y = D*x"); - - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - if (beta == 0.0) - dmv0_kernel<<>>(D, x, y, n); - else if (beta == 1.0) - dmv1_kernel<<>>(D, x, y, n); - - CUDA_CHECK_LAST(); - } - - template - void set_connectivity(size_t n, - IndexType_ root, - ValueType_ 
self_loop_val, - ValueType_ unreachable_val, - ValueType_* res, - cudaStream_t stream) - { - fill_raw_vec(res, n, unreachable_val); - cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); - CUDA_CHECK_LAST(); - } - - - /*! A Vector contains a device vector of size |E| and type T - */ - template - class Vector { - public: - typedef ValueType_ ValueType; - - protected: - rmm::device_vector values; - - public: - /*! Construct an empty \p Vector. - */ - Vector(void) {} - ~Vector(void) {} - /*! Construct a \p Vector of size vertices. - * - * \param vertices The size of the Vector - */ - Vector(size_t vertices, cudaStream_t stream = 0) - : values(vertices) {} - - size_t get_size() const { return values.size(); } - size_t bytes() const { return values.size()*sizeof(ValueType);} - ValueType const *raw() const { return values.data().get(); } - ValueType *raw() { return values.data().get(); } - - void allocate(size_t n, cudaStream_t stream = 0) - { - values.resize(n);//TODO: delegate to outer alocator! - } - - void fill(ValueType val, cudaStream_t stream = 0) - { - fill_raw_vec(this->raw(), this->get_size(), val, stream); - } - - void copy(Vector &vec1, cudaStream_t stream = 0) - { - RAFT_EXPECT( (get_size() == 0 && vec1.get_size()>0) || (get_size() >= vec1.get_size()) ); - if (this->get_size() == 0 && vec1.get_size()>0) { - allocate(vec1.get_size(), stream); - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - } else if (this->get_size() == vec1.get_size()) - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - else // if (this->get_size() > vec1.get_size()) { - copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); - } - } - - ValueType nrm1(cudaStream_t stream = 0) { - ValueType res = 0; - nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); - return res; - } - }; // class Vector - - /// Abstract matrix class - /** Derived classes must implement matrix-vector products. - */ - template - class Matrix { - public: - /// Number of rows - const IndexType_ m; - /// Number of columns - const IndexType_ n; - /// CUDA stream - cudaStream_t s; - - /// Constructor - /** @param _m Number of rows. - * @param _n Number of columns. - */ - Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} - - /// Destructor - virtual ~Matrix() {} - - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s) = 0; - virtual void getCUDAStream(cudaStream_t *_s) = 0; - - /// Matrix-vector product - /** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output - * vector. 
- */ - virtual void mv(ValueType_ alpha, - const ValueType_ * __restrict__ x, - ValueType_ beta, - ValueType_ * __restrict__ y) const = 0; - - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const = 0; - virtual void reorder(IndexType_ *p) const = 0; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M) = 0; - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const = 0; - }; - - /// Sparse matrix class in CSR format - template - class CsrMatrix : public Matrix { - - private: - /// Whether to transpose matrix - const bool trans; - /// Whether matrix is stored in symmetric format - const bool sym; - /// Number of non-zero entries - const IndexType_ nnz; - /// Matrix properties - const cusparseMatDescr_t descrA; - /// Matrix entry values (device memory) - /*const*/ ValueType_ * csrValA; - /// Pointer to first entry in each row (device memory) - const IndexType_ * csrRowPtrA; - /// Column index of each matrix entry (device memory) - const IndexType_ * csrColIndA; - /// Analysis info (pointer to opaque CUSPARSE struct) - cusparseSolveAnalysisInfo_t info_l; - cusparseSolveAnalysisInfo_t info_u; - /// factored flag (originally set to false, then reset to true after factorization), - /// notice we only want to factor once - bool factored; - - public: - /// Constructor - CsrMatrix(bool _trans, bool _sym, - IndexType_ _m, IndexType_ _n, IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ * _csrValA, - const IndexType_ * _csrRowPtrA, - const IndexType_ * _csrColIndA); - - /// Destructor - virtual ~CsrMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Graph Laplacian matrix - template - class LaplacianMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - LaplacianMatrix(/*const*/ Matrix & _A); - - /// Destructor - virtual ~LaplacianMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, 
ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Modularity matrix - template - class ModularityMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - IndexType_ nnz; - ValueType_ edge_sum; - - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); - - /// Destructor - virtual ~ModularityMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - -// ============================================= -// CUDA kernels -// ============================================= - -namespace { - -/// Apply diagonal matrix to vector -template -static __global__ void diagmv(IndexType_ n, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ *__restrict__ y) -{ - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - y[i] += alpha * D[i] * x[i]; - i += blockDim.x * gridDim.x; - } -} - -/// Apply diagonal matrix to a set of dense vectors (tall matrix) -template -static __global__ void diagmm(IndexType_ n, - IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) -{ - IndexType_ i, j, index; - - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < k; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { - index = i + j * n; - if (beta_is_zero) { - y[index] = alpha * D[i] * x[index]; - } else { - y[index] = alpha * D[i] * x[index] + beta * y[index]; - } - } - } -} -} // namespace - -// ============================================= -// CSR matrix class -// ============================================= - -/// Constructor for CSR matrix class -/** @param _transA Whether to transpose matrix. - * @param _m Number of rows. 
- * @param _n Number of columns. - * @param _nnz Number of non-zero entries. - * @param _descrA Matrix properties. - * @param _csrValA (Input, device memory, _nnz entries) Matrix - * entry values. - * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer - * to first entry in each row. - * @param _csrColIndA (Input, device memory, _nnz entries) Column - * index of each matrix entry. - */ -template -CsrMatrix::CsrMatrix(bool _trans, - bool _sym, - IndexType_ _m, - IndexType_ _n, - IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ *_csrValA, - const IndexType_ *_csrRowPtrA, - const IndexType_ *_csrColIndA) - : Matrix(_m, _n), - trans(_trans), - sym(_sym), - nnz(_nnz), - descrA(_descrA), - csrValA(_csrValA), - csrRowPtrA(_csrRowPtrA), - csrColIndA(_csrColIndA) -{ - RAFT_EXPECT(nnz >= 0, "invalid CSR matrix parameter (nnz<0)"); - Cusparse::set_pointer_mode_host(); -} - -/// Destructor for CSR matrix class -template -CsrMatrix::~CsrMatrix() -{ -} - -/// Get and Set CUDA stream -template -void CsrMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("CsrMatrix setCUDAStream stream=%p\n",this->s); - Cusparse::setStream(_s); -} -template -void CsrMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s)); -} -template -void CsrMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha, - // descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m)); - Cusparse::csrmm(this->trans, - this->sym, - this->m, - k, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - this->n, - &beta, - y, - this->m); -} - -/// Color and Reorder -template -void CsrMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void CsrMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void CsrMatrix::prec_setup(Matrix *_M) -{ - // printf("CsrMatrix prec_setup dispacthed\n"); - if (!factored) { - // analyse lower triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // analyse upper triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u)); - // perform csrilu0 (should be slightly faster than csric0) - CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // set factored flag to true - factored = true; - } -} - -template -void CsrMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - // printf("CsrMatrix prec_solve dispacthed (stream 
%p)\n",this->s); - - // preconditioning Mx=f (where M = L*U, threfore x=U\(L\f)) - // solve lower triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l, - fx, - this->m, - t, - this->m)); - // solve upper triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u, - t, - this->m, - fx, - this->m)); -} - -/// Matrix-vector product for CSR matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void CsrMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // TODO: consider using merge-path csrmv - Cusparse::csrmv(this->trans, - this->sym, - this->m, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - &beta, - y); -} - -template -ValueType_ CsrMatrix::getEdgeSum() const -{ - return 0.0; -} - -// ============================================= -// Laplacian matrix class -// ============================================= - -/// Constructor for Laplacian matrix class -/** @param A Adjacency matrix - */ -template -LaplacianMatrix::LaplacianMatrix( - /*const*/ Matrix &_A) - : Matrix(_A.m, _A.n), A(&_A) -{ - // Check that adjacency matrix is square - RAFT_EXPECT(_A.m == _A.n, "cannot construct Laplacian matrix from non-square adjacency matrix"); - // set CUDA stream - this->s = nullptr; - // Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - - // Set preconditioning matrix pointer to nullptr - M = nullptr; -} - -/// Destructor for Laplacian matrix class -template -LaplacianMatrix::~LaplacianMatrix() -{ -} - -/// Get and Set CUDA stream -template -void LaplacianMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != nullptr) { M->setCUDAStream(_s); } -} -template -void LaplacianMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. 
- */ -template -void LaplacianMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - if (beta == 0) - CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) - else if (beta != 1) - thrust::transform(thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y + this->n), - thrust::make_constant_iterator(beta), - thrust::device_pointer_cast(y), - thrust::multiplies()); - - // Apply diagonal matrix - dim3 gridDim, blockDim; - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - CUDA_CHECK_LAST(); - - // Apply adjacency matrix - A->mv(-alpha, x, 1, y); -} -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. - */ -template -void LaplacianMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Apply diagonal matrix - ValueType_ one = (ValueType_)1.0; - this->dm(k, alpha, x, beta, y); - - // Apply adjacency matrix - A->mm(k, -alpha, x, one, y); -} - -template -void LaplacianMatrix::dm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - IndexType_ t = k * (this->n); - dim3 gridDim, blockDim; - - // setup launch parameters - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - - // Apply diagonal matrix - if (beta == 0.0) { - // set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner - // case) - CHECK_CUDA(cudaMemset(y, 0, t * sizeof(ValueType_))); - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } else { - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } - CUDA_CHECK_LAST(); -} - -/// Color and Reorder -template -void LaplacianMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void LaplacianMatrix::reorder(IndexType_ *p) const -{ -} - -/// Solve preconditioned system M x = f for a set of k vectors -template -void LaplacianMatrix::prec_setup(Matrix *_M) -{ - // save the pointer to preconditioner M - M = _M; - if (M != nullptr) { - // setup the preconditioning matrix M - M->prec_setup(nullptr); - } -} - -template -void LaplacianMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - if (M != nullptr) { - // preconditioning - M->prec_solve(k, alpha, fx, t); - } -} - -template -ValueType_ LaplacianMatrix::getEdgeSum() const -{ - return 0.0; -} -// ============================================= -// Modularity matrix class -// ============================================= - -/// Constructor for Modularity matrix class -/** @param A Adjacency matrix - */ -template -ModularityMatrix::ModularityMatrix( - /*const*/ Matrix &_A, IndexType_ _nnz) - : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz) -{ - // Check that adjacency matrix is square - RAFT_EXPECT(_A.m == _A.n, "cannot construct Modularity matrix from non-square adjacency matrix"); - - // set CUDA stream - this->s = nullptr; - // 
Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - // D.dump(0,this->n); - edge_sum = D.nrm1(); - - // Set preconditioning matrix pointer to nullptr - M = nullptr; -} - -/// Destructor for Modularity matrix class -template -ModularityMatrix::~ModularityMatrix() -{ -} - -/// Get and Set CUDA stream -template -void ModularityMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != nullptr) { M->setCUDAStream(_s); } -} - -template -void ModularityMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void ModularityMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - RAFT_EXPECT(alpha == 1 && beta == 0, "cannot construct Modularity matrix from non-square adjacency matrix"); - - // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, - // double *result)); - // y = A*x - A->mv(alpha, x, 0, y); - ValueType_ dot_res; - // gamma = d'*x - Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // y = y -(gamma/edge_sum)*d - Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); -} -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
- */ -template -void ModularityMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -template -void ModularityMatrix::dm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -/// Color and Reorder -template -void ModularityMatrix::color(IndexType_ *c, IndexType_ *p) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -template -void ModularityMatrix::reorder(IndexType_ *p) const -{ - RAFT_FAIL("Functionality not currently supported in Modularity Matrix."); -} - -/// Solve preconditioned system M x = f for a set of k vectors -template -void ModularityMatrix::prec_setup(Matrix *_M) -{ - // save the pointer to preconditioner M - M = _M; - if (M != nullptr) { - // setup the preconditioning matrix M - M->prec_setup(nullptr); - } -} - -template -void ModularityMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - RAFT_EXPECT(M == nullptr, "Functionality not currently supported in Modularity Matrix."); -} - -template -ValueType_ ModularityMatrix::getEdgeSum() const -{ - return edge_sum; -} - -} // namespace matrix -} // namespace raft From 17091559892b347f58b22039cf5ac13977978574 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 12:48:31 -0500 Subject: [PATCH 21/88] Updating partition entry calls. --- cpp/include/raft/spectral/kmeans.hpp | 16 +-- cpp/include/raft/spectral/lanczos.hpp | 7 - cpp/include/raft/spectral/matrix_wrappers.hpp | 9 +- cpp/include/raft/spectral/partition.hpp | 130 ++++++------------ 4 files changed, 54 insertions(+), 108 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index a9b1c1f049..37c045b7f9 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once #include #include @@ -32,18 +33,15 @@ #include #include +namespace { + // ========================================================= -// Useful macros +// Useful grid settings // ========================================================= -#define BLOCK_SIZE 1024 -#define WARP_SIZE 32 -#define BSIZE_DIV_WSIZE (BLOCK_SIZE / WARP_SIZE) - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace { +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 5a334c2c1a..8f33eb7cc2 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -30,13 +30,6 @@ #include #include -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - namespace raft { namespace { diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index d8e497fe0f..3bca437c0c 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,10 +15,17 @@ */ #pragma once -#include // ? +#include #include #include +// ========================================================= +// Useful macros +// ========================================================= + +// Get index of matrix entry +#define IDX(i, j, lda) ((i) + (j) * (lda)) + namespace raft { namespace matrix { diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index e4b9f50790..90b678a973 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #include "include/partition.hxx" @@ -25,16 +26,10 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include -namespace nvgraph { +namespace raft { // ========================================================= // Useful macros @@ -153,21 +148,24 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * performed. * @param iters_kmeans On exit, number of k-means iterations * performed. - * @return NVGRAPH error flag. + * @return error flag. 
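+ *
+ * Hypothetical usage sketch (illustrative values only; handle, policy,
+ * and graph are assumed to be constructed elsewhere):
+ *
+ *   rmm::device_vector<int> parts(graph.number_of_vertices);
+ *   rmm::device_vector<float> vals(2), vecs(2 * graph.number_of_vertices);
+ *   // 2 parts, 2 eigenvectors, 100/500 Lanczos iteration/restart caps,
+ *   // tol 1e-3 (Lanczos), then 100 k-means iterations with tol 1e-2:
+ *   int rc = raft::partition(handle, policy, graph, 2, 2, 100, 500, 1e-3f,
+ *                            100, 1e-2f, parts.data().get(),
+ *                            vals.data().get(), vecs.data().get());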
*/ -template -NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ parts, - weight_t *eigVals, - weight_t *eigVecs) +template +int partition(handle_t handle, + ThrustExePolicy thrust_exec_policy, + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, + vertex_t nEigVecs, + int maxIter_lanczos, + int restartIter_lanczos, + weight_t tol_lanczos, + int maxIter_kmeans, + weight_t tol_kmeans, + vertex_t *__restrict__ parts, + weight_t *eigVals, + weight_t *eigVecs) { cudaStream_t stream = 0; @@ -190,19 +188,11 @@ NVGRAPH_ERROR partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - LaplacianMatrix L(A); + sparse_matrix_t A{graph}; + laplacian_matrix_t L{handle, graph}; // Compute smallest eigenvalues and eigenvectors - CHECK_NVGRAPH(computeSmallestEigenvectors(L, + RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos, restartIter_lanczos, @@ -260,17 +250,17 @@ NVGRAPH_ERROR partition( // eigVecs.dump(0, nEigVecs*n); // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nParts, - tol_kmeans, - maxIter_kmeans, - eigVecs, - parts, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; + RAFT_TRY(kmeans(n, + nEigVecs, + nParts, + tol_kmeans, + maxIter_kmeans, + eigVecs, + parts, + residual_kmeans, + iters_kmeans)); + + return 0; } // ========================================================= @@ -307,10 +297,10 @@ struct equal_to_i_op { * assignments. * @param edgeCut On exit, weight of edges cut by partition. * @param cost On exit, partition cost function. - * @return NVGRAPH error flag. + * @return error flag. 
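+ *
+ * Illustrative arithmetic (assumed numbers, not taken from this patch):
+ * for nParts = 2, with 3 vertices in part 0, 5 vertices in part 1, and 2
+ * unit-weight edges running between the parts, each part sees 2 cut
+ * edges, so edgeCut = 2 and cost = 2/3 + 2/5 = 16/15.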
*/ template -NVGRAPH_ERROR analyzePartition( +int analyzePartition( cugraph::experimental::GraphCSRView const &graph, vertex_t nParts, const vertex_t *__restrict__ parts, @@ -376,49 +366,7 @@ NVGRAPH_ERROR analyzePartition( } // Clean up and return - return NVGRAPH_OK; + return 0; } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - float tol_lanczos, - int maxIter_kmeans, - float tol_kmeans, - int *__restrict__ parts, - float *eigVals, - float *eigVecs); - -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - double tol_lanczos, - int maxIter_kmeans, - double tol_kmeans, - int *__restrict__ parts, - double *eigVals, - double *eigVecs); - -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int *__restrict__ parts, - float &edgeCut, - float &cost); -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int *__restrict__ parts, - double &edgeCut, - double &cost); - -} // namespace nvgraph +} // namespace raft From aa7d8ec68819f4241b7682cbd19d225b048039ca Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 13:19:37 -0500 Subject: [PATCH 22/88] Updates on partition and matrix wrappers. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 162 +++++++----------- 2 files changed, 66 insertions(+), 100 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 3bca437c0c..04fd8cc185 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -144,7 +144,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv + //TODO: call cusparse::csrmv ... and more } vector_t diagonal_; @@ -171,7 +171,7 @@ struct modularity_matrix_t : laplacian_matrix_t { // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv + //TODO: call cusparse::csrmv ... 
and more } }; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 90b678a973..8dfa38d9a3 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -26,8 +26,8 @@ #include #include -#include #include +#include namespace raft { @@ -39,19 +39,21 @@ namespace raft { #define IDX(i, j, lda) ((i) + (j) * (lda)) template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_ *obs) { IndexType_ i, j, k, index, mm; ValueType_ alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x alpha = 0.0; // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -76,17 +78,17 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ * // scale by alpha alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -IndexType_ next_pow2(IndexType_ n) -{ +IndexType_ next_pow2(IndexType_ n) { IndexType_ v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -100,8 +102,7 @@ IndexType_ next_pow2(IndexType_ n) } template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { IndexType_ p2m; dim3 nthreads, nblocks; @@ -111,9 +112,9 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) nthreads.x = max(2, min(p2m, 32)); nthreads.y = 256 / nthreads.x; nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; // printf("m=%d(%d),n=%d,obs=%p, // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); @@ -150,28 +151,21 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * performed. * @return error flag. 
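+ *
+ * Note (restating what the body below already does): each eigenvector
+ * column v is whitened as v <- (v - mean(v)) / (nrm2(v) / sqrt(n)) and
+ * the eigenvector matrix is then transposed, so that every vertex
+ * becomes a contiguous nEigVecs-dimensional observation for k-means.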
*/ -template -int partition(handle_t handle, - ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ parts, - weight_t *eigVals, - weight_t *eigVecs) -{ - cudaStream_t stream = 0; - +int partition( + handle_t handle, ThrustExePolicy thrust_exec_policy, + cugraph::experimental::GraphCSRView const &graph, + vertex_t nParts, vertex_t nEigVecs, int maxIter_lanczos, + int restartIter_lanczos, weight_t tol_lanczos, int maxIter_kmeans, + weight_t tol_kmeans, vertex_t *__restrict__ parts, weight_t *eigVals, + weight_t *eigVecs) { const weight_t zero{0.0}; const weight_t one{1.0}; + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + int iters_lanczos; int iters_kmeans; @@ -192,22 +186,17 @@ int partition(handle_t handle, laplacian_matrix_t L{handle, graph}; // Compute smallest eigenvalues and eigenvectors - RAFT_TRY(computeSmallestEigenvectors(L, - nEigVecs, - maxIter_lanczos, - restartIter_lanczos, - tol_lanczos, - false, - iters_lanczos, - eigVals, - eigVecs)); + RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos, + restartIter_lanczos, tol_lanczos, false, + iters_lanczos, eigVals, eigVecs)); // Whiten eigenvector matrix for (i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = + thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); cudaCheckError(); mean /= n; thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), @@ -216,7 +205,8 @@ int partition(handle_t handle, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); + std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / + std::sqrt(static_cast(n)); thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(std), @@ -228,38 +218,22 @@ int partition(handle_t handle, // Transpose eigenvector matrix // TODO: in-place transpose { - Vector work(nEigVecs * n, stream); + vector_t work(handle, nEigVecs * n); Cublas::set_pointer_mode_host(); - Cublas::geam(true, - false, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t *)NULL, - nEigVecs, - work.raw(), - nEigVecs); - CHECK_CUDA(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); + Cublas::geam(true, false, nEigVecs, n, &one, eigVecs, n, &zero, + (weight_t *)NULL, nEigVecs, work.raw(), nEigVecs); + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice)); } // Clean up // eigVecs.dump(0, nEigVecs*n); // Find partition with k-means clustering - RAFT_TRY(kmeans(n, - nEigVecs, - nParts, - tol_kmeans, - maxIter_kmeans, - eigVecs, - parts, - residual_kmeans, - iters_kmeans)); - + RAFT_TRY(kmeans(n, nEigVecs, nParts, tol_kmeans, maxIter_kmeans, eigVecs, + parts, residual_kmeans, iters_kmeans)); + return 0; } @@ -278,9 +252,9 @@ struct equal_to_i_op { public: equal_to_i_op(IndexType_ _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) 
? (ValueType_)1.0 : (ValueType_)0.0;
+  __host__ __device__ void operator()(Tuple_ t) {
+    thrust::get<1>(t) =
+      (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0;
  }
};
}  // namespace

@@ -301,51 +275,43 @@ struct equal_to_i_op {
 */
-template <typename vertex_t, typename edge_t, typename weight_t>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename ThrustExePolicy>
int analyzePartition(
+  handle_t handle, ThrustExePolicy thrust_exec_policy,
  cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-  vertex_t nParts,
-  const vertex_t *__restrict__ parts,
-  weight_t &edgeCut,
-  weight_t &cost)
-{
-  cudaStream_t stream = 0;
-
+  vertex_t nParts, const vertex_t *__restrict__ parts, weight_t &edgeCut,
+  weight_t &cost) {
  edge_t i;
  edge_t n = graph.number_of_vertices;

+  auto cublas_h = handle.get_cublas_handle();
+  auto stream = handle.get_stream();
+
  weight_t partEdgesCut, partSize;

  // Device memory
-  Vector<weight_t> part_i(n, stream);
-  Vector<weight_t> Lx(n, stream);
+  vector_t<weight_t> part_i(handle, n);
+  vector_t<weight_t> Lx(handle, n);

  // Initialize cuBLAS
  Cublas::set_pointer_mode_host();

  // Initialize Laplacian
-  CsrMatrix<vertex_t, weight_t> A(false,
-                                  false,
-                                  graph.number_of_vertices,
-                                  graph.number_of_vertices,
-                                  graph.number_of_edges,
-                                  0,
-                                  graph.edge_data,
-                                  graph.offsets,
-                                  graph.indices);
-  LaplacianMatrix<vertex_t, weight_t> L(A);
+  sparse_matrix_t<vertex_t, weight_t> A{graph};
+  laplacian_matrix_t<vertex_t, weight_t> L{handle, graph};

  // Initialize output
-  cost    = 0;
+  cost = 0;
  edgeCut = 0;

  // Iterate through partitions
  for (i = 0; i < nParts; ++i) {
    // Construct indicator vector for ith partition
-    thrust::for_each(
-      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts),
-                                                   thrust::device_pointer_cast(part_i.raw()))),
-      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n),
-                                                   thrust::device_pointer_cast(part_i.raw() + n))),
-      equal_to_i_op<vertex_t, weight_t>(i));
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(
+                       thrust::device_pointer_cast(parts),
+                       thrust::device_pointer_cast(part_i.raw()))),
+                     thrust::make_zip_iterator(thrust::make_tuple(
+                       thrust::device_pointer_cast(parts + n),
+                       thrust::device_pointer_cast(part_i.raw() + n))),
+                     equal_to_i_op<vertex_t, weight_t>(i));
    cudaCheckError();

    // Compute size of ith partition

From 192000c7a35c3536cb489ff93261cf3d178362c7 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 11 Jun 2020 15:58:42 -0500
Subject: [PATCH 23/88] More partition clean-up. Added sm_utils.
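A minimal sketch of how the new shuffle wrappers are meant to be used
(illustrative only, not part of this diff): a warp-level sum, assuming a
full 32-lane warp and a type T covered by the shfl_down overloads:

    template <typename T>
    __device__ T warp_sum(T val) {
      // halve the stride each step: 16, 8, 4, 2, 1
      for (int offset = 16; offset > 0; offset >>= 1)
        val += raft::utils::shfl_down(val, offset);
      return val;  // lane 0 ends up holding the warp-wide total
    }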
--- cpp/include/raft/spectral/kmeans.hpp | 10 +- cpp/include/raft/spectral/lanczos.hpp | 1 + cpp/include/raft/spectral/partition.hpp | 67 +++--- cpp/include/raft/spectral/sm_utils.hpp | 297 ++++++++++++++++++++++++ 4 files changed, 344 insertions(+), 31 deletions(-) create mode 100644 cpp/include/raft/spectral/sm_utils.hpp diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 37c045b7f9..f57a4c1be5 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -32,9 +32,12 @@ #include #include #include +#include namespace { +using namespace raft; +using namespace raft::linalg; // ========================================================= // Useful grid settings // ========================================================= @@ -328,8 +331,6 @@ static int chooseNewCentroid(handle_t handle, const ValueType_* __restrict__ obs, ValueType_* __restrict__ dists, ValueType_* __restrict__ centroid) { - using namespace thrust; - // Cumulative sum of distances ValueType_* distsCumSum = dists + n; // Residual sum of squares @@ -751,8 +752,9 @@ int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, } // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + CUBLAS_CHECK( + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, + stream)); // ????? TODO: check / remove // ------------------------------------------------------- // k-means++ algorithm diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8f33eb7cc2..54818e1766 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -35,6 +35,7 @@ namespace raft { namespace { using namespace matrix; +using namespace linalg; // ========================================================= // Helper functions diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 8dfa38d9a3..156be656bd 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -28,15 +28,12 @@ #include #include +#include namespace raft { -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) +using namespace matrix; +using namespace linalg; template static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, @@ -120,7 +117,7 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { // launch scaling kernel (scale each column of obs by its norm) scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); + CUDA_CHECK_LAST(); return cudaSuccess; } @@ -152,7 +149,8 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * @return error flag. 
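+ *
+ * (The raft handle supplies the cuBLAS handle and CUDA stream used by the
+ * dense linear-algebra calls below, and thrust_exec_policy drives the
+ * Thrust transforms; both replace the implicit default stream used
+ * previously.)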
*/ template + typename ThrustExePolicy, typename EigenSolver = LanczosSolver, + typename ClusterSolver = KmeansSolver> int partition( handle_t handle, ThrustExePolicy thrust_exec_policy, cugraph::experimental::GraphCSRView const &graph, @@ -194,37 +192,47 @@ int partition( for (i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = - thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - cudaCheckError(); + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CUDA_CHECK_LAST(); mean /= n; - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(mean), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); - cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / - std::sqrt(static_cast(n)); - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + CUDA_CHECK_LAST(); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), thrust::make_constant_iterator(std), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::divides()); - cudaCheckError(); + CUDA_CHECK_LAST(); } // Transpose eigenvector matrix // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - Cublas::set_pointer_mode_host(); - Cublas::geam(true, false, nEigVecs, n, &one, eigVecs, n, &zero, - (weight_t *)NULL, nEigVecs, work.raw(), nEigVecs); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t *)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, stream)); } // Clean up @@ -292,7 +300,8 @@ int analyzePartition( vector_t Lx(handle, n); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian sparse_matrix_t A{graph}; @@ -305,17 +314,20 @@ int analyzePartition( // Iterate through partitions for (i = 0; i < nParts; ++i) { // Construct indicator vector for ith partition - thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple( + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(parts), thrust::device_pointer_cast(part_i.raw()))), thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(parts + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(i)); - cudaCheckError(); + CUDA_CHECK_LAST(); // Compute size of ith partition - Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &partSize, stream)); + partSize = round(partSize); if (partSize < 0.5) { WARNING("empty partition"); @@ -324,7 +336,8 @@ int analyzePartition( // Compute number of edges cut by ith partition L.mv(1, part_i.raw(), 0, Lx.raw()); - 
Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); + CUBLAS_CHECK(cublasdot(cublas_h, n, Lx.raw(), 1, part_i.raw(), 1, + &partEdgesCut, stream)); // Record results cost += partEdgesCut / partSize; diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp new file mode 100644 index 0000000000..25d6e2e358 --- /dev/null +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef _MSC_VER +#include +#else +#include +#endif + +#define DEFAULT_MASK 0xffffffff + +#define USE_CG 1 +//(__CUDACC_VER__ >= 80500) + +namespace raft { +namespace utils { +static __device__ __forceinline__ int lane_id() { + int id; + asm("mov.u32 %0, %%laneid;" : "=r"(id)); + return id; +} + +static __device__ __forceinline__ int lane_mask_lt() { + int mask; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int lane_mask_le() { + int mask; + asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); + return mask; +} + +static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } + +static __device__ __forceinline__ unsigned int ballot(int p, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __ballot_sync(mask, p); +#else + return __ballot(p); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0; +#endif +} + +static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#if USE_CG + return __shfl_sync(mask, r, lane, bound); +#else + return __shfl(r, lane, bound); +#endif +#else + return 0.0f; +#endif +} + +/// Warp shuffle down function +/** Warp shuffle functions on 64-bit floating point values are not + * natively implemented as of Compute Capability 5.0. This + * implementation has been copied from + * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). + * Once this is natively implemented, this function can be replaced + * by __shfl_down. 
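+ * The 64-bit value is reinterpreted as an int2, each 32-bit half is
+ * shuffled independently, and the halves are then reassembled.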
+ *
+ */
+static __device__ __forceinline__ double shfl(double r, int lane,
+                                              int bound = 32,
+                                              int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_sync(mask, a.x, lane, bound);
+  a.y = __shfl_sync(mask, a.y, lane, bound);
+  return *reinterpret_cast<double*>(&a);
+#else
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl(a.x, lane, bound);
+  a.y = __shfl(a.y, lane, bound);
+  return *reinterpret_cast<double*>(&a);
+#endif
+#else
+  return 0.0;
+#endif
+}
+
+static __device__ __forceinline__ long long shfl(long long r, int lane,
+                                                 int bound = 32,
+                                                 int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_sync(mask, a.x, lane, bound);
+  a.y = __shfl_sync(mask, a.y, lane, bound);
+  return *reinterpret_cast<long long*>(&a);
+#else
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl(a.x, lane, bound);
+  a.y = __shfl(a.y, lane, bound);
+  return *reinterpret_cast<long long*>(&a);
+#endif
+#else
+  return 0.0;
+#endif
+}
+
+static __device__ __forceinline__ int shfl_down(int r, int offset,
+                                                int bound = 32,
+                                                int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  return __shfl_down_sync(mask, r, offset, bound);
+#else
+  return __shfl_down(r, offset, bound);
+#endif
+#else
+  return 0.0f;
+#endif
+}
+
+static __device__ __forceinline__ float shfl_down(float r, int offset,
+                                                  int bound = 32,
+                                                  int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  return __shfl_down_sync(mask, r, offset, bound);
+#else
+  return __shfl_down(r, offset, bound);
+#endif
+#else
+  return 0.0f;
+#endif
+}
+
+static __device__ __forceinline__ double shfl_down(double r, int offset,
+                                                   int bound = 32,
+                                                   int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down_sync(mask, a.x, offset, bound);
+  a.y = __shfl_down_sync(mask, a.y, offset, bound);
+  return *reinterpret_cast<double*>(&a);
+#else
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down(a.x, offset, bound);
+  a.y = __shfl_down(a.y, offset, bound);
+  return *reinterpret_cast<double*>(&a);
+#endif
+#else
+  return 0.0;
+#endif
+}
+
+static __device__ __forceinline__ long long shfl_down(long long r, int offset,
+                                                      int bound = 32,
+                                                      int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down_sync(mask, a.x, offset, bound);
+  a.y = __shfl_down_sync(mask, a.y, offset, bound);
+  return *reinterpret_cast<long long*>(&a);
+#else
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down(a.x, offset, bound);
+  a.y = __shfl_down(a.y, offset, bound);
+  return *reinterpret_cast<long long*>(&a);
+#endif
+#else
+  return 0.0;
+#endif
+}
+
+// specifically for triangles counting
+static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset,
+                                                     int bound = 32,
+                                                     int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down_sync(mask, a.x, offset, bound);
+  a.y = __shfl_down_sync(mask, a.y, offset, bound);
+  return *reinterpret_cast<uint64_t*>(&a);
+#else
+  int2 a = *reinterpret_cast<int2*>(&r);
+  a.x = __shfl_down(a.x, offset, bound);
+  a.y = __shfl_down(a.y, offset, bound);
+  return *reinterpret_cast<uint64_t*>(&a);
+#endif
+#else
+  return 0.0;
+#endif
+}
+
+static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32,
+                                              int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#ifdef USE_CG
+  return __shfl_up_sync(mask, r, offset, bound);
+#else
+  return __shfl_up(r, offset, bound);
+#endif
+#else
+  return 0.0f;
+#endif
+}
+
+static
__device__ __forceinline__ float shfl_up(float r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + return __shfl_up_sync(mask, r, offset, bound); +#else + return __shfl_up(r, offset, bound); +#endif +#else + return 0.0f; +#endif +} + +static __device__ __forceinline__ double shfl_up(double r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} + +static __device__ __forceinline__ long long shfl_up(long long r, int offset, + int bound = 32, + int mask = DEFAULT_MASK) { +#if __CUDA_ARCH__ >= 300 +#ifdef USE_CG + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up_sync(mask, a.x, offset, bound); + a.y = __shfl_up_sync(mask, a.y, offset, bound); + return *reinterpret_cast(&a); +#else + int2 a = *reinterpret_cast(&r); + a.x = __shfl_up(a.x, offset, bound); + a.y = __shfl_up(a.y, offset, bound); + return *reinterpret_cast(&a); +#endif +#else + return 0.0; +#endif +} +} // namespace utils + +} // namespace raft From 3361dc35d6808e7aec21bab4243c7b02468ca245 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 17:38:17 -0500 Subject: [PATCH 24/88] Update partition with eigen solver interface. --- cpp/include/raft/spectral/eigen_solvers.hpp | 87 +++++++++++++++++++++ cpp/include/raft/spectral/lanczos.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 22 +++--- 3 files changed, 99 insertions(+), 14 deletions(-) create mode 100644 cpp/include/raft/spectral/eigen_solvers.hpp diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp new file mode 100644 index 0000000000..fcb099a556 --- /dev/null +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace raft { + + using namespace matrix; + + // aggregate of control params for Eigen Solver: + // + template + struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize; + unsigned long long seed{1234567}; + }; + + template + struct lanczos_solver_t { + explicit lanczos_solver_t(eigen_solver_config_t const& config): + config_(config) + { + } + + index_type_t solve_smallest_eigenvectors(handle_t handle, sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed)); + return iters; + } + + index_type_t solve_largest_eigenvectors(handle_t handle, sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed)); + return iters; + } + + decltype(auto) get_config(void) const + { + return config_; + } + + private: + eigen_solver_config_t config_; + }; +} // namespace raft diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 54818e1766..c4ab61b78e 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -32,11 +32,11 @@ namespace raft { -namespace { - using namespace matrix; using namespace linalg; +namespace { + // ========================================================= // Helper functions // ========================================================= diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 156be656bd..c8a284fbb1 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -15,8 +15,6 @@ */ #pragma once -#include "include/partition.hxx" - #include #include @@ -27,7 +25,7 @@ #include #include -#include +#include #include namespace raft { @@ -149,22 +147,21 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) { * @return error flag. 
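+ *
+ * Hypothetical setup of the eigen_solver argument (values illustrative):
+ *
+ *   eigen_solver_config_t<int, float> cfg{2, 100, 500, 1e-3f};
+ *   lanczos_solver_t<int, float> eigen_solver{cfg};
+ *   // passed in place of the former Lanczos iteration/tolerance params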
 */
template <typename vertex_t, typename edge_t, typename weight_t,
          typename ThrustExePolicy,
          typename EigenSolver = LanczosSolver<vertex_t, weight_t>,
          typename ClusterSolver = KmeansSolver<vertex_t, weight_t>>
int partition(
    handle_t handle, ThrustExePolicy thrust_exec_policy,
    cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-    vertex_t nParts, vertex_t nEigVecs, int maxIter_lanczos,
-    int restartIter_lanczos, weight_t tol_lanczos, int maxIter_kmeans,
-    weight_t tol_kmeans, vertex_t *__restrict__ parts, weight_t *eigVals,
-    weight_t *eigVecs) {
+    vertex_t nParts, EigenSolver eigen_solver,
+    int maxIter_kmeans, weight_t tol_kmeans, vertex_t *__restrict__ parts,
+    weight_t *eigVals, weight_t *eigVecs) {
  const weight_t zero{0.0};
  const weight_t one{1.0};

  auto cublas_h = handle.get_cublas_handle();
  auto stream = handle.get_stream();

-  int iters_lanczos;
+  int iters_eig_solver;
  int iters_kmeans;

  edge_t i;
@@ -183,10 +180,11 @@ int partition(
  sparse_matrix_t<vertex_t, weight_t> A{graph};
  laplacian_matrix_t<vertex_t, weight_t> L{handle, graph};

+  auto eigen_config = eigen_solver.get_config();
+  auto nEigVecs = eigen_config.n_eigVecs;
+
  // Compute smallest eigenvalues and eigenvectors
-  RAFT_TRY(computeSmallestEigenvectors(L, nEigVecs, maxIter_lanczos,
-                                       restartIter_lanczos, tol_lanczos, false,
-                                       iters_lanczos, eigVals, eigVecs));
+  iters_eig_solver = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);

  // Whiten eigenvector matrix
  for (i = 0; i < nEigVecs; ++i) {

From 0373a52046ccdba7515b731eae7d95e67c6160ae Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 11 Jun 2020 18:06:04 -0500
Subject: [PATCH 25/88] Format and some comments.

---
 cpp/include/raft/spectral/eigen_solvers.hpp | 109 +++++++++-----------
 1 file changed, 48 insertions(+), 61 deletions(-)

diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index fcb099a556..2f87c95b3a 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -19,69 +19,56 @@
 namespace raft {

-  using namespace matrix;
+using namespace matrix;

-  // aggregate of control params for Eigen Solver:
-  //
-  template <typename index_type_t, typename value_type_t,
-            typename size_type_t = index_type_t>
-  struct eigen_solver_config_t {
-    size_type_t n_eigVecs;
-    size_type_t maxIter;
-
-    size_type_t restartIter;
-    value_type_t tol;
+// aggregate of control params for Eigen Solver:
+//
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
+struct eigen_solver_config_t {
+  size_type_t n_eigVecs;
+  size_type_t maxIter;

-    bool reorthogonalize;
-    unsigned long long seed{1234567};
-  };
+  size_type_t restartIter;
+  value_type_t tol;

+  bool reorthogonalize{false};
+  unsigned long long seed{
+    1234567};  // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations
+};

-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
-  struct lanczos_solver_t {
-    explicit lanczos_solver_t(eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config):
-      config_(config)
-    {
-    }
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
+struct lanczos_solver_t {
+  explicit lanczos_solver_t(
+    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config) {}

-    index_type_t solve_smallest_eigenvectors(handle_t handle, sparse_matrix_t<index_type_t, value_type_t> const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
-      index_type_t iters{};
-      RAFT_TRY(computeSmallestEigenvectors(handle,
-                                           A,
-                                           config_.n_eigVecs,
-                                           config_.maxIter,
-                                           config_.restartIter,
-                                           config_.tol,
-                                           config_.reorthogonalize,
-                                           iters,
-                                           eigVals,
-                                           eigVecs,
-                                           config_.seed));
-      return iters;
-    }
+  index_type_t solve_smallest_eigenvectors(
+    handle_t handle, sparse_matrix_t<index_type_t, value_type_t> const& A,
+    value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
+    index_type_t iters{};
+    RAFT_TRY(computeSmallestEigenvectors(
+      handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter,
+      config_.tol, config_.reorthogonalize, iters, eigVals, eigVecs,
+      config_.seed));
+    return iters;
+  }

-    index_type_t solve_largest_eigenvectors(handle_t handle, sparse_matrix_t<index_type_t, value_type_t> const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
-      index_type_t iters{};
-      RAFT_TRY(computeLargestEigenvectors(handle,
-                                          A,
-                                          config_.n_eigVecs,
-                                          config_.maxIter,
-                                          config_.restartIter,
-                                          config_.tol,
-
config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed)); - return iters; - } +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) {} - decltype(auto) get_config(void) const - { - return config_; - } - - private: - eigen_solver_config_t config_; - }; -} // namespace raft + index_type_t solve_smallest_eigenvectors( + handle_t handle, sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeSmallestEigenvectors( + handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, + config_.tol, config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed)); + return iters; + } + + index_type_t solve_largest_eigenvectors( + handle_t handle, sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) { + index_type_t iters{}; + RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, + config_.maxIter, config_.restartIter, + config_.tol, config_.reorthogonalize, + iters, eigVals, eigVecs, config_.seed)); + return iters; + } + + decltype(auto) get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; +} // namespace raft From 308e893e587f0b55ca576842ca6687219a0ec96a Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 21:54:38 -0500 Subject: [PATCH 26/88] Added generic cluster solvers. Partition clean-up. --- cpp/include/raft/spectral/cluster_solvers.hpp | 63 +++++++++++++++++++ cpp/include/raft/spectral/eigen_solvers.hpp | 8 ++- cpp/include/raft/spectral/partition.hpp | 37 ++++++----- 3 files changed, 86 insertions(+), 22 deletions(-) create mode 100644 cpp/include/raft/spectral/cluster_solvers.hpp diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp new file mode 100644 index 0000000000..cd0963506f --- /dev/null +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include
+#include // for std::pair
+
+namespace raft {
+
+using namespace matrix;
+
+// aggregate of control params for Cluster Solver:
+//
+template
+struct cluster_solver_config_t {
+ size_type_t n_clusters;
+ size_type_t maxIter;
+
+ value_type_t tol;
+
+ unsigned long long seed{123456};
+};
+
+template
+struct kmeans_solver_t {
+ explicit kmeans_solver_t(
+ cluster_solver_config_t const& config)
+ : config_(config) {}
+
+ template
+ std::pair solve(
+ handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs,
+ size_type_t dim, value_type_t const* __restrict__ obs,
+ index_type_t* __restrict__ codes) const {
+ value_type_t residual{};
+ index_type_t iters{};
+ RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters,
+ config_.tol, config_.maxIter, obs, codes, residual, iters,
+ config_.seed));
+ return std::make_pair(residual, iters);
+ }
+
+ auto const& get_config(void) const { return config_; }
+
+ private:
+ cluster_solver_config_t config_;
+};
+} // namespace raft
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 2f87c95b3a..9c1258c432 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -46,7 +46,8 @@ struct lanczos_solver_t {
 index_type_t solve_smallest_eigenvectors( handle_t handle, sparse_matrix_t const& A,
- value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
+ value_type_t* __restrict__ eigVals,
+ value_type_t* __restrict__ eigVecs) const {
 index_type_t iters{}; RAFT_TRY(computeSmallestEigenvectors( handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -57,7 +58,8 @@ struct lanczos_solver_t {
 index_type_t solve_largest_eigenvectors( handle_t handle, sparse_matrix_t const& A,
- value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) {
+ value_type_t* __restrict__ eigVals,
+ value_type_t* __restrict__ eigVecs) const {
 index_type_t iters{}; RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -66,7 +68,7 @@ struct lanczos_solver_t { return iters; }
- decltype(auto) get_config(void) const { return config_; }
+ auto const& get_config(void) const { return config_; }
 private: eigen_solver_config_t config_;
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index c8a284fbb1..0e858ac90d 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -24,7 +24,9 @@
 #include #include
-#include
+#include
+
+#include
 #include #include
@@ -147,29 +149,26 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) {
 * @return error flag.
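 *
 * A minimal sketch of building the two injected solvers (illustrative names
 * and values; assumes the config types from eigen_solvers.hpp and
 * cluster_solvers.hpp above, with index_type_t = int, value_type_t = double):
 * @code
 *   eigen_solver_config_t<int, double> eig_cfg{2, 100, 64, 1e-4};
 *   lanczos_solver_t<int, double> eig_solver{eig_cfg};
 *   cluster_solver_config_t<int, double> clust_cfg{5, 100, 1e-2};
 *   kmeans_solver_t<int, double> cluster_solver{clust_cfg};
 * @endcode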
 */ template ,
+ typename ThrustExePolicy,
+ typename EigenSolver = lanczos_solver_t,
+ typename ClusterSolver = kmeans_solver_t>
+std::tuple partition(
 handle_t handle, ThrustExePolicy thrust_exec_policy, cugraph::experimental::GraphCSRView const &graph,
- vertex_t nParts, EigenSolver eigen_solver,
- int maxIter_kmeans, weight_t tol_kmeans, vertex_t *__restrict__ parts,
- weight_t *eigVals, weight_t *eigVecs) {
+ EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
+ vertex_t *__restrict__ parts, weight_t *eigVals, weight_t *eigVecs) {
 const weight_t zero{0.0}; const weight_t one{1.0}; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream();
- int iters_eig_solver;
- int iters_kmeans;
+ std::tuple
+ stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
 edge_t i; edge_t n = graph.number_of_vertices;
- // k-means residual
- weight_t residual_kmeans;
-
 // ------------------------------------------------------- // Spectral partitioner // -------------------------------------------------------
@@ -184,7 +183,7 @@ std::tuple partition(
 auto nEigVecs = eigen_config.n_eigVecs;
 // Compute smallest eigenvalues and eigenvectors
- iters_eig_solver = eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs);
+ stats.get<0>() = eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs);
 // Whiten eigenvector matrix for (i = 0; i < nEigVecs; ++i) {
@@ -233,14 +232,14 @@ ...
- // Clean up
+ // Find partition with clustering
+ auto pair_cluster = cluster_solver.solve(handle, t_thrust_exec_policy, n,
+ nEigVecs, eigVecs, parts);
- // eigVecs.dump(0, nEigVecs*n);
- // Find partition with k-means clustering
- RAFT_TRY(kmeans(n, nEigVecs, nParts, tol_kmeans, maxIter_kmeans, eigVecs,
- parts, residual_kmeans, iters_kmeans));
+ stats.get<1>() = pair_cluster.first;
+ stats.get<2>() = pair_cluster.second;
- return 0;
+ return stats;
 } // =========================================================
From 8fb1c0425c644158017ebad5b9e8aa5b92645931 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 11 Jun 2020 22:01:55 -0500 Subject: [PATCH 27/88] Fixed tuple.
--- cpp/include/raft/spectral/partition.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 0e858ac90d..ea5bcdd1d8 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -183,7 +183,8 @@ std::tuple partition(
 auto nEigVecs = eigen_config.n_eigVecs;
 // Compute smallest eigenvalues and eigenvectors
- stats.get<0>() = eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs);
+ std::get<0>(stats) =
+ eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs);
 // Whiten eigenvector matrix for (i = 0; i < nEigVecs; ++i) {
@@ -236,8 +237,8 @@ std::tuple partition(
 auto pair_cluster = cluster_solver.solve(handle, t_thrust_exec_policy, n, nEigVecs, eigVecs, parts);
- stats.get<1>() = pair_cluster.first;
- stats.get<2>() = pair_cluster.second;
+ std::get<1>(stats) = pair_cluster.first;
+ std::get<2>(stats) = pair_cluster.second;
 return stats; }
From 25756ecafbd566f89dfd46216672c2c105693158 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 12 Jun 2020 16:59:13 -0500 Subject: [PATCH 28/88] Modularity Maximization refactor. More cleanup in matrix wrappers and partition.
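Note on the tuple fix above: std::tuple has no .get<I>() member function (that interface belongs to Boost-style tuples), so the free function is required, e.g. std::get<0>(stats) = iters; reading a statistic back works the same way, e.g. auto n_iters = std::get<0>(stats);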
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 216 +++++++++- .../raft/spectral/modularity_maximization.hpp | 383 ++++-------------- cpp/include/raft/spectral/partition.hpp | 184 ++------- 3 files changed, 320 insertions(+), 463 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 04fd8cc185..68cd829949 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,6 +18,7 @@ #include #include #include +#include // ========================================================= // Useful macros @@ -27,6 +28,153 @@ #define IDX(i, j, lda) ((i) + (j) * (lda)) namespace raft { + +namespace { + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_* obs) { + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + CUDA_CHECK_LAST(); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, + edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + 
thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CUDA_CHECK_LAST(); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + CUDA_CHECK_LAST(); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + CUDA_CHECK_LAST(); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + vector_t work(handle, nEigVecs * n); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice, stream)); + } +} + +} // namespace + namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header @@ -109,7 +257,10 @@ struct sparse_matrix_t { // virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const { - //TODO: call cusparse::csrmv + //TODO: + // + //Cusparse::set_pointer_mode_host(); + //cusparsecsrmv(...); } //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate @@ -131,20 +282,53 @@ struct laplacian_matrix_t : sparse_matrix_t { nrows, nnz), diagonal_(raft_handle, nrows) { auto* v = diagonal_.raw(); - //TODO: more work, here... + //TODO: more work, here: + // + // vector_t ones(nrows); + // ones.fill(1.0); + // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } laplacian_matrix_t( handle_t const& raft_handle, GraphCSRView const& csr_view) : sparse_matrix_t(csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices_) {} + diagonal_(raft_handle, csr_view.number_of_vertices_) { + //TODO: more work, here: + // + // vector_t ones(csr_view.number_of_vertices_); + // ones.fill(1.0); + // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + } // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv ... and more + //TODO: call cusparse::csrmv ... 
and more: + // + // if (beta == 0) + // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) + // else if (beta != 1) + // thrust::transform(thrust::device_pointer_cast(y), + // thrust::device_pointer_cast(y + this->n), + // thrust::make_constant_iterator(beta), + // thrust::device_pointer_cast(y), + // thrust::multiplies()); + + // // Apply diagonal matrix + // dim3 gridDim, blockDim; + // gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + // gridDim.y = 1; + // gridDim.z = 1; + // blockDim.x = BLOCK_SIZE; + // blockDim.y = 1; + // blockDim.z = 1; + // diagmv<<s>>>(this->n, alpha, D.raw(), x, y); + // cudaCheckError(); + + // // Apply adjacency matrix + // sparse_matrix_t::mv(-alpha, x, 1, y); } vector_t diagonal_; @@ -159,20 +343,38 @@ struct modularity_matrix_t : laplacian_matrix_t { : laplacian_matrix_t( raft_handle, row_offsets, col_indices, values, nrows, nnz) { auto* v = laplacian_matrix_t::diagonal_.raw(); - //TODO: more work, here... + //TODO: more work, here: + // + // diag_nrm1_ = diagonal_.nrm1(); } modularity_matrix_t( handle_t const& raft_handle, GraphCSRView const& csr_view) - : laplacian_matrix_t(raft_handle, csr_view) {} + : laplacian_matrix_t(raft_handle, csr_view) { + //TODO: more work, here: + // + // diag_nrm1_ = diagonal_.nrm1(); + } // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y) const override { - //TODO: call cusparse::csrmv ... and more + //TODO: call cusparse::csrmv ... and more: + // + // // y = A*x + // sparse_matrix_t::mv(alpha, x, 0, y); + // value_type dot_res; + // // gamma = d'*x + // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); + // // y = y -(gamma/edge_sum)*d + // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } + + value_type get_diag_nrm1(void) const { return diag_nrm1_; } + + value_type diag_nrm1_; }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index bd90f3093a..6b42f783c9 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,9 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -//#ifdef NVGRAPH_PARTITION -#include "include/modularity_maximization.hxx" +#pragma once #include #include @@ -26,14 +25,10 @@ #include #include -#include "include/debug_macros.h" -#include "include/kmeans.hxx" -#include "include/lanczos.hxx" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" -#include "include/spectral_matrix.hxx" +#include + +#include +#include //#define COLLECT_TIME_STATISTICS 1 //#undef COLLECT_TIME_STATISTICS @@ -47,8 +42,7 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ +static double timer(void) { struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -56,100 +50,10 @@ static double timer(void) } #endif -namespace nvgraph { - -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) +namespace raft { -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? 
obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) -{ - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); - - return cudaSuccess; -} +using namespace matrix; +using namespace linalg; // ========================================================= // Spectral modularity_maximization @@ -168,129 +72,63 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) * @param tol_lanczos Convergence tolerance for Lanczos method. * @param maxIter_kmeans Maximum number of k-means iterations. * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Cluster + * @param clusters (Output, device memory, n entries) Cluster * assignments. * @param iters_lanczos On exit, number of Lanczos iterations * performed. * @param iters_kmeans On exit, number of k-means iterations * performed. - * @return NVGRAPH error flag. + * @return error flag. 
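+ *
+ * A minimal usage sketch (illustrative values; the solver objects come from
+ * eigen_solvers.hpp / cluster_solvers.hpp, and handle, exec_policy, csr_view
+ * plus the device pointers are assumed to exist on the caller side):
+ * @code
+ *   eigen_solver_config_t<int, double> eig_cfg{8, 100, 64, 1e-4};
+ *   lanczos_solver_t<int, double> eig_solver{eig_cfg};
+ *   cluster_solver_config_t<int, double> clust_cfg{8, 100, 1e-2};
+ *   kmeans_solver_t<int, double> cluster_solver{clust_cfg};
+ *   auto stats = modularity_maximization(handle, exec_policy, csr_view,
+ *                                        eig_solver, cluster_solver,
+ *                                        d_clusters, d_eig_vals, d_eig_vecs);
+ *   // std::get<0>(stats): eigensolver iterations
+ *   // std::get<1>(stats): cluster solver residual
+ *   // std::get<2>(stats): cluster solver iterations
+ * @endcode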
 */
-template
-NVGRAPH_ERROR modularity_maximization(
- cugraph::experimental::GraphCSRView const &graph,
- vertex_t nClusters,
- vertex_t nEigVecs,
- int maxIter_lanczos,
- int restartIter_lanczos,
- weight_t tol_lanczos,
- int maxIter_kmeans,
- weight_t tol_kmeans,
- vertex_t *__restrict__ clusters,
- weight_t *eigVals,
- weight_t *eigVecs,
- int &iters_lanczos,
- int &iters_kmeans)
-{
- cudaStream_t stream = 0;
+template ,
+ typename ClusterSolver = kmeans_solver_t>
+std::tuple modularity_maximization(
+ handle_t handle, ThrustExePolicy thrust_exec_policy,
+ GraphCSRView const &graph,
+ EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
+ vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
 const weight_t zero{0.0}; const weight_t one{1.0};
- edge_t i;
- edge_t n = graph.number_of_vertices;
+ auto cublas_h = handle.get_cublas_handle();
+ auto stream = handle.get_stream();
+
+ std::tuple
+ stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
- // k-means residual
- weight_t residual_kmeans;
+ edge_t n = graph.number_of_vertices;
 // Compute eigenvectors of Modularity Matrix
+ // Initialize Modularity Matrix
- CsrMatrix A(false,
- false,
- graph.number_of_vertices,
- graph.number_of_vertices,
- graph.number_of_edges,
- 0,
- graph.edge_data,
- graph.offsets,
- graph.indices);
- ModularityMatrix B(A, graph.number_of_edges);
-
- // Compute smallest eigenvalues and eigenvectors
- CHECK_NVGRAPH(computeLargestEigenvectors(B,
- nEigVecs,
- maxIter_lanczos,
- restartIter_lanczos,
- tol_lanczos,
- false,
- iters_lanczos,
- eigVals,
- eigVecs));
-
- // eigVals.dump(0, nEigVecs);
- // eigVecs.dump(0, nEigVecs);
- // eigVecs.dump(n, nEigVecs);
- // eigVecs.dump(2*n, nEigVecs);
- // Whiten eigenvector matrix
- for (i = 0; i < nEigVecs; ++i) {
- weight_t mean, std;
- mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
- cudaCheckError();
- mean /= n;
- thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
- thrust::make_constant_iterator(mean),
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::minus());
- cudaCheckError();
- std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n));
- thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
- thrust::make_constant_iterator(std),
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::divides());
- cudaCheckError();
- }
+ sparse_matrix_t A{graph};
+ modularity_matrix_t B{handle, graph};
- // Transpose eigenvector matrix
- // TODO: in-place transpose
- {
- Vector work(nEigVecs * n, stream);
- Cublas::set_pointer_mode_host();
- Cublas::geam(true,
- false,
- nEigVecs,
- n,
- &one,
- eigVecs,
- n,
- &zero,
- (weight_t *)NULL,
- nEigVecs,
- work.raw(),
- nEigVecs);
- CHECK_CUDA(cudaMemcpyAsync(
- eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice));
- }
+ auto eigen_config = eigen_solver.get_config();
+ auto nEigVecs = eigen_config.n_eigVecs;
+
+ // Compute eigenvectors corresponding to largest eigenvalues
+ std::get<0>(stats) =
+ eigen_solver.solve_largest_eigenvectors(B, eigVals, eigVecs);
+
+ // Whiten eigenvector matrix
+ transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs);
- // WARNING: notice that at this point
the matrix has already been transposed, so we are scaling + // notice that at this point the matrix has already been transposed, so we are scaling // columns scale_obs(nEigVecs, n, eigVecs); - cudaCheckError(); - - // eigVecs.dump(0, nEigVecs*n); - // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nClusters, - tol_kmeans, - maxIter_kmeans, - eigVecs, - clusters, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; + CUDA_CHECK_LAST(); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; } //=================================================== // Analysis of graph partition @@ -307,9 +145,9 @@ struct equal_to_i_op { public: equal_to_i_op(IndexType_ _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; } }; } // namespace @@ -318,39 +156,33 @@ struct equal_to_i_op { /** This function determines the modularity based on a graph and cluster assignments * @param G Weighted graph in CSR format * @param nClusters Number of clusters. - * @param parts (Input, device memory, n entries) Cluster assignments. + * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ template -NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nClusters, - const vertex_t *__restrict__ parts, - weight_t &modularity) -{ - cudaStream_t stream = 0; +void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, + GraphCSRView const &graph, + vertex_t nClusters, + const vertex_t *__restrict__ clusters, + weight_t &modularity) { edge_t i; edge_t n = graph.number_of_vertices; - weight_t partModularity, partSize; + weight_t partModularity, clustersize; + + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); // Device memory - Vector part_i(n, stream); - Vector Bx(n, stream); + vector_t part_i(handle, n); + Vector Bx(handle, n); // Initialize cuBLAS - Cublas::set_pointer_mode_host(); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - ModularityMatrix B(A, graph.number_of_edges); + sparse_matrix_t A{graph}; + modularity_matrix_t B{handle, graph}; // Initialize output modularity = 0; @@ -358,25 +190,30 @@ NVGRAPH_ERROR analyzeModularity( // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - thrust::for_each( - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - cudaCheckError(); + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + 
thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(i)); + CUDA_CHECK_LAST(); // Compute size of ith partition - Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); - partSize = round(partSize); - if (partSize < 0.5) { + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { WARNING("empty partition"); continue; } // Compute modularity B.mv(1, part_i.raw(), 0, Bx.raw()); - Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, + &partModularity, stream)); // Record results modularity += partModularity; @@ -384,53 +221,7 @@ NVGRAPH_ERROR analyzeModularity( } // modularity = modularity/nClusters; // devide by nnz - modularity = modularity / B.getEdgeSum(); - // Clean up and return - - return NVGRAPH_OK; + modularity = modularity / B.get_diag_nrm1(); } -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - float tol_lanczos, - int maxIter_kmeans, - float tol_kmeans, - int *__restrict__ parts, - float *eigVals, - float *eigVecs, - int &iters_lanczos, - int &iters_kmeans); -template NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - double tol_lanczos, - int maxIter_kmeans, - double tol_kmeans, - int *__restrict__ parts, - double *eigVals, - double *eigVecs, - int &iters_lanczos, - int &iters_kmeans); -template NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - const int *__restrict__ parts, - float &modularity); -template NVGRAPH_ERROR analyzeModularity( - cugraph::experimental::GraphCSRView const &graph, - int nClusters, - const int *__restrict__ parts, - double &modularity); - -} // namespace nvgraph -//#endif //NVGRAPH_PARTITION +} // namespace raft diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index ea5bcdd1d8..00b11f7740 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -28,100 +28,12 @@ #include #include -#include namespace raft { using namespace matrix; using namespace linalg; -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_ *obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? 
obs[i + j * m] : 0.0;
- alpha = alpha * alpha;
-
- // do prefix sum (of size warpSize=blockDim.x =< 32)
- for (k = 1; k < blockDim.x; k *= 2) {
- v = utils::shfl_up(alpha, k, blockDim.x);
- if (threadIdx.x >= k) alpha += v;
- }
- // shift by last
- alpha += last;
- }
- }
-
- // scale by alpha
- alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x);
- alpha = std::sqrt(alpha);
- for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
- j += blockDim.y * gridDim.y) {
- for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32
- index = i + j * m;
- obs[index] = obs[index] / alpha;
- }
- }
-}
-
-template
-IndexType_ next_pow2(IndexType_ n) {
- IndexType_ v;
- // Reference:
- // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
- v = n - 1;
- v |= v >> 1;
- v |= v >> 2;
- v |= v >> 4;
- v |= v >> 8;
- v |= v >> 16;
- return v + 1;
-}
-
-template
-cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) {
- IndexType_ p2m;
- dim3 nthreads, nblocks;
-
- // find next power of 2
- p2m = next_pow2(m);
- // setup launch configuration
- nthreads.x = max(2, min(p2m, 32));
- nthreads.y = 256 / nthreads.x;
- nthreads.z = 1;
- nblocks.x = 1;
- nblocks.y = (n + nthreads.y - 1) / nthreads.y;
- nblocks.z = 1;
- // printf("m=%d(%d),n=%d,obs=%p,
- // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z);
-
- // launch scaling kernel (scale each column of obs by its norm)
- scale_obs_kernel<<>>(m, n, obs);
- CUDA_CHECK_LAST();
-
- return cudaSuccess;
-}
-
 // ========================================================= // Spectral partitioner // =========================================================
@@ -132,7 +44,7 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) {
 * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) * * @param G Weighted graph in CSR format
- * @param nParts Number of partitions.
+ * @param nClusters Number of partitions.
 * @param nEigVecs Number of eigenvectors to compute. * @param maxIter_lanczos Maximum number of Lanczos iterations. * @param restartIter_lanczos Maximum size of Lanczos system before * implicit restart. * @param tol_lanczos Convergence tolerance for Lanczos method. * @param maxIter_kmeans Maximum number of k-means iterations. * @param tol_kmeans Convergence tolerance for k-means algorithm.
- * @param parts (Output, device memory, n entries) Partition
+ * @param clusters (Output, device memory, n entries) Partition
 * assignments. * @param iters_lanczos On exit, number of Lanczos iterations * performed. * @param iters_kmeans On exit, number of k-means iterations * performed.
- * @return error flag.
+ * @return statistics: number of eigensolver iterations, cluster solver residual, and number of cluster solver iterations.
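+ *
+ * The returned tuple can be unpacked along these lines (illustrative
+ * sketch; the caller-side names are invented):
+ * @code
+ *   int iters_eig{};
+ *   double residual{};
+ *   int iters_cluster{};
+ *   std::tie(iters_eig, residual, iters_cluster) =
+ *     partition(handle, exec_policy, csr_view, eig_solver, cluster_solver,
+ *               d_clusters, d_eig_vals, d_eig_vecs);
+ * @endcode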
*/ template > std::tuple partition( handle_t handle, ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, + GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ parts, weight_t *eigVals, weight_t *eigVecs) { + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { const weight_t zero{0.0}; const weight_t one{1.0}; @@ -184,58 +96,14 @@ std::tuple partition( // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvector(L, eigVals, eigVecs); + eigen_solver.solve_smallest_eigenvectors(L, eigVals, eigVecs); // Whiten eigenvector matrix - for (i = 0; i < nEigVecs; ++i) { - weight_t mean, std; + transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CUDA_CHECK_LAST(); - mean /= n; - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - CUDA_CHECK_LAST(); - - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); - - std /= std::sqrt(static_cast(n)); - - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - CUDA_CHECK_LAST(); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t *)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); - } - - // Find partition with clustering - auto pair_cluster = cluster_solver.solve(handle, t_thrust_exec_policy, n, - nEigVecs, eigVecs, parts); + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -272,26 +140,25 @@ struct equal_to_i_op { * Graph is assumed to be weighted and undirected. * * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition * assignments. * @param edgeCut On exit, weight of edges cut by partition. * @param cost On exit, partition cost function. * @return error flag. 
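 *
 * A minimal call sketch (illustrative; d_clusters is a device array of
 * cluster assignments and weight_t = double is assumed):
 * @code
 *   double edge_cut{0};
 *   double cost{0};
 *   analyzePartition(handle, exec_policy, csr_view, n_clusters, d_clusters,
 *                    edge_cut, cost);
 * @endcode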
*/ template -int analyzePartition( - handle_t handle, ThrustExePolicy thrust_exec_policy, - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, const vertex_t *__restrict__ parts, weight_t &edgeCut, - weight_t &cost) { +void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, + GraphCSRView const &graph, + vertex_t nClusters, const vertex_t *__restrict__ clusters, + weight_t &edgeCut, weight_t &cost) { edge_t i; edge_t n = graph.number_of_vertices; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - weight_t partEdgesCut, partSize; + weight_t partEdgesCut, clustersize; // Device memory vector_t part_i(handle, n); @@ -310,24 +177,24 @@ int analyzePartition( edgeCut = 0; // Iterate through partitions - for (i = 0; i < nParts; ++i) { + for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(parts), + thrust::device_pointer_cast(clusters), thrust::device_pointer_cast(part_i.raw()))), thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(parts + n), + thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(i)); CUDA_CHECK_LAST(); // Compute size of ith partition CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &partSize, stream)); + &clustersize, stream)); - partSize = round(partSize); - if (partSize < 0.5) { + clustersize = round(clustersize); + if (clustersize < 0.5) { WARNING("empty partition"); continue; } @@ -338,12 +205,9 @@ int analyzePartition( &partEdgesCut, stream)); // Record results - cost += partEdgesCut / partSize; + cost += partEdgesCut / clustersize; edgeCut += partEdgesCut / 2; } - - // Clean up and return - return 0; } } // namespace raft From 531bf2bdd74115b837772512d3f12b6ef4fb5639 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 12 Jun 2020 21:07:41 -0500 Subject: [PATCH 29/88] More refactoring in partition/modularity analysis. 
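Both analyzePartition and analyzeModularity now obtain the per-cluster indicator vector through the shared construct_indicator helper; the common call pattern is (sketch, with B standing for the Laplacian or modularity matrix being analyzed):

    for (i = 0; i < nClusters; ++i) {
      if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
                               partStats, clusters, part_i, Bx, B)) {
        WARNING("empty partition");
        continue;
      }
      // fold partStats into the cost / modularity being accumulated
    }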
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 149 +----------- .../raft/spectral/modularity_maximization.hpp | 31 +-- cpp/include/raft/spectral/partition.hpp | 44 +--- cpp/include/raft/spectral/spectral_util.hpp | 230 ++++++++++++++++++ 4 files changed, 243 insertions(+), 211 deletions(-) create mode 100644 cpp/include/raft/spectral/spectral_util.hpp diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 68cd829949..779fbb9dc8 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include // ========================================================= // Useful macros @@ -28,153 +28,6 @@ #define IDX(i, j, lda) ((i) + (j) * (lda)) namespace raft { - -namespace { - -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_* obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) { - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - CUDA_CHECK_LAST(); - - return cudaSuccess; -} - -template -void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { - auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - // Whiten 
eigenvector matrix
- for (auto i = 0; i < nEigVecs; ++i) {
- weight_t mean, std;
-
- mean = thrust::reduce(
- thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
- CUDA_CHECK_LAST();
- mean /= n;
- thrust::transform(thrust_exec_policy,
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
- thrust::make_constant_iterator(mean),
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::minus());
- CUDA_CHECK_LAST();
-
- CUBLAS_CHECK(
- cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
-
- std /= std::sqrt(static_cast(n));
-
- thrust::transform(thrust_exec_policy,
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
- thrust::make_constant_iterator(std),
- thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
- thrust::divides());
- CUDA_CHECK_LAST();
- }
-
- // Transpose eigenvector matrix
- // TODO: in-place transpose
- {
- vector_t work(handle, nEigVecs * n);
- CUBLAS_CHECK(
- cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
- CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
- &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
- work.raw(), nEigVecs, stream));
-
- CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
- nEigVecs * n * sizeof(weight_t),
- cudaMemcpyDeviceToDevice, stream));
- }
-}
-
-} // namespace
-
 namespace matrix { using size_type = int; // for now; TODO: move it in appropriate header
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index bd90f3093a..6b42f783c9 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -29,6 +29,7 @@
 #include #include
+#include
 //#define COLLECT_TIME_STATISTICS 1 //#undef COLLECT_TIME_STATISTICS
@@ -51,6 +52,7 @@ static double timer(void) { #endif
 namespace raft {
+namespace spectral {
 using namespace matrix; using namespace linalg;
@@ -163,7 +165,7 @@ template
 void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters,
- const vertex_t *__restrict__ clusters,
+ vertex_t const *__restrict__ clusters,
 weight_t &modularity) { edge_t i; edge_t n = graph.number_of_vertices;
@@ -174,7 +176,7 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy,
 // Device memory vector_t part_i(handle, n);
- Vector Bx(handle, n);
+ vector_t Bx(handle, n);
 // Initialize cuBLAS CUBLAS_CHECK(
@@ -189,32 +191,12 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy,
 // Iterate through partitions for (i = 0; i < nClusters; ++i) {
- // Construct indicator vector for ith partition
- thrust::for_each(thrust_exec_policy,
- thrust::make_zip_iterator(thrust::make_tuple(
- thrust::device_pointer_cast(clusters),
- thrust::device_pointer_cast(part_i.raw()))),
- thrust::make_zip_iterator(thrust::make_tuple(
- thrust::device_pointer_cast(clusters + n),
- thrust::device_pointer_cast(part_i.raw() + n))),
- equal_to_i_op(i));
- CUDA_CHECK_LAST();
-
- // Compute size of ith partition
- CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1,
- &clustersize, stream));
-
- clustersize = round(clustersize);
- if (clustersize < 0.5) {
+ if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
+ partModularity, clusters, part_i, Bx, B)) {
WARNING("empty partition"); continue; } - // Compute modularity - B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, - &partModularity, stream)); - // Record results modularity += partModularity; // std::cout<< "partModularity " < #include +#include namespace raft { +namespace spectral { using namespace matrix; using namespace linalg; @@ -115,24 +117,6 @@ std::tuple partition( // Analysis of graph partition // ========================================================= -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; - } -}; -} // namespace - /// Compute cost function for partition /** This function determines the edges cut by a partition and a cost * function: @@ -179,35 +163,17 @@ void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - CUDA_CHECK_LAST(); - - // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); - - clustersize = round(clustersize); - if (clustersize < 0.5) { + if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } - // Compute number of edges cut by ith partition - L.mv(1, part_i.raw(), 0, Lx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Lx.raw(), 1, part_i.raw(), 1, - &partEdgesCut, stream)); - // Record results cost += partEdgesCut / clustersize; edgeCut += partEdgesCut / 2; } } +} // namespace spectral } // namespace raft diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp new file mode 100644 index 0000000000..a0c10284e3 --- /dev/null +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace raft { +namespace spectral { + +template +static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, + ValueType_* obs) { + IndexType_ i, j, k, index, mm; + ValueType_ alpha, v, last; + bool valid; + // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension + + // compute alpha + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x + alpha = 0.0; + // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, + // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < mm; i += blockDim.x) { + // check if the thread is valid + valid = i < m; + + // get the value of the last thread + last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + + // if you are valid read the value from memory, otherwise set your value to 0 + alpha = (valid) ? obs[i + j * m] : 0.0; + alpha = alpha * alpha; + + // do prefix sum (of size warpSize=blockDim.x =< 32) + for (k = 1; k < blockDim.x; k *= 2) { + v = utils::shfl_up(alpha, k, blockDim.x); + if (threadIdx.x >= k) alpha += v; + } + // shift by last + alpha += last; + } + } + + // scale by alpha + alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); + alpha = std::sqrt(alpha); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { + for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 + index = i + j * m; + obs[index] = obs[index] / alpha; + } + } +} + +template +IndexType_ next_pow2(IndexType_ n) { + IndexType_ v; + // Reference: + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float + v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +template +cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { + IndexType_ p2m; + dim3 nthreads, nblocks; + + // find next power of 2 + p2m = next_pow2(m); + // setup launch configuration + nthreads.x = max(2, min(p2m, 32)); + nthreads.y = 256 / nthreads.x; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = (n + nthreads.y - 1) / nthreads.y; + nblocks.z = 1; + // printf("m=%d(%d),n=%d,obs=%p, + // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + + // launch scaling kernel (scale each column of obs by its norm) + scale_obs_kernel<<>>(m, n, obs); + CUDA_CHECK_LAST(); + + return cudaSuccess; +} + +template +void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, + edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { + auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + CUDA_CHECK_LAST(); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + CUDA_CHECK_LAST(); + + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + 
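+    // At this point each column of eigVecs has had its mean removed; the
+    // divide below scales by std = ||column||_2 / sqrt(n), leaving every
+    // column with unit sample variance (column-wise whitening).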
thrust::transform(thrust_exec_policy,
+ thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+ thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)),
+ thrust::make_constant_iterator(std),
+ thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+ thrust::divides());
+ CUDA_CHECK_LAST();
+ }
+
+ // Transpose eigenvector matrix
+ // TODO: in-place transpose
+ {
+ vector_t work(handle, nEigVecs * n);
+ const weight_t zero{0.0};
+ const weight_t one{1.0};
+ CUBLAS_CHECK(
+ cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+ CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
+ &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
+ work.raw(), nEigVecs, stream));
+
+ CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
+ nEigVecs * n * sizeof(weight_t),
+ cudaMemcpyDeviceToDevice, stream));
+ }
+}
+
+namespace {
+/// Functor to generate indicator vectors
+/** For use in Thrust transform
+ */
+template
+struct equal_to_i_op {
+ const IndexType_ i;
+
+ public:
+ equal_to_i_op(IndexType_ _i) : i(_i) {}
+ template
+ __host__ __device__ void operator()(Tuple_ t) {
+ thrust::get<1>(t) =
+ (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0;
+ }
+};
+} // namespace
+
+// Construct indicator vector for ith partition
+//
+template
+bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy,
+ vertex_t index, edge_t n, weight_t& clustersize,
+ weight_t& partStats,
+ vertex_t const* __restrict__ clusters,
+ vector_t& part_i, vector_t& Bx,
+ laplacian_matrix_t const& B) {
+ auto cublas_h = handle.get_cublas_handle();
+ auto stream = handle.get_stream();
+
+ thrust::for_each(thrust_exec_policy,
+ thrust::make_zip_iterator(thrust::make_tuple(
+ thrust::device_pointer_cast(clusters),
+ thrust::device_pointer_cast(part_i.raw()))),
+ thrust::make_zip_iterator(thrust::make_tuple(
+ thrust::device_pointer_cast(clusters + n),
+ thrust::device_pointer_cast(part_i.raw() + n))),
+ equal_to_i_op(index));
+ CUDA_CHECK_LAST();
+
+ // Compute size of ith partition
+ CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1,
+ &clustersize, stream));
+
+ clustersize = round(clustersize);
+ if (clustersize < 0.5) {
+ return false;
+ }
+
+ // Compute part stats
+ B.mv(1, part_i.raw(), 0, Bx.raw());
+ CUBLAS_CHECK(
+ cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+
+ return true;
+}
+
+} // namespace spectral
+} // namespace raft
From 4fa1c76b3087c6f370e29f94302c4b82ee715182 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 11:30:58 -0500 Subject: [PATCH 30/88] Removed MPI dependency (for now).
--- cpp/include/raft/graph.hpp | 197 ++++++++++++++++--------------- 1 file changed, 86 insertions(+), 111 deletions(-)
diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp index 8e72572764..089decc8ee 100644 --- a/cpp/include/raft/graph.hpp +++ b/cpp/include/raft/graph.hpp @@ -14,7 +14,7 @@
 * limitations under the License.
*/ #pragma once -#include +/// #include // TODO: clarify what must be done about `comm` #include #include #include @@ -55,7 +55,7 @@ template class GraphViewBase { public: WT *edge_data; ///< edge weight - Comm comm; + /// Comm comm; // TODO: clarify what must be done about `comm` GraphProperties prop; @@ -69,16 +69,14 @@ class GraphViewBase { * identifiers */ void get_vertex_identifiers(VT *identifiers) const; - void set_communicator(Comm &comm_) { comm = comm_; } + /// void set_communicator(Comm &comm_) { comm = comm_; } // TODO: see above GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) : edge_data(edge_data_), - comm(), + /// comm(), // TODO: see above prop(), number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) - { - } + number_of_edges(number_of_edges_) {} bool has_data(void) const { return edge_data != nullptr; } }; @@ -126,13 +124,12 @@ class GraphCOOView : public GraphViewBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOOView( - VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + GraphCOOView(VT *src_indices_, VT *dst_indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, + number_of_edges_), src_indices(src_indices_), - dst_indices(dst_indices_) - { - } + dst_indices(dst_indices_) {} }; /** @@ -187,13 +184,12 @@ class GraphCompressedSparseBaseView : public GraphViewBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBaseView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), + GraphCompressedSparseBaseView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) + : GraphViewBase(edge_data_, number_of_vertices_, + number_of_edges_), offsets{offsets_}, - indices{indices_} - { - } + indices{indices_} {} }; /** @@ -209,7 +205,9 @@ class GraphCSRView : public GraphCompressedSparseBaseView { /** * @brief Default constructor */ - GraphCSRView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSRView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, + 0) {} /** * @brief Wrap existing arrays representing adjacency lists in a Graph. 
@@ -226,11 +224,10 @@ class GraphCSRView : public GraphCompressedSparseBaseView { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSRView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + GraphCSRView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - { + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { } }; @@ -247,7 +244,9 @@ class GraphCSCView : public GraphCompressedSparseBaseView { /** * @brief Default constructor */ - GraphCSCView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSCView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, + 0) {} /** * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. @@ -264,11 +263,10 @@ class GraphCSCView : public GraphCompressedSparseBaseView { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSCView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) + GraphCSCView(ET *offsets_, VT *indices_, WT *edge_data_, + VT number_of_vertices_, ET number_of_edges_) : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) - { + offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { } }; @@ -323,30 +321,28 @@ class GraphCOO { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOO(VT number_of_vertices, - ET number_of_edges, - bool has_data = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCOO( + VT number_of_vertices, ET number_of_edges, bool has_data = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), src_indices_(sizeof(VT) * number_of_edges, stream, mr), dst_indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) - { - } + edge_data_((has_data ? 
sizeof(WT) * number_of_edges : 0), stream, mr) {} - GraphCOO(GraphCOOView const &graph, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCOO( + GraphCOOView const &graph, cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : number_of_vertices_(graph.number_of_vertices), number_of_edges_(graph.number_of_edges), - src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), stream, mr), - dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), stream, mr) - { + src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), + stream, mr), + dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), + stream, mr) { if (graph.has_data()) { - edge_data_ = - rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; + edge_data_ = rmm::device_buffer{ + graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; } } @@ -356,24 +352,21 @@ class GraphCOO { VT *dst_indices(void) { return static_cast(dst_indices_.data()); } WT *edge_data(void) { return static_cast(edge_data_.data()); } - GraphCOOContents release() noexcept - { + GraphCOOContents release() noexcept { VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; return GraphCOOContents{ - number_of_vertices, - number_of_edges, + number_of_vertices, number_of_edges, std::make_unique(std::move(src_indices_)), std::make_unique(std::move(dst_indices_)), std::make_unique(std::move(edge_data_))}; } - GraphCOOView view(void) noexcept - { - return GraphCOOView( - src_indices(), dst_indices(), edge_data(), number_of_vertices_, number_of_edges_); + GraphCOOView view(void) noexcept { + return GraphCOOView(src_indices(), dst_indices(), edge_data(), + number_of_vertices_, number_of_edges_); } bool has_data(void) { return nullptr != edge_data_.data(); } @@ -420,27 +413,21 @@ class GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBase(VT number_of_vertices, - ET number_of_edges, - bool has_data, - cudaStream_t stream, + GraphCompressedSparseBase(VT number_of_vertices, ET number_of_edges, + bool has_data, cudaStream_t stream, rmm::mr::device_memory_resource *mr) : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) - { - } + edge_data_((has_data ? 
sizeof(WT) * number_of_edges : 0), stream, mr) {} GraphCompressedSparseBase(GraphSparseContents &&contents) : number_of_vertices_(contents.number_of_vertices), number_of_edges_(contents.number_of_edges), offsets_(std::move(*contents.offsets.release())), indices_(std::move(*contents.indices.release())), - edge_data_(std::move(*contents.edge_data.release())) - { - } + edge_data_(std::move(*contents.edge_data.release())) {} VT number_of_vertices(void) { return number_of_vertices_; } ET number_of_edges(void) { return number_of_edges_; } @@ -448,15 +435,13 @@ class GraphCompressedSparseBase { VT *indices(void) { return static_cast(indices_.data()); } WT *edge_data(void) { return static_cast(edge_data_.data()); } - GraphSparseContents release() noexcept - { + GraphSparseContents release() noexcept { VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; + ET number_of_edges = number_of_edges_; + number_of_vertices_ = 0; + number_of_edges_ = 0; return GraphSparseContents{ - number_of_vertices, - number_of_edges, + number_of_vertices, number_of_edges, std::make_unique(std::move(offsets_)), std::make_unique(std::move(indices_)), std::make_unique(std::move(edge_data_))}; @@ -493,28 +478,23 @@ class GraphCSR : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSR(VT number_of_vertices_, - ET number_of_edges_, - bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCSR( + VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) - { - } + number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} GraphCSR(GraphSparseContents &&contents) - : GraphCompressedSparseBase(std::move(contents)) - { - } - - GraphCSRView view(void) noexcept - { - return GraphCSRView(GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); + : GraphCompressedSparseBase(std::move(contents)) {} + + GraphCSRView view(void) noexcept { + return GraphCSRView( + GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); } }; @@ -546,28 +526,23 @@ class GraphCSC : public GraphCompressedSparseBase { * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSC(VT number_of_vertices_, - ET number_of_edges_, - bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) + GraphCSC( + VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, + cudaStream_t stream = nullptr, + rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) - { - } + number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} GraphCSC(GraphSparseContents &&contents) - : 
GraphCompressedSparseBase<VT, ET, WT>(contents)
-  {
-  }
-
-  GraphCSCView<VT, ET, WT> view(void) noexcept
-  {
-    return GraphCSCView<VT, ET, WT>(GraphCompressedSparseBase<VT, ET, WT>::offsets(),
-                                    GraphCompressedSparseBase<VT, ET, WT>::indices(),
-                                    GraphCompressedSparseBase<VT, ET, WT>::edge_data(),
-                                    GraphCompressedSparseBase<VT, ET, WT>::number_of_vertices(),
-                                    GraphCompressedSparseBase<VT, ET, WT>::number_of_edges());
+    : GraphCompressedSparseBase<VT, ET, WT>(contents) {}
+
+  GraphCSCView<VT, ET, WT> view(void) noexcept {
+    return GraphCSCView<VT, ET, WT>(
+      GraphCompressedSparseBase<VT, ET, WT>::offsets(),
+      GraphCompressedSparseBase<VT, ET, WT>::indices(),
+      GraphCompressedSparseBase<VT, ET, WT>::edge_data(),
+      GraphCompressedSparseBase<VT, ET, WT>::number_of_vertices(),
+      GraphCompressedSparseBase<VT, ET, WT>::number_of_edges());
   }
 };

From 10496dedbe4839026a25c8f2f476d15add79aa0d Mon Sep 17 00:00:00 2001
From: Andrei Schaffer 
Date: Mon, 15 Jun 2020 11:42:44 -0500
Subject: [PATCH 31/88] Fixed sparse matrix constructor.

---
 cpp/include/raft/spectral/matrix_wrappers.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 779fbb9dc8..0f3a7d1e4c 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -97,11 +97,11 @@ struct sparse_matrix_t {

   sparse_matrix_t(
     GraphCSRView<index_type, index_type, value_type> const& csr_view)
-    : row_offsets_(csr_view.offsets_),
-      col_indices_(csr_view.indices_),
-      values_(csr_view.edge_data_),
-      nrows_(csr_view.number_of_vertices_),
-      nnz_(csr_view.number_of_edges_) {}
+    : row_offsets_(csr_view.offsets),
+      col_indices_(csr_view.indices),
+      values_(csr_view.edge_data),
+      nrows_(csr_view.number_of_vertices),
+      nnz_(csr_view.number_of_edges) {}

   virtual ~sparse_matrix_t(void) = default;  // virtual because used as base for following matrix types

From 38e5ca9b53a01df67c97ba347b87dfdbdbc27ccb Mon Sep 17 00:00:00 2001
From: Andrei Schaffer 
Date: Mon, 15 Jun 2020 11:52:41 -0500
Subject: [PATCH 32/88] Added test for spectral matrix functionality.
 Compilation checker for now.
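For reference, a minimal host-side sketch of how the pointer-based
constructor is intended to be called once the (nrows, nnz) argument order
below lands; the triangle-graph CSR arrays are made up for illustration,
and real callers pass device pointers:

    #include <raft/spectral/matrix_wrappers.hpp>

    // Undirected triangle graph in CSR form: 3 vertices, 6 directed edges.
    int row_offsets[] = {0, 2, 4, 6};        // nrows + 1 entries
    int col_indices[] = {1, 2, 0, 2, 0, 1};  // nnz entries
    double values[]   = {1., 1., 1., 1., 1., 1.};

    // Argument order is (row_offsets, col_indices, values, nrows, nnz).
    raft::matrix::sparse_matrix_t<int, double> A{row_offsets, col_indices,
                                                 values, 3, 6};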
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 6 +- cpp/test/spectral_matrix.cpp | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 cpp/test/spectral_matrix.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 947d0318cb..de7f3e0e34 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,7 +205,8 @@ if(BUILD_RAFT_TESTS) test/handle.cpp test/mr/device/buffer.cpp test/mr/host/buffer.cpp - test/test.cpp) + test/test.cpp + test/spectral_matrix.cpp) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 0f3a7d1e4c..74dbd38be6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -87,8 +87,8 @@ class vector_t { template struct sparse_matrix_t { sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, - value_type const* values, index_type const nnz, - index_type const nrows) + value_type const* values, index_type const nrows, + index_type const nnz) : row_offsets_(row_offsets), col_indices_(col_indices), values_(values), @@ -146,7 +146,7 @@ struct laplacian_matrix_t : sparse_matrix_t { handle_t const& raft_handle, GraphCSRView const& csr_view) : sparse_matrix_t(csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices_) { + diagonal_(raft_handle, csr_view.number_of_vertices) { //TODO: more work, here: // // vector_t ones(csr_view.number_of_vertices_); diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp new file mode 100644 index 0000000000..24fd31875e --- /dev/null +++ b/cpp/test/spectral_matrix.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, SpectralMatrices) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + // ASSERT_EQ(nullptr, h.get_stream()); + // ASSERT_NE(nullptr, h.get_cublas_handle()); + // ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); + // ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); + // ASSERT_NE(nullptr, h.get_cusparse_handle()); + + int const sz = 10; + vector_t d_v{h, sz}; + + GraphCSRView empty_graph; + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{empty_graph}; + + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; + laplacian_matrix_t lm2{h, empty_graph}; + + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; + modularity_matrix_t mm2{h, empty_graph}; +} + +} // namespace raft From 86dc155bb1c694e557afb211c31f3849cbc2e246 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 12:08:17 -0500 Subject: [PATCH 33/88] Heart-beat tests for spectral matrices. --- cpp/test/spectral_matrix.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp index 24fd31875e..46d753550f 100644 --- a/cpp/test/spectral_matrix.cpp +++ b/cpp/test/spectral_matrix.cpp @@ -31,11 +31,6 @@ TEST(Raft, SpectralMatrices) { handle_t h; ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); - // ASSERT_EQ(nullptr, h.get_stream()); - // ASSERT_NE(nullptr, h.get_cublas_handle()); - // ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); - // ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); - // ASSERT_NE(nullptr, h.get_cusparse_handle()); int const sz = 10; vector_t d_v{h, sz}; @@ -49,12 +44,18 @@ TEST(Raft, SpectralMatrices) { index_type nrows = 0; sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{empty_graph}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + ASSERT_EQ(nullptr, sm2.row_offsets_); laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; laplacian_matrix_t lm2{h, empty_graph}; + ASSERT_EQ(nullptr, lm1.diagonal_.raw()); + ASSERT_EQ(nullptr, lm2.diagonal_.raw()); modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; modularity_matrix_t mm2{h, empty_graph}; + ASSERT_EQ(nullptr, mm1.diagonal_.raw()); + ASSERT_EQ(nullptr, mm2.diagonal_.raw()); } } // namespace raft From 554554397d44a91fc50ef398e1ccf3f00123d01f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 16:04:55 -0500 Subject: [PATCH 34/88] Fixed lapack dependencies on dense cusolver. 
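A quick host-side sketch of the tridiagonal path these wrappers expose to
the Lanczos solver; the 3x3 values are made up, and the 'I' mode plus the
max(1, 2*n - 2) workspace sizing follow standard LAPACK xSTEQR conventions:

    #include <raft/spectral/lapack.hpp>

    // 3x3 symmetric tridiagonal matrix: d = diagonal, e = sub/super-diagonal.
    double d[] = {2.0, 2.0, 2.0};
    double e[] = {-1.0, -1.0};

    // Eigenvalues only: sterf overwrites d with ascending eigenvalues.
    raft::Lapack<double>::sterf(3, d, e);

    // Eigenpairs: compz = 'I' initializes z to identity and accumulates the
    // eigenvectors of the tridiagonal matrix into it.
    double d2[] = {2.0, 2.0, 2.0}, e2[] = {-1.0, -1.0};
    double z[9], work[4];  // work: max(1, 2*n - 2) entries when compz != 'N'
    raft::Lapack<double>::steqr('I', 3, d2, e2, z, 3 /*ldz*/, work);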
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/lanczos.hpp | 34 +- cpp/include/raft/spectral/lapack.hpp | 538 +++++++++----------------- cpp/test/spectral_solvers.cpp | 61 +++ 4 files changed, 272 insertions(+), 364 deletions(-) create mode 100644 cpp/test/spectral_solvers.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index de7f3e0e34..879b214e62 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -206,7 +206,8 @@ if(BUILD_RAFT_TESTS) test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/test.cpp - test/spectral_matrix.cpp) + test/spectral_matrix.cpp + test/spectral_solvers.cpp) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index c4ab61b78e..e9682f5c28 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace raft { @@ -69,7 +70,7 @@ namespace { */ template int performLanczosIteration( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, bool reorthogonalize, ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, @@ -82,13 +83,14 @@ int performLanczosIteration( const ValueType_ one = 1; const ValueType_ negOne = -1; const ValueType_ zero = 0; + ValueType_ alpha; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); RAFT_EXPECT(A != nullptr, "Null matrix pointer."); - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // ------------------------------------------------------- // Compute second Lanczos vector @@ -108,7 +110,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); - auto alpha = -alpha_host[0]; + alpha = -alpha_host[0]; CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, @@ -443,7 +445,7 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, */ template static int lanczosRestart( - handle_t handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, + handle_t const &handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, ValueType_ *shiftUpper, ValueType_ *shiftLower, ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, @@ -500,16 +502,16 @@ static int lanczosRestart( *shiftUpper = ritzVals_host[iter - 1]; *shiftLower = ritzVals_host[iter_new]; } else { - *shiftUpper = max(*shiftUpper, ritzVals_host[iter - 1]); - *shiftLower = min(*shiftLower, ritzVals_host[iter_new]); + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[iter_new]); } } else { if (*shiftLower > *shiftUpper) { *shiftUpper = ritzVals_host[iter - iter_new - 1]; *shiftLower = ritzVals_host[0]; } else { - *shiftUpper = max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); - *shiftLower = min(*shiftLower, ritzVals_host[0]); + *shiftUpper = std::max(*shiftUpper, ritzVals_host[iter - iter_new - 1]); + *shiftLower = std::min(*shiftLower, ritzVals_host[0]); } } @@ -617,7 +619,7 @@ static int lanczosRestart( */ template int computeSmallestEigenvectors( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ nEigVecs, 
IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, @@ -633,7 +635,7 @@ int computeSmallestEigenvectors( const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // Shift for implicit restart ValueType_ shiftUpper; @@ -851,13 +853,13 @@ int computeSmallestEigenvectors( */ template int computeSmallestEigenvectors( - handle_t handle, sparse_matrix_t const &A, + handle_t const &handle, sparse_matrix_t const &A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ &iter, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { // Matrix dimension - IndexType_ n = A.nrows; + IndexType_ n = A.nrows_; // Check that parameters are valid RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); @@ -936,7 +938,7 @@ int computeSmallestEigenvectors( */ template int computeLargestEigenvectors( - handle_t handle, sparse_matrix_t const *A, + handle_t const &handle, sparse_matrix_t const *A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, @@ -952,7 +954,7 @@ int computeLargestEigenvectors( const ValueType_ zero = 0; // Matrix dimension - IndexType_ n = A->nrows; + IndexType_ n = A->nrows_; // Lanczos iteration counters IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system @@ -1170,7 +1172,7 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors(handle_t handle, +int computeLargestEigenvectors(handle_t const &handle, sparse_matrix_t const &A, IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, ValueType_ tol, @@ -1179,7 +1181,7 @@ int computeLargestEigenvectors(handle_t handle, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { // Matrix dimension - IndexType_ n = A.nrows; + IndexType_ n = A.nrows_; // Check that parameters are valid RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d86343990d..0dab3d57b2 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -13,12 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - + #pragma once +#include -#include -#include #include +#include +#include //for now; TODO: check if/where this `define` should be; // @@ -26,388 +27,249 @@ namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status << " had an illegal value."; \ - RAFT_FAIL(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status \ + << " had an illegal value."; \ + RAFT_FAIL(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } - -extern "C" void sgeqrf_( - int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); -extern "C" void dgeqrf_( - int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - float *a, - int *lda, - const float *tau, - float *c, - int *ldc, - float *work, - int *lwork, - int *info); -extern "C" void dormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - double *a, - int *lda, - const double *tau, - double *c, - int *ldc, - double *work, - int *lwork, - int *info); -extern "C" int dgeev_(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - int *ldvr, - double *work, - int *lwork, - int *info); - -extern "C" int sgeev_(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int *lwork, - int *info); - +extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, + float *work, int *lwork, int *info); +extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, + double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, + float *a, int *lda, const float *tau, float *c, + int *ldc, float *work, int *lwork, int *info); +extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, + double *a, int *lda, const double *tau, double *c, + int *ldc, double *work, int *lwork, int *info); +extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, double *vr, + int *ldvr, double *work, int *lwork, int *info); + +extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, const float *B, int ldb, + const float *beta, float *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, const double *B, int ldb, + const double *beta, double *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, + int n, float *d, float *e, + float *z, int ldz, float *work, + int *info); 
+ +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, + int n, double *d, double *e, + double *z, int ldz, + double *work, int *info); template -class Lapack -{ -private: - Lapack(); - ~Lapack(); -public: - static void check_lapack_enabled(); - - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); - - // special QR for lanczos - static void sterf(int n, T * d, T * e); - static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); - - // QR - // computes the QR factorization of a general matrix - static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); - // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. - //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); - // multiply C by implicit Q - static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void qrf (int n, T *H, T *Q, T *R); - - //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); - static void geev(T* A, T* eigenvalues, int dim, int lda); - static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); - static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr); - -private: - static void lapack_gemm(const char transa, - const char transb, - int m, - int n, - int k, - float alpha, - const float *a, - int lda, - const float *b, - int ldb, - float beta, - float *c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost( - cublas_transa, cublas_transb, m, n, k, &alpha, (float *)a, lda, (float *)b, ldb, &beta, c, ldc); +class Lapack { + private: + Lapack(); + ~Lapack(); + + public: + static void check_lapack_enabled(); + + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc); + + // special QR for lanczos + static void sterf(int n, T *d, T *e); + static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + + // QR + // computes the QR factorization of a general matrix + static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. 
+ //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q + static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); + //static void qrf (int n, T *H, T *Q, T *R); + + //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); + static void geev(T *A, T *eigenvalues, int dim, int lda); + static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr); + static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, + T *eigenvectors_i, int dim, int lda, int ldvr); + + private: + static void lapack_gemm(const char transa, const char transb, int m, int n, + int k, float alpha, const float *a, int lda, + const float *b, int ldb, float beta, float *c, + int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (float *)a, lda, (float *)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, - const signed char transb, - int m, - int n, - int k, - double alpha, - const double *a, - int lda, - const double *b, - int ldb, - double beta, - double *c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, - cublas_transb, - m, - n, - k, - &alpha, - (double *)a, - lda, - (double *)b, - ldb, - &beta, - c, - ldc); + static void lapack_gemm(const signed char transa, const signed char transb, + int m, int n, int k, double alpha, const double *a, + int lda, const double *b, int ldb, double beta, + double *c, int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? 
CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (double *)a, lda, (double *)b, ldb, &beta, c, ldc); } - - static void lapack_sterf(int n, float *d, float *e, int *info) - { + static void lapack_sterf(int n, float *d, float *e, int *info) { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) - { + static void lapack_sterf(int n, double *d, double *e, int *info) { cusolverDnDsterfHost(n, d, e, info); } - static void void lapack_steqr(const signed char compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info) - { + static void lapack_steqr(const signed char compz, int n, float *d, float *e, + float *z, int ldz, float *work, int *info) { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, double *z, int ldz, double *work, int *info) - { + static void lapack_steqr(const signed char compz, int n, double *d, double *e, + double *z, int ldz, double *work, int *info) { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info) - { + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, + float *work, int *lwork, int *info) { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info) - { + + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, + double *work, int *lwork, int *info) { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - float *a, - int lda, - float *tau, - float *c, - int ldc, - float *work, - int *lwork, - int *info) - { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, + int lda, float *tau, float *c, int ldc, float *work, + int *lwork, int *info) { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - double *a, - int lda, - double *tau, - double *c, - int ldc, - double *work, - int *lwork, - int *info) - { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + + static void lapack_ormqr(char side, char trans, int m, int n, int k, + double *a, int lda, double *tau, double *c, int ldc, + double *work, int *lwork, int *info) { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - static int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - int *ldvr, - double *work, - int *lwork, - int *info) - { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, + double *work, int *lwork, int *info) { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } - static int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int 
*lwork, - int *info) - { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } // real eigenvalues - static - void lapack_geev(T *A, T *eigenvalues, int dim, int lda) - { + static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T *vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, - &job, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldv, - vl, - &ldv, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, + &ldv, vl, &ldv, work.data(), &work_size, &info); lapackCheckError(info); } - + // real eigenpairs - static - void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) - { + static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, + int lda, int ldvr) { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T *vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldvl, - eigenvectors, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), + vl, &ldvl, eigenvectors, &ldvr, work.data(), + &work_size, &info); lapackCheckError(info); } - + // complex eigenpairs - static - void lapack_geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) - { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues_r, - eigenvalues_i, - 0, - &ldvl, - eigenvectors_r, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, + eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, + work.data(), &work_size, &info); lapackCheckError(info); } - }; template -void Lapack::check_lapack_enabled() -{ +void Lapack::check_lapack_enabled() { #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) -{ +void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 
'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) -{ +void Lapack::sterf(int n, T *d, T *e) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -417,8 +279,7 @@ void Lapack::sterf(int n, T *d, T *e) } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) -{ +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -428,8 +289,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) -{ +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, + int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -438,22 +299,11 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) #endif } template -void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T *a, - int lda, - T *tau, - T *c, - int ldc, - T *work, - int *lwork) -{ +void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -463,8 +313,7 @@ void Lapack::ormqr(bool right_side, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) -{ +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -472,8 +321,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -481,19 +330,14 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, in } // complex eigenpairs template -void Lapack::geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, + dim, lda, ldvr); #endif } - -} // namespace raft + +} // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp new file mode 100644 index 0000000000..c1bc9738ae --- /dev/null +++ b/cpp/test/spectral_solvers.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +//#include +#include + +namespace raft { + +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type* ro{nullptr}; + index_type* ci{nullptr}; + value_type* vs{nullptr}; + index_type nnz = 0; + index_type nrows = 0; + sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, sm1.row_offsets_); + + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; + ASSERT_EQ(nullptr, lm1.diagonal_.raw()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + index_type iter; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, + reorthog, iter, eigvals, eigvecs, seed); +} + +} // namespace raft From ab4e8a66083b6c4c59d406f5a4e3062a321bcef8 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 17:18:00 -0500 Subject: [PATCH 35/88] Added largest eigenvector heart-beat test. --- cpp/test/spectral_solvers.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index c1bc9738ae..068d61e9c1 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -56,6 +56,9 @@ TEST(Raft, SpectralSolvers) { unsigned long long seed{100110021003}; computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, reorthog, iter, eigvals, eigvecs, seed); + + computeLargestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, + reorthog, iter, eigvals, eigvecs, seed); } } // namespace raft From 058aae701cbd0381f95834e91d07669e08534a91 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 15 Jun 2020 18:49:55 -0500 Subject: [PATCH 36/88] Heart-beat for kmeans. 
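For context, a fuller sketch of the public k-means entry point that the
heart-beat test below pokes with null pointers; sizes and seed are
illustrative, and obs is the d x n column-major device matrix whose columns
are the observation vectors:

    raft::handle_t handle;
    auto stream = handle.get_stream();

    int n = 1000, d = 8, k = 4;  // observations, dimension, clusters
    double* obs{};               // d x n observation matrix (device)
    int* codes{};                // cluster assignment per observation (device)
    cudaMalloc(&obs, sizeof(double) * d * n);
    cudaMalloc(&codes, sizeof(int) * n);
    // ... fill obs with device data ...

    double residual{};
    int iters{};
    int status =
      raft::kmeans(handle, thrust::cuda::par.on(stream), n, d, k,
                   1.0e-6 /*tol*/, 100 /*maxiter*/, obs, codes, residual,
                   iters, 123456ULL /*seed*/);
    // status == 0 on success; residual is the final sum of squared
    // observation-to-centroid distances.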
--- cpp/CMakeLists.txt | 3 +- cpp/include/raft/spectral/kmeans.hpp | 59 ++++---- cpp/include/raft/spectral/sm_utils.hpp | 180 +++++++++++++++++++++---- cpp/test/spectral_solvers.cpp | 1 - cpp/test/spectral_solvers.cu | 54 ++++++++ 5 files changed, 238 insertions(+), 59 deletions(-) create mode 100644 cpp/test/spectral_solvers.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 879b214e62..d96f936a3d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,7 +207,8 @@ if(BUILD_RAFT_TESTS) test/mr/host/buffer.cpp test/test.cpp test/spectral_matrix.cpp - test/spectral_solvers.cpp) + test/spectral_solvers.cpp + test/spectral_solvers.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index f57a4c1be5..444bf2491a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,7 +114,7 @@ static __global__ void computeDistances( // Write result to global memory if (threadIdx.x == 0) - atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); + utils::atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -325,7 +326,7 @@ static __global__ void divideCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(handle_t handle, +static int chooseNewCentroid(handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ rand, const ValueType_* __restrict__ obs, @@ -334,7 +335,7 @@ static int chooseNewCentroid(handle_t handle, // Cumulative sum of distances ValueType_* distsCumSum = dists + n; // Residual sum of squares - ValueType_ distsSum; + ValueType_ distsSum{0}; // Observation vector that is chosen as new centroid IndexType_ obsIndex; @@ -391,7 +392,7 @@ static int chooseNewCentroid(handle_t handle, */ template static int initializeCentroids( - handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, @@ -443,10 +444,10 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); - cudaCheckError() + CUDA_CHECK_LAST(); - // Choose remaining centroids - for (i = 1; i < k; ++i) { + // Choose remaining centroids + for (i = 1; i < k; ++i) { // Choose ith centroid if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) @@ -497,14 +498,12 @@ static int initializeCentroids( * @return Zero if successful. Otherwise non-zero. 
*/ template -static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) { +static int assignCentroids( + handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, + const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, + IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, + ValueType_* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -565,8 +564,9 @@ static int assignCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, +static int updateCentroids(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, IndexType_ n, + IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, const IndexType_* __restrict__ codes, const IndexType_* __restrict__ clusterSizes, @@ -601,8 +601,8 @@ static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CUDA_CHECK_LAST(); thrust::transform(thrust_exec_policy, rows, rows + d * n, - make_constant_iterator(n), rows, - modulus()); + thrust::make_constant_iterator(n), rows, + thrust::modulus()); CUDA_CHECK_LAST(); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); @@ -612,8 +612,8 @@ static int updateCentroids(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CUDA_CHECK_LAST(); thrust::transform(thrust_exec_policy, rows, rows + d * n, - make_constant_iterator(n), rows, - divides()); + thrust::make_constant_iterator(n), rows, + thrust::divides()); CUDA_CHECK_LAST(); // Sort and reduce to add observation vectors in same cluster @@ -680,9 +680,10 @@ namespace raft { * @return error flag. 
*/ template -int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, - const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, +int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, + IndexType_ maxiter, const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, IndexType_* __restrict__ work_int, ValueType_* residual_host, @@ -843,11 +844,11 @@ int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, * @return error flag */ template -int kmeans(handle_t handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ tol, IndexType_ maxiter, - const ValueType_* __restrict__ obs, IndexType_* __restrict__ codes, - ValueType_& residual, IndexType_& iters, - unsigned long long seed = 123456) { +int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, + IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, + IndexType_ maxiter, const ValueType_* __restrict__ obs, + IndexType_* __restrict__ codes, ValueType_& residual, + IndexType_& iters, unsigned long long seed = 123456) { using namespace matrix; // Check that parameters are valid diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp index 25d6e2e358..3c1c1e4484 100644 --- a/cpp/include/raft/spectral/sm_utils.hpp +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -102,15 +102,15 @@ static __device__ __forceinline__ double shfl(double r, int lane, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_sync(mask, a.x, lane, bound); a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl(a.x, lane, bound); a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -122,15 +122,15 @@ static __device__ __forceinline__ long long shfl(long long r, int lane, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_sync(mask, a.x, lane, bound); a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl(a.x, lane, bound); a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -170,15 +170,15 @@ static __device__ __forceinline__ double shfl_down(double r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(a.x, offset, bound); a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -190,15 +190,15 @@ static __device__ __forceinline__ long long shfl_down(long long 
r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(a.x, offset, bound); a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -211,15 +211,15 @@ static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down_sync(mask, a.x, offset, bound); a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_down(mask, a.x, offset, bound); a.y = __shfl_down(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -258,15 +258,15 @@ static __device__ __forceinline__ double shfl_up(double r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up_sync(mask, a.x, offset, bound); a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up(a.x, offset, bound); a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; @@ -278,20 +278,144 @@ static __device__ __forceinline__ long long shfl_up(long long r, int offset, int mask = DEFAULT_MASK) { #if __CUDA_ARCH__ >= 300 #ifdef USE_CG - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up_sync(mask, a.x, offset, bound); a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #else - int2 a = *reinterpret_cast(&r); + int2 a = *reinterpret_cast(&r); a.x = __shfl_up(a.x, offset, bound); a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); + return *reinterpret_cast(&a); #endif #else return 0.0; #endif } + +static __inline__ __device__ double atomicFPAdd(double *addr, double val) { +// atomicAdd for double starts with sm_60 +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + unsigned long long old = __double_as_longlong(addr[0]), assumed; + + do { + assumed = old; + old = atomicCAS((unsigned long long *)addr, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + + return old; +#endif +} + +// atomicAdd for float starts with sm_20 +static __inline__ __device__ float atomicFPAdd(float *addr, float val) { + return atomicAdd(addr, val); +} + +static __inline__ __device__ double atomicFPMin(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(min(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ 
__device__ float atomicFPMin(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(min(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPMax(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(max(val, assumed)))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPMax(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(max(val, assumed)))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPOr(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong((bool)val | (bool)assumed))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPOr(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int((bool)val | (bool)assumed))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + +static __inline__ __device__ double atomicFPLog(double *addr, double val) { + double old, assumed; + old = *addr; + do { + assumed = old; + old = __longlong_as_double( + atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), + __double_as_longlong(-log(exp(-val) + exp(-assumed))))); + } while (__double_as_longlong(assumed) != __double_as_longlong(old)); + return old; +} + +/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ +static __inline__ __device__ float atomicFPLog(float *addr, float val) { + float old, assumed; + old = *addr; + do { + assumed = old; + old = + int_as_float(atomicCAS((int *)addr, float_as_int(assumed), + float_as_int(-logf(expf(-val) + expf(-assumed))))); + } while (float_as_int(assumed) != float_as_int(old)); + + return old; +} + } // namespace utils } // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index 068d61e9c1..a2c329e0c8 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -19,7 +19,6 @@ #include #include -//#include #include namespace raft { diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu new file mode 100644 index 0000000000..410781369a --- /dev/null +++ b/cpp/test/spectral_solvers.cu @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace raft { + +TEST(Raft, ClusterSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type maxiter{100}; + value_type tol{1.0e-10}; + index_type iter; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + auto stream = h.get_stream(); + //thrust::cuda::par.on(stream); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + index_type* codes{nullptr}; + value_type residual; + + kmeans(h, thrust::cuda::par.on(stream), n, d, k, tol, maxiter, eigvecs, codes, + residual, iter, seed); +} + +} // namespace raft From 610348943e5094bd4d1aafe91faa2f6e1b452db4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 14:34:30 -0500 Subject: [PATCH 37/88] Adding higher level solvers to test: cluster solver interface. --- cpp/include/raft/spectral/cluster_solvers.hpp | 8 ++++---- cpp/test/spectral_solvers.cu | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index cd0963506f..0b8999c6a0 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -38,11 +38,11 @@ struct cluster_solver_config_t { template struct kmeans_solver_t { - explicit kmeans_solver_t( - cluster_solver_config_t const& config) + explicit kmeans_solver_t(cluster_solver_config_t const& config) : config_(config) {} - template + template std::pair solve( handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, @@ -58,6 +58,6 @@ struct kmeans_solver_t { auto const& get_config(void) const { return config_; } private: - cluster_solver_config_t config_; + cluster_solver_config_t config_; }; } // namespace raft diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu index 410781369a..971950410c 100644 --- a/cpp/test/spectral_solvers.cu +++ b/cpp/test/spectral_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { From 7be516eb57a5c3e24035d8794a2d420416cdcc85 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 14:50:16 -0500 Subject: [PATCH 38/88] Adding higher level solvers to test: eigen solver interface. 
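The eigen-solver now mirrors the cluster-solver shape: a plain aggregate,
eigen_solver_config_t, carries the Lanczos parameters, and lanczos_solver_t
forwards them to computeSmallestEigenvectors() / computeLargestEigenvectors().
The intended call pattern, as a minimal sketch (the <index_type, value_type>
template arguments and the names `h`, `A`, `eigvals`, `eigvecs` are
assumptions; only the config field order is fixed by the test below):

    eigen_solver_config_t<int, double> cfg{neigvs, maxiter, restart_iter,
                                           tol, reorthog, seed};
    lanczos_solver_t<int, double> eig_solver{cfg};
    // A: a sparse (e.g., Laplacian) matrix; eigvals / eigvecs: device
    // buffers of neigvs and n * neigvs entries, respectively
    auto iters = eig_solver.solve_smallest_eigenvectors(h, A, eigvals, eigvecs);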
--- cpp/include/raft/spectral/eigen_solvers.hpp | 12 +++++++----- cpp/test/spectral_solvers.cpp | 14 +++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 9c1258c432..21d2f83dbb 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -40,12 +40,13 @@ struct eigen_solver_config_t { template struct lanczos_solver_t { - explicit lanczos_solver_t( - eigen_solver_config_t const& config) + explicit lanczos_solver_t(eigen_solver_config_t const& config) : config_(config) {} index_type_t solve_smallest_eigenvectors( - handle_t handle, sparse_matrix_t const& A, + handle_t const& handle, + sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; @@ -57,7 +58,8 @@ struct lanczos_solver_t { } index_type_t solve_largest_eigenvectors( - handle_t handle, sparse_matrix_t const& A, + handle_t const& handle, + sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { index_type_t iters{}; @@ -71,6 +73,6 @@ struct lanczos_solver_t { auto const& get_config(void) const { return config_; } private: - eigen_solver_config_t config_; + eigen_solver_config_t config_; }; } // namespace raft diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/spectral_solvers.cpp index a2c329e0c8..549c2e66d7 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/spectral_solvers.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -53,11 +53,15 @@ TEST(Raft, SpectralSolvers) { value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; - computeSmallestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, - reorthog, iter, eigvals, eigvecs, seed); - computeLargestEigenvectors(h, lm1, neigvs, maxiter, restart_iter, tol, - reorthog, iter, eigvals, eigvecs, seed); + eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + + lanczos_solver_t eig_solver{cfg}; + + eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs); + + eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); } } // namespace raft From 9dee0b370bf7d3487507f0f6c5f6bc2af8e176a7 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 15:02:32 -0500 Subject: [PATCH 39/88] Adding higher level solvers to test: cluster solver interface (fixed). 
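With this fix, solve() takes the handle by const reference and returns the
(residual, iteration-count) pair coming out of the underlying kmeans() call.
Roughly, the intended usage (a sketch; the template argument order is an
assumption, `obs` is a d x n column-major device matrix of observations and
`codes` a device buffer of n cluster labels):

    cluster_solver_config_t<int, double> cfg{k, maxiter, tol, seed};
    kmeans_solver_t<int, double> cluster_solver{cfg};
    auto pair_ret = cluster_solver.solve(h, thrust::cuda::par.on(stream),
                                         n, d, obs, codes);
    // pair_ret.first: final k-means residual
    // pair_ret.second: number of iterations taken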
--- cpp/include/raft/spectral/cluster_solvers.hpp | 5 +++-- cpp/test/spectral_solvers.cu | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 0b8999c6a0..08579e22bd 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -44,8 +44,9 @@ struct kmeans_solver_t { template std::pair solve( - handle_t handle, thrust_exe_policy_t t_exe_policy, size_type_t n_obs_vecs, - size_type_t dim, value_type_t const* __restrict__ obs, + handle_t const& handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { value_type_t residual{}; index_type_t iters{}; diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/spectral_solvers.cu index 971950410c..8c0d94b9e5 100644 --- a/cpp/test/spectral_solvers.cu +++ b/cpp/test/spectral_solvers.cu @@ -34,21 +34,22 @@ TEST(Raft, ClusterSolvers) { index_type maxiter{100}; value_type tol{1.0e-10}; - index_type iter; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; auto stream = h.get_stream(); - //thrust::cuda::par.on(stream); index_type n{100}; index_type d{10}; index_type k{5}; index_type* codes{nullptr}; - value_type residual; - kmeans(h, thrust::cuda::par.on(stream), n, d, k, tol, maxiter, eigvecs, codes, - residual, iter, seed); + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + auto pair_ret = + cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes); } } // namespace raft From 1d4c05daf19c696275a3cbfb9819f25c4eac31d7 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 16:59:50 -0500 Subject: [PATCH 40/88] Higher level API. 
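This ties the solver interfaces into the spectral pipeline: partition()
builds the graph Laplacian, asks the eigen solver for its smallest
eigenpairs, whitens the eigenvector matrix, and hands the embedded vertices
to the cluster solver. End to end the call looks roughly like this (a
sketch; the renamed eigen_solvers.cu test below is the authoritative
version):

    auto t_exe_p = thrust::cuda::par.on(h.get_stream());
    auto stats = spectral::partition(h, t_exe_p, csr_graph_view, eig_solver,
                                     cluster_solver, clusters, eigvals,
                                     eigvecs);
    // stats: (eigen-solver iterations, cluster residual, cluster iterations)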
--- cpp/CMakeLists.txt | 4 ++-- .../raft/spectral/modularity_maximization.hpp | 2 +- cpp/include/raft/spectral/partition.hpp | 12 +++++++----- cpp/include/raft/spectral/spectral_util.hpp | 11 ++++++++--- cpp/test/{spectral_solvers.cu => cluster_solvers.cu} | 0 cpp/test/{spectral_solvers.cpp => eigen_solvers.cu} | 3 +-- 6 files changed, 19 insertions(+), 13 deletions(-) rename cpp/test/{spectral_solvers.cu => cluster_solvers.cu} (100%) rename cpp/test/{spectral_solvers.cpp => eigen_solvers.cu} (96%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d96f936a3d..a9c0375de6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,8 +207,8 @@ if(BUILD_RAFT_TESTS) test/mr/host/buffer.cpp test/test.cpp test/spectral_matrix.cpp - test/spectral_solvers.cpp - test/spectral_solvers.cu) + test/eigen_solvers.cu + test/cluster_solvers.cu) target_include_directories(test_raft PRIVATE diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 1e387c0606..0480287936 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -191,7 +191,7 @@ void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 7c886ff282..98ca84fbb9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -67,7 +67,7 @@ template , typename ClusterSolver = kmeans_solver_t> std::tuple partition( - handle_t handle, ThrustExePolicy thrust_exec_policy, + handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { @@ -94,7 +94,7 @@ std::tuple partition( laplacian_matrix_t L{handle, graph}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_configs.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = @@ -131,8 +131,10 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. 
*/ -template -void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, +template +void analyzePartition(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -163,7 +165,7 @@ void analyzePartition(handle_t handle, ThrustExePolicy thrust_exec_policy, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, n, clustersize, + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index a0c10284e3..7789247445 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -122,6 +122,9 @@ void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + const weight_t zero{0.0}; + const weight_t one{1.0}; + // Whiten eigenvector matrix for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; @@ -190,9 +193,11 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template +template bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, weight_t& clustersize, weight_t& partStats, + edge_t index, edge_t n, weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { @@ -206,7 +211,7 @@ bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); + equal_to_i_op(index)); CUDA_CHECK_LAST(); // Compute size of ith partition diff --git a/cpp/test/spectral_solvers.cu b/cpp/test/cluster_solvers.cu similarity index 100% rename from cpp/test/spectral_solvers.cu rename to cpp/test/cluster_solvers.cu diff --git a/cpp/test/spectral_solvers.cpp b/cpp/test/eigen_solvers.cu similarity index 96% rename from cpp/test/spectral_solvers.cpp rename to cpp/test/eigen_solvers.cu index 549c2e66d7..506635ec46 100644 --- a/cpp/test/spectral_solvers.cpp +++ b/cpp/test/eigen_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -49,7 +49,6 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - index_type iter; value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; From 4480dd33e391466fc1002cc26706f2e6dbe6c1fa Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 17:47:22 -0500 Subject: [PATCH 41/88] Fixes in higher level API. --- cpp/include/raft/spectral/partition.hpp | 11 ++---- cpp/include/raft/spectral/spectral_util.hpp | 11 +++--- cpp/test/eigen_solvers.cu | 40 ++++++++++++++++++++- 3 files changed, 48 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 98ca84fbb9..2b6c54f49a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -63,24 +63,19 @@ using namespace linalg; * @return statistics: number of eigensolver iterations, . 
*/ template , - typename ClusterSolver = kmeans_solver_t> + typename ThrustExePolicy, typename EigenSolver, + typename ClusterSolver> std::tuple partition( handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - const weight_t zero{0.0}; - const weight_t one{1.0}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t i; edge_t n = graph.number_of_vertices; // ------------------------------------------------------- @@ -98,7 +93,7 @@ std::tuple partition( // Compute smallest eigenvalues and eigenvectors std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(L, eigVals, eigVecs); + eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 7789247445..2cc38cbbf1 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -117,8 +117,9 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { template -void transform_eigen_matrix(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t n, vertex_t nEigVecs, weight_t* eigVecs) { +void transform_eigen_matrix(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t n, + vertex_t nEigVecs, weight_t* eigVecs) { auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -195,9 +196,9 @@ struct equal_to_i_op { // template -bool construct_indicator(handle_t handle, ThrustExePolicy thrust_exec_policy, - edge_t index, edge_t n, weight_t& clustersize, - weight_t& partStats, +bool construct_indicator(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, edge_t index, + edge_t n, weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 506635ec46..604afdc76a 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -23,7 +23,7 @@ namespace raft { -TEST(Raft, SpectralSolvers) { +TEST(Raft, EigenSolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -63,4 +63,42 @@ TEST(Raft, SpectralSolvers) { eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); } +TEST(Raft, SpectralSolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + GraphCSRView empty_graph; + auto 
t_exe_p = thrust::cuda::par.on(stream); + auto tuple_ret = + spectral::partition(h, t_exe_p, empty_graph, eig_solver, cluster_solver, + clusters, eigvals, eigvecs); +} + } // namespace raft From 282c4c9f034224961b7603da29c3800842ebce99 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 16 Jun 2020 19:46:09 -0500 Subject: [PATCH 42/88] Added / Fixed higher level modularity API and test. --- cpp/include/raft/spectral/cluster_solvers.hpp | 2 + cpp/include/raft/spectral/eigen_solvers.hpp | 4 ++ cpp/include/raft/spectral/error_temp.hpp | 10 +++- .../raft/spectral/modularity_maximization.hpp | 42 +++++--------- cpp/include/raft/spectral/partition.hpp | 6 ++ cpp/test/cluster_solvers.cu | 57 +++++++++++++++++-- cpp/test/eigen_solvers.cu | 23 ++++++-- 7 files changed, 104 insertions(+), 40 deletions(-) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 08579e22bd..b19237d1a8 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -48,6 +48,8 @@ struct kmeans_solver_t { size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { + RAFT_EXPECT(obs != nullptr, "Null obs buffer."); + RAFT_EXPECT(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 21d2f83dbb..97114661c5 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -49,6 +49,8 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; RAFT_TRY(computeSmallestEigenvectors( handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, @@ -62,6 +64,8 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 82beb75640..3fa5a38f5f 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -1,10 +1,15 @@ #pragma once +#include +#include + #define STRINGIFY_DETAIL(x) #x #define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) - -#define RAFT_EXPECT(cond, reason) +///#define RAFT_EXPECT(cond, reason) +inline void RAFT_EXPECT(bool cond, std::string const& reason) { + if (!cond) throw std::runtime_error(reason.c_str()); +} #define RAFT_TRY(error_expression) @@ -28,4 +33,3 @@ #else // DEBUG #define WARNING(message) #endif - diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 0480287936..e406772666 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -83,16 +83,16 @@ using namespace linalg; * @return error flag. 
*/ template , - typename ClusterSolver = kmeans_solver_t> + typename ThrustExePolicy, typename EigenSolver, + typename ClusterSolver> std::tuple modularity_maximization( - handle_t handle, ThrustExePolicy thrust_exec_policy, + handle_t const &handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - const weight_t zero{0.0}; - const weight_t one{1.0}; + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -109,11 +109,11 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, graph}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_configs.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(B, eigVals, eigVecs); + eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); @@ -136,24 +136,6 @@ std::tuple modularity_maximization( // Analysis of graph partition // ========================================================= -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; - } -}; -} // namespace - /// Compute modularity /** This function determines the modularity based on a graph and cluster assignments * @param G Weighted graph in CSR format @@ -161,12 +143,16 @@ struct equal_to_i_op { * @param clusters (Input, device memory, n entries) Cluster assignments. 
* @param modularity On exit, modularity */ -template -void analyzeModularity(handle_t handle, ThrustExePolicy thrust_exec_policy, +template +void analyzeModularity(handle_t const &handle, + ThrustExePolicy thrust_exec_policy, GraphCSRView const &graph, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + edge_t i; edge_t n = graph.number_of_vertices; weight_t partModularity, clustersize; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 2b6c54f49a..1b768ca4c4 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -70,6 +70,10 @@ std::tuple partition( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -133,6 +137,8 @@ void analyzePartition(handle_t const &handle, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { + RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 8c0d94b9e5..d3d6a04312 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -19,7 +19,7 @@ #include #include -#include +#include namespace raft { @@ -34,7 +34,6 @@ TEST(Raft, ClusterSolvers) { index_type maxiter{100}; value_type tol{1.0e-10}; - value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; auto stream = h.get_stream(); @@ -42,14 +41,64 @@ TEST(Raft, ClusterSolvers) { index_type n{100}; index_type d{10}; index_type k{5}; + + //nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; index_type* codes{nullptr}; cluster_solver_config_t cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{cfg}; - auto pair_ret = - cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes); + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, + eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) { + using namespace matrix; + using index_type = int; + using value_type = double; + + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + //nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = h.get_stream(); + GraphCSRView empty_graph; + auto t_exe_p = thrust::cuda::par.on(stream); + + EXPECT_ANY_THROW(spectral::modularity_maximization( + h, t_exe_p, empty_graph, eig_solver, cluster_solver, clusters, eigvals, + eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW(spectral::analyzeModularity(h, 
t_exe_p, empty_graph, k, + clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 604afdc76a..d31aba896e 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -49,6 +49,8 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; + //nullptr expected to trigger exceptions: + // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; unsigned long long seed{100110021003}; @@ -58,9 +60,11 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs); + EXPECT_ANY_THROW( + eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs)); - eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs); + EXPECT_ANY_THROW( + eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs)); } TEST(Raft, SpectralSolvers) { @@ -78,9 +82,12 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; + //nullptr expected to trigger exceptions: + // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; + unsigned long long seed{100110021003}; eigen_solver_config_t eig_cfg{ @@ -96,9 +103,15 @@ TEST(Raft, SpectralSolvers) { auto stream = h.get_stream(); GraphCSRView empty_graph; auto t_exe_p = thrust::cuda::par.on(stream); - auto tuple_ret = - spectral::partition(h, t_exe_p, empty_graph, eig_solver, cluster_solver, - clusters, eigvals, eigvecs); + + EXPECT_ANY_THROW(spectral::partition(h, t_exe_p, empty_graph, eig_solver, + cluster_solver, clusters, eigvals, + eigvecs)); + + value_type edgeCut{0}; + value_type cost{0}; + EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, empty_graph, k, + clusters, edgeCut, cost)); } } // namespace raft From 6211f8d778150dccd2ae1cdaf4303b2be5a8e837 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 17 Jun 2020 17:53:36 -0500 Subject: [PATCH 43/88] Addressed CUDA 11 API changes in cusparse (csrmv, csrmm). 
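CUDA 11 drops the legacy cusparse<X>csrmv / cusparse<X>csrmm entry points,
so the wrappers now fork on __CUDACC_VER_MAJOR__ and route to the generic
SpMV / SpMM API. For reference, the descriptor-based flow the new wrappers
encapsulate looks like this (standard cuSPARSE generic API, shown for
doubles with error checking elided):

    cusparseSpMatDescr_t matA;
    cusparseCreateCsr(&matA, n, n, nnz, row_offsets, col_indices, values,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

    cusparseDnVecDescr_t vecX, vecY;
    cusparseCreateDnVec(&vecX, n, x, CUDA_R_64F);
    cusparseCreateDnVec(&vecY, n, y, CUDA_R_64F);

    size_t buffer_size{0};
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, vecX, &beta, vecY, CUDA_R_64F,
                            CUSPARSE_CSRMV_ALG1, &buffer_size);
    // ...allocate buffer_size bytes of device scratch as `buffer`, then:
    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
                 vecX, &beta, vecY, CUDA_R_64F, CUSPARSE_CSRMV_ALG1, buffer);

    cusparseDestroyDnVec(vecX);
    cusparseDestroyDnVec(vecY);
    cusparseDestroySpMat(matA);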
--- cpp/include/raft/sparse/cusparse_wrappers.h | 57 +++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 865f93843d..56c3fc8dbc 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -176,6 +176,33 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, cscColPtrB, cscRowIndB, beta, C, ldc); } /** @} */ + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmv cusparse SpMV operations + * @{ + */ +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, + cusparseSpMVAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + computeType, alg, bufferSize); +} + +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, + cusparseSpMVAlg_t alg, void* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, computeType, + alg, externalBuffer); +} +/** @} */ +#else /** * @defgroup Csrmv cusparse csrmv operations * @{ @@ -207,7 +234,36 @@ inline cusparseStatus_t cusparsecsrmv( csrRowPtr, csrColInd, x, beta, y); } /** @} */ +#endif + +#if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup Csrmm cusparse csrmm operations + * @{ + */ +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, + cudaDataType computeType, cusparseSpMMAlg_t alg, size_t* bufferSize, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, computeType, alg, bufferSize); +} +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, + cudaDataType computeType, cusparseSpMMAlg_t alg, void* externalBuffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + computeType, alg, externalBuffer); +} +/** @} */ +#else /** * @defgroup Csrmm cusparse csrmm operations * @{ @@ -241,6 +297,7 @@ inline cusparseStatus_t cusparsecsrmm( csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ +#endif /** * @defgroup csr2coo cusparse CSR to COO converter methods From 4097a72536e1b75b89c36467f9125b26727ff9ab Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 17 Jun 2020 19:19:56 -0500 Subject: [PATCH 44/88] Fixes in CUDA 11 cusparse interface. 
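The first cut passed void* scalars and an explicit cudaDataType through;
since every caller is already templated on the value type, this change
swaps that for float / double specializations that take typed alpha / beta
pointers and pin CUDA_R_32F / CUDA_R_64F internally. The underlying
type-to-enum mapping, as a stand-alone sketch (the helper name is
illustrative only, not something this patch adds):

    #include <library_types.h>  // cudaDataType, CUDA_R_32F, CUDA_R_64F

    template <typename T>
    cudaDataType cuda_value_type();  // undefined: unsupported T fails to link

    template <>
    inline cudaDataType cuda_value_type<float>() {
      return CUDA_R_32F;
    }

    template <>
    inline cudaDataType cuda_value_type<double>() {
      return CUDA_R_64F;
    }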
--- cpp/include/raft/sparse/cusparse_wrappers.h | 52 +++++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 56c3fc8dbc..97a7823d86 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -192,13 +192,31 @@ inline cusparseStatus_t cusparsespmv_buffersize( computeType, alg, bufferSize); } +template +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, T* externalBuffer, + cudaStream_t stream); +template <> inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, - cusparseSpMVAlg_t alg, void* externalBuffer, cudaStream_t stream) { + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, + alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + double* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, computeType, + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ @@ -251,16 +269,32 @@ inline cusparseStatus_t cusparsespmm_bufferSize( return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, bufferSize); } - +template inline cusparseStatus_t cusparsespmm( cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const void* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, - cudaDataType computeType, cusparseSpMMAlg_t alg, void* externalBuffer, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_32F, alg, externalBuffer); +} +template <> +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, cudaStream_t 
stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - computeType, alg, externalBuffer); + CUDA_R_64F, alg, externalBuffer); } /** @} */ #else From c4130c99b09a268ef5181e3b08ffa017e123a8c4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 10:51:03 -0500 Subject: [PATCH 45/88] Added raft handle to sparse_matrix cnstr. necessary for mv() memf calls of cusparse and allocation. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 38 +++++++++++-------- .../raft/spectral/modularity_maximization.hpp | 4 +- cpp/include/raft/spectral/partition.hpp | 4 +- cpp/test/eigen_solvers.cu | 2 +- cpp/test/spectral_matrix.cpp | 4 +- 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 74dbd38be6..232d6a5cc6 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -86,18 +86,21 @@ class vector_t { template struct sparse_matrix_t { - sparse_matrix_t(index_type const* row_offsets, index_type const* col_indices, - value_type const* values, index_type const nrows, - index_type const nnz) - : row_offsets_(row_offsets), + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : handle_(raft_handle), + row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), nnz_(nnz) {} sparse_matrix_t( + handle_t const& raft_handle, GraphCSRView const& csr_view) - : row_offsets_(csr_view.offsets), + : handle_(raft_handle), + row_offsets_(csr_view.offsets), col_indices_(csr_view.indices), values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), @@ -109,7 +112,8 @@ struct sparse_matrix_t { // y = alpha*A*x + beta*y // virtual void mv(value_type alpha, value_type const* __restrict__ x, - value_type beta, value_type* __restrict__ y) const { + value_type beta, value_type* __restrict__ y, + bool transpose = false, bool symmetric = false) const { //TODO: // //Cusparse::set_pointer_mode_host(); @@ -118,10 +122,10 @@ struct sparse_matrix_t { //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + handle_t const& handle_; index_type const* row_offsets_; index_type const* col_indices_; - value_type const* - values_; // TODO: const-ness of this is debatable; cusparse primitives may not accept it... 
+ value_type const* values_; index_type const nrows_; index_type const nnz_; }; @@ -131,8 +135,8 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) - : sparse_matrix_t(row_offsets, col_indices, values, - nrows, nnz), + : sparse_matrix_t(raft_handle, row_offsets, + col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { auto* v = diagonal_.raw(); //TODO: more work, here: @@ -145,7 +149,7 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t( handle_t const& raft_handle, GraphCSRView const& csr_view) - : sparse_matrix_t(csr_view), + : sparse_matrix_t(raft_handle, csr_view), diagonal_(raft_handle, csr_view.number_of_vertices) { //TODO: more work, here: // @@ -157,7 +161,8 @@ struct laplacian_matrix_t : sparse_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y) const override { + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: // // if (beta == 0) @@ -213,7 +218,8 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, - value_type* __restrict__ y) const override { + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: // // // y = A*x @@ -225,9 +231,11 @@ struct modularity_matrix_t : laplacian_matrix_t { // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } - value_type get_diag_nrm1(void) const { return diag_nrm1_; } + value_type get_diag_nrm1(void) const { + return diag_nrm1_; // TODO: replace w/ diag_.nrm1() + } - value_type diag_nrm1_; + value_type diag_nrm1_; // TODO: remove }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index e406772666..6ab1b16659 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -105,7 +105,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, graph}; auto eigen_config = eigen_solver.get_config(); @@ -169,7 +169,7 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, graph}; // Initialize output diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 1b768ca4c4..6cc2744e96 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -89,7 +89,7 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, graph}; auto eigen_config = eigen_solver.get_config(); @@ -156,7 +156,7 @@ void analyzePartition(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - sparse_matrix_t A{graph}; + sparse_matrix_t A{handle, 
graph}; laplacian_matrix_t L{handle, graph}; // Initialize output diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index d31aba896e..87bf74bdde 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -37,7 +37,7 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cpp index 46d753550f..30346d5da5 100644 --- a/cpp/test/spectral_matrix.cpp +++ b/cpp/test/spectral_matrix.cpp @@ -42,8 +42,8 @@ TEST(Raft, SpectralMatrices) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - sparse_matrix_t sm1{ro, ci, vs, nrows, nnz}; - sparse_matrix_t sm2{empty_graph}; + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; + sparse_matrix_t sm2{h, empty_graph}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); From 87c49845f692d167e333cb481358d7770e4d0edd Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 12:02:21 -0500 Subject: [PATCH 46/88] Sparse MV forking: pre-CUDA 11 step. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 232d6a5cc6..8f9eab64bc 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -114,10 +114,32 @@ struct sparse_matrix_t { virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const { - //TODO: - // - //Cusparse::set_pointer_mode_host(); - //cusparsecsrmv(...); + + using namespace sparse; + + auto cusparse_h = handle_.get_cusparse_handle(); + auto stream = handle_.get_stream(); +#if __CUDACC_VER_MAJOR__ > 10 +#else + CUSPARSE_CHECK( + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;//non-transpose + cusparseMatDescr_t descr = 0; + CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); + if (symmetric) { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); + } else { + CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + } + CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, + &alpha, descr, values_, + row_offsets_, col_indices_, + x, &beta, y, stream)); + CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); +#endif } //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate From 2225214fcbd2efffa13d80403cb16a4428998770 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 17:31:55 -0500 Subject: [PATCH 47/88] Added CUDA 11 path for SpMV calls. Step 2. 
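Step 2 adds typed cusparsecreatecsr() / cusparsecreatednvec() wrappers and
switches sparse_matrix_t::mv() to the descriptor-based path: create the CSR
and dense-vector descriptors, query the scratch-buffer size, allocate it
via vector_t, run SpMV, then destroy the descriptors. One way the "maybe
wrap them in a RAII struct?" TODO in mv() could later be addressed (a
sketch under that assumption, double precision for brevity; not part of
this patch):

    #include <cusparse.h>

    struct dn_vec_guard_t {
      cusparseDnVecDescr_t descr{nullptr};
      dn_vec_guard_t(int64_t size, double* values) {
        cusparseCreateDnVec(&descr, size, values, CUDA_R_64F);
      }
      ~dn_vec_guard_t() {
        if (descr) cusparseDestroyDnVec(descr);
      }
      dn_vec_guard_t(dn_vec_guard_t const&) = delete;
      dn_vec_guard_t& operator=(dn_vec_guard_t const&) = delete;
    };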
--- cpp/include/raft/sparse/cusparse_wrappers.h | 123 ++++++++++++++++-- cpp/include/raft/spectral/matrix_wrappers.hpp | 55 ++++++-- 2 files changed, 161 insertions(+), 17 deletions(-) diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 97a7823d86..347cd7fa59 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -178,18 +178,108 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, /** @} */ #if __CUDACC_VER_MAJOR__ > 10 +/** + * @defgroup cusparse Create CSR operations + * @{ + */ +template +cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, int64_t nnz, + IndexT* csrRowOffsets, IndexT* csrColInd, + ValueT* csrValues); +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int32_t* csrRowOffsets, + int32_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, + int64_t* csrColInd, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); +} +/** @} */ +/** + * @defgroup cusparse CreateDnVec operations + * @{ + */ +template +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, T* values); +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, float* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); +} +template <> +inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, double* values) { + return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); +} +/** @} */ + /** * @defgroup Csrmv cusparse SpMV operations * @{ */ +template +cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream); +template <> inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, const cusparseSpMatDescr_t matA, const 
cusparseDnVecDescr_t vecX, - const void* beta, const cusparseDnVecDescr_t vecY, cudaDataType computeType, - cusparseSpMVAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - computeType, alg, bufferSize); + CUDA_R_32F, alg, bufferSize); +} +template <> +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_64F, alg, bufferSize); } template @@ -259,15 +349,32 @@ inline cusparseStatus_t cusparsecsrmv( * @defgroup Csrmm cusparse csrmm operations * @{ */ +template +cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_32F, alg, bufferSize); +} +template <> inline cusparseStatus_t cusparsespmm_bufferSize( cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const void* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const void* beta, cusparseDnMatDescr_t matC, - cudaDataType computeType, cusparseSpMMAlg_t alg, size_t* bufferSize, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, computeType, alg, bufferSize); + matC, CUDA_R_64F, alg, bufferSize); } template inline cusparseStatus_t cusparsespmm( diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 8f9eab64bc..e7bb8c9d52 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -114,18 +114,56 @@ struct sparse_matrix_t { virtual void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const { - using namespace sparse; - + auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); + + cusparseOperation_t trans = + transpose ? 
CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + #if __CUDACC_VER_MAJOR__ > 10 + + //create descriptors: + // + cusparseSpMatDescr_t matA; + CUSPARSE_CHECK(cusparsecreatecsr(&matA, nrows_, nrows_, nnz_, row_offsets_, + col_indices_, values_)); + + cusparseDnVecDescr_t vecX; + CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, + x)); // TODO: const-cast down?! + + cusparseDnVecDescr_t vecY; + CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); + + //get (scratch) external device buffer size: + // + size_t bufferSize; + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, opA, &alpha, matA, vecX, + &beta, vecY, alg, &bufferSize, + stream)); + + //allocate external buffer: + // + vector_t external_buffer(handle_, bufferSize); + + //finally perform SpMV: + // + CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, + vecY, CUSPARSE_CSRMV_ALG1, + external_buffer.raw(), stream)); + + //free descriptors: + //(TODO: maybe wrap them in a RAII struct?) + // + CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); + CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); + CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); - - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;//non-transpose + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -135,9 +173,8 @@ struct sparse_matrix_t { } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, nrows_, nnz_, - &alpha, descr, values_, - row_offsets_, col_indices_, - x, &beta, y, stream)); + &alpha, descr, values_, row_offsets_, + col_indices_, x, &beta, y, stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } From 7415edaf9bc27772b11d429fd48a1031a448eb78 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 17:44:53 -0500 Subject: [PATCH 48/88] Removed constness of some parameters in SpMV, because CUDA 11 requires it. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index e7bb8c9d52..0c55ef7294 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -110,10 +110,13 @@ struct sparse_matrix_t { default; // virtual because used as base for following matrix types // y = alpha*A*x + beta*y + //(Note: removed const-ness of x, because CUDA 11 SpMV + // descriptor creation works with non-const, and const-casting + // down is dangerous) // - virtual void mv(value_type alpha, value_type const* __restrict__ x, - value_type beta, value_type* __restrict__ y, - bool transpose = false, bool symmetric = false) const { + virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + value_type* __restrict__ y, bool transpose = false, + bool symmetric = false) const { using namespace sparse; auto cusparse_h = handle_.get_cusparse_handle(); @@ -132,8 +135,7 @@ struct sparse_matrix_t { col_indices_, values_)); cusparseDnVecDescr_t vecX; - CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, - x)); // TODO: const-cast down?! 
+ CUSPARSE_CHECK(cusparsecreatednvec(&vecX, nrows_, x)); cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, nrows_, y)); @@ -219,7 +221,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: @@ -276,7 +278,7 @@ struct modularity_matrix_t : laplacian_matrix_t { // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type const* __restrict__ x, value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { //TODO: call cusparse::csrmv ... and more: From 6114c0f8f899d5259ff77712f1f1f3ff7a78e549 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 18 Jun 2020 20:30:50 -0500 Subject: [PATCH 49/88] Added some pre-conditions in mv(). --- cpp/include/raft/spectral/matrix_wrappers.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 0c55ef7294..f2a64d3c17 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -119,6 +119,9 @@ struct sparse_matrix_t { bool symmetric = false) const { using namespace sparse; + RAFT_EXPECT(x != nullptr, "Null x buffer."); + RAFT_EXPECT(y != nullptr, "Null y buffer."); + auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); From 761cacde85ae0957997ad8898613ef75d558a38d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 09:20:12 -0500 Subject: [PATCH 50/88] Fixed curand dependencies. 
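The build now links libcurand, and lanczos.hpp gains float/double dispatch wrappers around the cuRAND normal generators. For reference, a minimal self-contained sketch of that overload-dispatch idiom (the free-standing name generate_normal_x is illustrative; the patch itself defines curandGenerateNormalX inside namespace detail):

    #include <cstddef>
    #include <curand.h>

    // Overload on the value type so templated callers (e.g. the Lanczos
    // solver) need not branch on float vs. double themselves. cuRAND's
    // normal generators require an even sample count (they emit
    // Box-Muller pairs), hence the n + n % 2 at the call sites below.
    inline curandStatus_t generate_normal_x(curandGenerator_t gen, float* out,
                                            std::size_t n, float mean,
                                            float stddev) {
      return curandGenerateNormal(gen, out, n, mean, stddev);
    }

    inline curandStatus_t generate_normal_x(curandGenerator_t gen, double* out,
                                            std::size_t n, double mean,
                                            double stddev) {
      return curandGenerateNormalDouble(gen, out, n, mean, stddev);
    }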
--- cpp/CMakeLists.txt | 1 + cpp/include/raft/spectral/error_temp.hpp | 4 ++-- cpp/include/raft/spectral/lanczos.hpp | 26 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a9c0375de6..03b0222e97 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -185,6 +185,7 @@ set(RAFT_LINK_LIBRARIES ${CUDA_cusolver_LIBRARY} ${CUDA_CUDART_LIBRARY} ${CUDA_cusparse_LIBRARY} + ${CUDA_curand_LIBRARY} rmm) set(RAFT_LINK_DIRECTORIES "") diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/error_temp.hpp index 3fa5a38f5f..7d525ae5f1 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/error_temp.hpp @@ -11,13 +11,13 @@ inline void RAFT_EXPECT(bool cond, std::string const& reason) { if (!cond) throw std::runtime_error(reason.c_str()); } -#define RAFT_TRY(error_expression) +#define RAFT_TRY(expression) (expression) //assume RAFT_FAIL() can take a std::string `reason` // #define RAFT_FAIL(reason) -#define CUDA_TRY(call) +#define CUDA_TRY(call) (call) #define CUDA_CHECK_LAST() diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index e9682f5c28..c15b7ade0d 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -36,7 +36,19 @@ namespace raft { using namespace matrix; using namespace linalg; -namespace { +namespace detail { + +// curandGeneratorNormalX +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + float *outputPtr, size_t n, + float mean, float stddev) { + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + double *outputPtr, size_t n, + double mean, double stddev) { + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} // ========================================================= // Helper functions @@ -565,7 +577,7 @@ static int lanczosRestart( return 0; } -} // namespace +} // namespace detail // ========================================================= // Eigensolver @@ -626,9 +638,7 @@ int computeSmallestEigenvectors( ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- + using namespace detail; // Useful constants const ValueType_ one = 1; @@ -858,6 +868,8 @@ int computeSmallestEigenvectors( ValueType_ tol, bool reorthogonalize, IndexType_ &iter, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { + using namespace detail; + // Matrix dimension IndexType_ n = A.nrows_; @@ -945,9 +957,7 @@ int computeLargestEigenvectors( ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- + using namespace detail; // Useful constants const ValueType_ one = 1; From ca240beefbcc9150fc464012d914a88d530c3aba Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 
11:33:32 -0500 Subject: [PATCH 51/88] More CUDA 11 debt. --- cpp/CMakeLists.txt | 2 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 1 + cpp/include/raft/spectral/sm_utils.hpp | 14 ++++++++++++++ .../{spectral_matrix.cpp => spectral_matrix.cu} | 0 4 files changed, 16 insertions(+), 1 deletion(-) rename cpp/test/{spectral_matrix.cpp => spectral_matrix.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 03b0222e97..928161ffbd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -207,7 +207,7 @@ if(BUILD_RAFT_TESTS) test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/test.cpp - test/spectral_matrix.cpp + test/spectral_matrix.cu test/eigen_solvers.cu test/cluster_solvers.cu) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index f2a64d3c17..4ff0bbf9dd 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include // ========================================================= // Useful macros diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/spectral/sm_utils.hpp index 3c1c1e4484..34eeec16bd 100644 --- a/cpp/include/raft/spectral/sm_utils.hpp +++ b/cpp/include/raft/spectral/sm_utils.hpp @@ -416,6 +416,20 @@ static __inline__ __device__ float atomicFPLog(float *addr, float val) { return old; } +// Apply diagonal matrix to vector: +// +template +static __global__ void diagmv(IndexType_ n, ValueType_ alpha, + const ValueType_ *__restrict__ D, + const ValueType_ *__restrict__ x, + ValueType_ *__restrict__ y) { + IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + while (i < n) { + y[i] += alpha * D[i] * x[i]; + i += blockDim.x * gridDim.x; + } +} + } // namespace utils } // namespace raft diff --git a/cpp/test/spectral_matrix.cpp b/cpp/test/spectral_matrix.cu similarity index 100% rename from cpp/test/spectral_matrix.cpp rename to cpp/test/spectral_matrix.cu From b40b70267644cc68591e7a64084978f58ea0d5e6 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 16:01:53 -0500 Subject: [PATCH 52/88] Added correct version of L1 norm. --- cpp/include/raft/spectral/matrix_wrappers.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 4ff0bbf9dd..612b8ef65e 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -21,6 +21,8 @@ #include #include +#include + // ========================================================= // Useful macros // ========================================================= @@ -83,6 +85,16 @@ class vector_t { size_type size(void) const { return size_; } value_type* raw(void) { return buffer_; } + + template + value_type nrm1(ThrustExecPolicy t_exe_pol) const { + return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); + } }; template From 2dad70dd4be2fb750f08e3b8f94c21388ed00ecc Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 19:27:03 -0500 Subject: [PATCH 53/88] Fixes for matrix wrappers and tests. 
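The Laplacian's diagonal is now materialized as A*1 via fill() plus one SpMV, i.e. as the vector of weighted vertex degrees (row sums of the adjacency matrix). A serial host-side sketch of the same computation, intended only as a reference for checking the device path (names are illustrative):

    #include <cstddef>
    #include <vector>

    // Weighted degree of vertex i = sum of the CSR values in row i,
    // which is exactly what mv(1, ones, 0, diagonal) produces on device.
    std::vector<double> csr_row_sums(std::vector<int> const& row_offsets,
                                     std::vector<double> const& values) {
      std::size_t nrows = row_offsets.size() - 1;
      std::vector<double> degrees(nrows, 0);
      for (std::size_t i = 0; i < nrows; ++i)
        for (int j = row_offsets[i]; j < row_offsets[i + 1]; ++j)
          degrees[i] += values[j];
      return degrees;
    }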
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 61 +++++++++---------- .../raft/spectral/modularity_maximization.hpp | 6 +- cpp/include/raft/spectral/partition.hpp | 4 +- cpp/test/eigen_solvers.cu | 10 +-- cpp/test/spectral_matrix.cu | 33 +++++++--- 5 files changed, 64 insertions(+), 50 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 612b8ef65e..e1eaf237c3 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -21,6 +21,7 @@ #include #include +#include #include // ========================================================= @@ -95,6 +96,11 @@ class vector_t { return abs_left + abs_right; }); } + + template + void fill(ThrustExecPolicy t_exe_pol, value_type value) { + thrust::fill_n(t_exe_pol, buffer_, size_, value); + } }; template @@ -209,30 +215,31 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + template + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { - auto* v = diagonal_.raw(); - //TODO: more work, here: - // - // vector_t ones(nrows); - // ones.fill(1.0); - // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + vector_t ones{raft_handle, nrows}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } + template laplacian_matrix_t( - handle_t const& raft_handle, + handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) : sparse_matrix_t(raft_handle, csr_view), diagonal_(raft_handle, csr_view.number_of_vertices) { - //TODO: more work, here: - // - // vector_t ones(csr_view.number_of_vertices_); - // ones.fill(1.0); - // sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + vector_t ones{raft_handle, csr_view.number_of_vertices}; + ones.fill(thrust_exec_policy, 1.0); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } // y = alpha*A*x + beta*y @@ -242,6 +249,8 @@ struct laplacian_matrix_t : sparse_matrix_t { bool symmetric = false) const override { //TODO: call cusparse::csrmv ... 
and more: // + // // scales y by beta: + // // // if (beta == 0) // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) // else if (beta != 1) @@ -271,26 +280,22 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { + template modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz) { - auto* v = laplacian_matrix_t::diagonal_.raw(); - //TODO: more work, here: - // - // diag_nrm1_ = diagonal_.nrm1(); - } + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, + nrows, nnz) {} + template modularity_matrix_t( - handle_t const& raft_handle, + handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) - : laplacian_matrix_t(raft_handle, csr_view) { - //TODO: more work, here: - // - // diag_nrm1_ = diagonal_.nrm1(); - } + : laplacian_matrix_t( + raft_handle, thrust_exec_policy, csr_view) {} // y = alpha*A*x + beta*y // @@ -307,12 +312,6 @@ struct modularity_matrix_t : laplacian_matrix_t { // // y = y -(gamma/edge_sum)*d // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); } - - value_type get_diag_nrm1(void) const { - return diag_nrm1_; // TODO: replace w/ diag_.nrm1() - } - - value_type diag_nrm1_; // TODO: remove }; } // namespace matrix diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 6ab1b16659..8e198e515f 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -106,7 +106,7 @@ std::tuple modularity_maximization( // Initialize Modularity Matrix sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, graph}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -170,7 +170,7 @@ void analyzeModularity(handle_t const &handle, // Initialize Modularity sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, graph}; // Initialize output modularity = 0; @@ -189,7 +189,7 @@ void analyzeModularity(handle_t const &handle, } // modularity = modularity/nClusters; // divide by nnz - modularity = modularity / B.get_diag_nrm1(); + modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 6cc2744e96..746bb54b60 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -90,7 +90,7 @@ std::tuple partition( // Initialize Laplacian sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, graph}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -157,7 +157,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, graph}; // Initialize output cost = 0; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 87bf74bdde..b96bda0b34 100644 --- a/cpp/test/eigen_solvers.cu +++ 
b/cpp/test/eigen_solvers.cu @@ -37,12 +37,12 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); - laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; - ASSERT_EQ(nullptr, lm1.diagonal_.raw()); - index_type neigvs{10}; index_type maxiter{100}; index_type restart_iter{10}; @@ -61,10 +61,10 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, lm1, eigvals, eigvecs)); + eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, lm1, eigvals, eigvecs)); + eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } TEST(Raft, SpectralSolvers) { diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 30346d5da5..1052911c4f 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -47,15 +47,30 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); - laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; - laplacian_matrix_t lm2{h, empty_graph}; - ASSERT_EQ(nullptr, lm1.diagonal_.raw()); - ASSERT_EQ(nullptr, lm2.diagonal_.raw()); - - modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; - modularity_matrix_t mm2{h, empty_graph}; - ASSERT_EQ(nullptr, mm1.diagonal_.raw()); - ASSERT_EQ(nullptr, mm2.diagonal_.raw()); + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); + + auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args + + auto cnstr_lm2 = [&h, t_exe_pol, &empty_graph](void) { + laplacian_matrix_t lm2{h, t_exe_pol, empty_graph}; + }; + EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args + + auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; + }; + EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args + + auto cnstr_mm2 = [&h, t_exe_pol, &empty_graph](void) { + modularity_matrix_t mm2{h, t_exe_pol, empty_graph}; + }; + EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } } // namespace raft From 3f5ec592a551b3311eb3ed87f0f783cabfe08d80 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 20:50:51 -0500 Subject: [PATCH 54/88] Fixed mv() for laplacian matrix. 
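The update uses the decomposition L = D - A, so y = alpha*L*x + beta*y is assembled in three steps: scale y by beta, add the diagonal term with the diagmv kernel, then fold in the adjacency term through the base-class SpMV with coefficient -alpha. A serial sketch of that same order of operations (illustrative reference code, not the device implementation):

    #include <cstddef>
    #include <vector>

    // y = alpha*(D - A)*x + beta*y for a CSR adjacency matrix (ro, ci, av)
    // with precomputed degree diagonal d.
    void laplacian_mv_ref(std::vector<int> const& ro, std::vector<int> const& ci,
                          std::vector<double> const& av,
                          std::vector<double> const& d, double alpha,
                          std::vector<double> const& x, double beta,
                          std::vector<double>& y) {
      for (std::size_t i = 0; i < d.size(); ++i) {
        y[i] = beta * y[i] + alpha * d[i] * x[i];  // scale y, add diagonal term
        for (int j = ro[i]; j < ro[i + 1]; ++j)
          y[i] -= alpha * av[j] * x[ci[j]];        // subtract adjacency term
      }
    }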
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 63 +++++++++++-------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index e1eaf237c3..286fe5e6d2 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + // ========================================================= // Useful macros // ========================================================= @@ -87,6 +90,8 @@ class vector_t { value_type* raw(void) { return buffer_; } + value_type const* raw(void) const { return buffer_; } + template value_type nrm1(ThrustExecPolicy t_exe_pol) const { return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, @@ -203,6 +208,8 @@ struct sparse_matrix_t { #endif } + handle_t const& get_handle(void) const { return handle_; } + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate handle_t const& handle_; @@ -247,32 +254,38 @@ struct laplacian_matrix_t : sparse_matrix_t { void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { - //TODO: call cusparse::csrmv ... and more: + constexpr int BLOCK_SIZE = 1024; + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // scales y by beta: + // + if (beta == 0) { + CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); + } else if (beta != 1) { + CUBLAS_CHECK(linalg::cublasscal(cublas_h, n, &beta, y, 1, stream)); + } + + // Apply diagonal matrix + // + dim3 gridDim, blockDim; + gridDim.x = std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.y = 1; + gridDim.z = 1; + blockDim.x = BLOCK_SIZE; + blockDim.y = 1; + blockDim.z = 1; + utils::diagmv<<>>(n, alpha, diagonal_.raw(), + x, y); + CUDA_CHECK_LAST(); + + // Apply adjacency matrix // - // // scales y by beta: - // // - // if (beta == 0) - // CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) - // else if (beta != 1) - // thrust::transform(thrust::device_pointer_cast(y), - // thrust::device_pointer_cast(y + this->n), - // thrust::make_constant_iterator(beta), - // thrust::device_pointer_cast(y), - // thrust::multiplies()); - - // // Apply diagonal matrix - // dim3 gridDim, blockDim; - // gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - // gridDim.y = 1; - // gridDim.z = 1; - // blockDim.x = BLOCK_SIZE; - // blockDim.y = 1; - // blockDim.z = 1; - // diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - // cudaCheckError(); - - // // Apply adjacency matrix - // sparse_matrix_t::mv(-alpha, x, 1, y); + sparse_matrix_t::mv(-alpha, x, 1, y); } vector_t diagonal_; From ad36433dda9f2eaadf48d8dad52a7911ff7ac0b1 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Fri, 19 Jun 2020 21:13:10 -0500 Subject: [PATCH 55/88] Fixed mv() for modularity matrix. 
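With d the degree vector and edge_sum = |d|_1 (twice the total edge weight for an undirected graph), the modularity matrix is B = A - d*d'/edge_sum, so for alpha = 1 the product reduces to one SpMV, one dot product, and one axpy: y = A*x - (d'*x / edge_sum)*d. A serial sketch of that computation (illustrative names):

    #include <cstddef>
    #include <vector>

    // y = B*x with B = A - d*d'/sum(d): SpMV plus a rank-one correction.
    void modularity_mv_ref(std::vector<int> const& ro,
                           std::vector<int> const& ci,
                           std::vector<double> const& av,
                           std::vector<double> const& d,
                           std::vector<double> const& x,
                           std::vector<double>& y) {
      double edge_sum = 0, dot = 0;
      for (std::size_t i = 0; i < d.size(); ++i) {
        edge_sum += d[i];  // degrees are non-negative, so sum == l1 norm
        dot += d[i] * x[i];
      }
      for (std::size_t i = 0; i < d.size(); ++i) {
        y[i] = 0;
        for (int j = ro[i]; j < ro[i + 1]; ++j) y[i] += av[j] * x[ci[j]];
        y[i] -= (dot / edge_sum) * d[i];  // rank-one correction
      }
    }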
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 286fe5e6d2..22c6416c4e 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -301,30 +301,55 @@ struct modularity_matrix_t : laplacian_matrix_t { index_type const nrows, index_type const nnz) : laplacian_matrix_t( raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) {} + nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } template modularity_matrix_t( handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, GraphCSRView const& csr_view) - : laplacian_matrix_t( - raft_handle, thrust_exec_policy, csr_view) {} + : laplacian_matrix_t(raft_handle, + thrust_exec_policy, csr_view) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); + } // y = alpha*A*x + beta*y // void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, bool transpose = false, bool symmetric = false) const override { - //TODO: call cusparse::csrmv ... and more: + auto n = sparse_matrix_t::nrows_; + + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); + + // y = A*x + // + sparse_matrix_t::mv(alpha, x, 0, y); + value_type dot_res; + + // gamma = d'*x // - // // y = A*x - // sparse_matrix_t::mv(alpha, x, 0, y); - // value_type dot_res; - // // gamma = d'*x // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // // y = y -(gamma/edge_sum)*d - // Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); + CUBLAS_CHECK(linalg::cublasdot( + cublas_h, n, laplacian_matrix_t::diagonal_.raw(), + 1, x, 1, &dot_res, stream)); + + // y = y -(gamma/edge_sum)*d + // + value_type gamma_ = -dot_res / edge_sum_; + CUBLAS_CHECK(linalg::cublasaxpy( + cublas_h, n, &gamma_, + laplacian_matrix_t::diagonal_.raw(), 1, y, 1, + stream)); } + + value_type edge_sum_; }; } // namespace matrix From 8ea55828df1550b15395eab3192a296714bbe771 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 22 Jun 2020 09:30:52 -0500 Subject: [PATCH 56/88] Updated CHANGELOG.md. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d9a391369f..457312f06b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # RAFT 0.15.0 (Date TBD) ## New Features +- PR #12: Spectral clustering. ## Improvements From 5474e99a1916902b1875143823baec6035226210 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 22 Jun 2020 18:38:52 -0500 Subject: [PATCH 57/88] Integrated raft error control from outer PR. 
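The stop-gap helpers from error_temp.hpp give way to the project-wide RAFT_EXPECTS / RAFT_FAIL and CHECK_CUDA facilities, which throw instead of returning error flags. The essential contract, as a minimal sketch (illustrative only; the real RAFT_EXPECTS also formats the message and records file and line information):

    #include <stdexcept>

    // Throw on a violated precondition instead of propagating an int flag.
    #define EXPECTS_SKETCH(cond, reason)               \
      do {                                             \
        if (!(cond)) throw std::runtime_error(reason); \
      } while (0)

    // usage: EXPECTS_SKETCH(x != nullptr, "Null x buffer.");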
--- cpp/include/raft/spectral/cluster_solvers.hpp | 10 +-- cpp/include/raft/spectral/eigen_solvers.hpp | 24 +++--- cpp/include/raft/spectral/kmeans.hpp | 57 ++++++++------- cpp/include/raft/spectral/lanczos.hpp | 73 ++++++++++--------- cpp/include/raft/spectral/lapack.hpp | 4 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 8 +- .../raft/spectral/modularity_maximization.hpp | 10 +-- cpp/include/raft/spectral/partition.hpp | 8 +- cpp/include/raft/spectral/spectral_util.hpp | 11 ++- .../spectral/{error_temp.hpp => warn_dbg.hpp} | 18 +---- 10 files changed, 107 insertions(+), 116 deletions(-) rename cpp/include/raft/spectral/{error_temp.hpp => warn_dbg.hpp} (65%) diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index b19237d1a8..922ae7cfab 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -48,13 +48,13 @@ struct kmeans_solver_t { size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { - RAFT_EXPECT(obs != nullptr, "Null obs buffer."); - RAFT_EXPECT(codes != nullptr, "Null codes buffer."); + RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); + RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - RAFT_TRY(kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed)); + kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, + config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 97114661c5..056189dcba 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -49,13 +49,13 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - RAFT_TRY(computeSmallestEigenvectors( - handle, A, config_.n_eigVecs, config_.maxIter, config_.restartIter, - config_.tol, config_.reorthogonalize, iters, eigVals, eigVecs, - config_.seed)); + computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, + eigVecs, config_.seed); return iters; } @@ -64,13 +64,13 @@ struct lanczos_solver_t { sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - RAFT_TRY(computeLargestEigenvectors(handle, A, config_.n_eigVecs, - config_.maxIter, config_.restartIter, - config_.tol, config_.reorthogonalize, - iters, eigVals, eigVecs, config_.seed)); + computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, eigVecs, + config_.seed); return iters; } diff --git 
a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 444bf2491a..db85e25dea 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -29,11 +29,12 @@ #include #include +#include #include #include -#include #include #include +#include namespace { @@ -346,7 +347,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); @@ -357,7 +358,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - thrust::device_pointer_cast(distsCumSum)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); @@ -435,7 +436,7 @@ static int initializeCentroids( // Choose first centroid thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); @@ -444,7 +445,7 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { @@ -457,19 +458,19 @@ static int initializeCentroids( CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Recompute minimum distances minDistances2<<>>(n, dists, dists + n, codes, i); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); computeClusterSizes<<>>(n, k, codes, clusterSizes); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } @@ -520,7 +521,7 @@ static int assignCentroids( gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); computeDistances<<>>(n, d, k, obs, centroids, dists); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Find centroid closest to each observation vector CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); @@ -532,7 +533,7 @@ static int assignCentroids( gridDim.z = 1; minDistances<<>>(n, k, dists, codes, clusterSizes); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Compute residual sum of squares *residual_host = @@ -599,31 +600,31 @@ static int updateCentroids(handle_t const& handle, // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::modulus()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); 
thrust::transform(thrust_exec_policy, rows, rows + d * n, thrust::make_constant_iterator(n), rows, thrust::divides()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Divide sums by cluster size to get centroid matrix blockDim.x = WARP_SIZE; @@ -634,7 +635,7 @@ static int updateCentroids(handle_t const& handle, gridDim.z = 1; divideCentroids<<>>(d, k, clusterSizes, centroids); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } @@ -728,20 +729,20 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); *residual_host = thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return 0; } if (n <= k) { thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); if (n < k) CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, @@ -802,7 +803,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), thrust::device_pointer_cast(clusterSizes + k), 0) - thrust::device_pointer_cast(clusterSizes)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Check for convergence @@ -852,11 +853,11 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, using namespace matrix; // Check that parameters are valid - RAFT_EXPECT(n > 0, "invalid parameter (n<1)"); - RAFT_EXPECT(d > 0, "invalid parameter (d<1)"); - RAFT_EXPECT(k > 0, "invalid parameter (k<1)"); - RAFT_EXPECT(tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECT(maxiter >= 0, "invalid parameter (maxiter<0)"); + RAFT_EXPECTS(n > 0, "invalid parameter (n<1)"); + RAFT_EXPECTS(d > 0, "invalid parameter (d<1)"); + RAFT_EXPECTS(k > 0, "invalid parameter (k<1)"); + RAFT_EXPECTS(tol > 0, "invalid parameter (tol<=0)"); + RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory vector_t clusterSizes(handle, k); diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 6a4a016e4f..8aa615c25d 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -25,11 +25,12 @@ #include #include +#include #include #include -#include #include #include +#include namespace raft { @@ -100,7 +101,7 @@ int performLanczosIteration( auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - RAFT_EXPECT(A != nullptr, "Null matrix pointer."); + RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); IndexType_ n = A->nrows_; @@ -672,11 +673,12 @@ int computeSmallestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, 
"Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -696,8 +698,8 @@ int computeSmallestEigenvectors( work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -706,16 +708,15 @@ int computeSmallestEigenvectors( // Random number generator curandGenerator_t randGen; // Initialize random number generator - CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); // FIXME: This is hard coded, which is good for unit testing... // but should really be a parameter so it could be // "random" for real runs and "fixed" for tests - CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/)); - // CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL))); + curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/); + // Initialize initial Lanczos vector - CUDA_TRY( - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); ValueType_ normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); @@ -821,7 +822,7 @@ int computeSmallestEigenvectors( *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit - CUDA_TRY(curandDestroyGenerator(randGen)); + curandDestroyGenerator(randGen); return 0; } @@ -874,11 +875,12 @@ int computeSmallestEigenvectors( IndexType_ n = A.nrows_; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); @@ -987,11 +989,12 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + 
RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -1021,11 +1024,10 @@ int computeLargestEigenvectors( // Random number generator curandGenerator_t randGen; // Initialize random number generator - CUDA_TRY(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10)); - CUDA_TRY(curandSetPseudoRandomGeneratorSeed(randGen, seed)); + curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); + curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector - CUDA_TRY( - curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one)); + curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); ValueType_ normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); @@ -1141,7 +1143,7 @@ int computeLargestEigenvectors( *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit - CUDA_TRY(curandDestroyGenerator(randGen)); + curandDestroyGenerator(randGen); return 0; } @@ -1194,11 +1196,12 @@ int computeLargestEigenvectors(handle_t const &handle, IndexType_ n = A.nrows_; // Check that parameters are valid - RAFT_EXPECT(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); - RAFT_EXPECT(restartIter > 0, "Invalid restartIter."); - RAFT_EXPECT(tol > 0, "Invalid tolerance."); - RAFT_EXPECT(maxIter >= nEigVecs, "Invalid maxIter."); - RAFT_EXPECT(restartIter >= nEigVecs, "Invalid restartIter."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); + RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); + RAFT_EXPECTS(tol > 0, "Invalid tolerance."); + RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); + RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory std::vector alpha_host_v(restartIter); diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 0dab3d57b2..4417640705 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include //for now; TODO: check if/where this `define` should be; // @@ -33,7 +33,7 @@ namespace raft { std::stringstream ss; \ ss << "Lapack error: argument number " << -status \ << " had an illegal value."; \ - RAFT_FAIL(ss.str()); \ + throw exception(ss.str()); \ } else if (status > 0) \ RAFT_FAIL("Lapack error: internal error."); \ } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 22c6416c4e..bd03038373 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -15,11 +15,11 @@ */ #pragma once +#include #include #include #include #include -#include #include #include @@ -143,8 +143,8 @@ struct sparse_matrix_t { bool symmetric = false) const { using namespace sparse; - RAFT_EXPECT(x != nullptr, "Null x buffer."); - RAFT_EXPECT(y != nullptr, "Null y buffer."); + RAFT_EXPECTS(x != nullptr, "Null x buffer."); + RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); auto stream = handle_.get_stream(); @@ -281,7 +281,7 @@ struct laplacian_matrix_t : sparse_matrix_t { blockDim.z = 1; utils::diagmv<<>>(n, alpha, 
diagonal_.raw(), x, y); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Apply adjacency matrix // diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 8e198e515f..679b5ae7df 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -90,9 +90,9 @@ std::tuple modularity_maximization( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -121,7 +121,7 @@ std::tuple modularity_maximization( // notice that at this point the matrix has already been transposed, so we are scaling // columns scale_obs(nEigVecs, n, eigVecs); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Find partition clustering auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, @@ -151,7 +151,7 @@ void analyzeModularity(handle_t const &handle, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 746bb54b60..0c72694f07 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -70,9 +70,9 @@ std::tuple partition( GraphCSRView const &graph, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECT(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECT(eigVecs != nullptr, "Null eigVecs buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -137,7 +137,7 @@ void analyzePartition(handle_t const &handle, GraphCSRView const &graph, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { - RAFT_EXPECT(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); edge_t i; edge_t n = graph.number_of_vertices; diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 2cc38cbbf1..8f8eb3ad8b 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include @@ -110,7 +110,6 @@ cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { // launch scaling kernel (scale each column of obs by its norm) scale_obs_kernel<<>>(m, n, obs); - CUDA_CHECK_LAST(); return cudaSuccess; } @@ -133,7 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, mean = 
thrust::reduce( thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), @@ -141,7 +140,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::make_constant_iterator(mean), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::minus()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); CUBLAS_CHECK( cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); @@ -154,7 +153,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::make_constant_iterator(std), thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), thrust::divides()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // Transpose eigenvector matrix @@ -213,7 +212,7 @@ bool construct_indicator(handle_t const& handle, thrust::device_pointer_cast(clusters + n), thrust::device_pointer_cast(part_i.raw() + n))), equal_to_i_op(index)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // Compute size of ith partition CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, diff --git a/cpp/include/raft/spectral/error_temp.hpp b/cpp/include/raft/spectral/warn_dbg.hpp similarity index 65% rename from cpp/include/raft/spectral/error_temp.hpp rename to cpp/include/raft/spectral/warn_dbg.hpp index 7d525ae5f1..406f1b7c7e 100644 --- a/cpp/include/raft/spectral/error_temp.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -6,24 +6,12 @@ #define STRINGIFY_DETAIL(x) #x #define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) -///#define RAFT_EXPECT(cond, reason) -inline void RAFT_EXPECT(bool cond, std::string const& reason) { - if (!cond) throw std::runtime_error(reason.c_str()); -} - -#define RAFT_TRY(expression) (expression) - -//assume RAFT_FAIL() can take a std::string `reason` -// -#define RAFT_FAIL(reason) - -#define CUDA_TRY(call) (call) - -#define CUDA_CHECK_LAST() - #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) + +//nope: +// #define WARNING(message) \ do { \ std::stringstream ss; \ From 417581954daa1c1a6c906e225cb4be13a388ca9f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 24 Jun 2020 18:54:55 -0500 Subject: [PATCH 58/88] Replaced buggy Thrust call with simplified logic. 
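For context, the replaced thrust::lower_bound implemented inverse-CDF sampling for k-means++: draw index i with probability dists[i]/sum(dists) by searching the inclusive cumulative sums at a uniform deviate. A host-side sketch of that original intent, with std::lower_bound standing in for the Thrust call (reference only):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Weighted draw for k-means++: cumsum holds inclusive prefix sums of
    // the squared distances, r is uniform in [0, 1).
    std::size_t sample_by_weight(std::vector<double> const& cumsum, double r) {
      auto it = std::lower_bound(cumsum.begin(), cumsum.end(),
                                 r * cumsum.back());
      return static_cast<std::size_t>(it - cumsum.begin());
    }

The interim linear interpolation below trades the exactness of this search for robustness until the upstream Thrust fix ships.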
--- cpp/include/raft/spectral/kmeans.hpp | 31 ++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index db85e25dea..ec5d1d67da 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -354,10 +354,33 @@ static int chooseNewCentroid(handle_t const& handle, // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) - obsIndex = (thrust::lower_bound( - thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), - thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - - thrust::device_pointer_cast(distsCumSum)); + // + //seg-faults due to Thrust bug + //on binary-search-like algorithms + //when run with stream dependent + //execution policies; fixed on Thrust GitHub + //hence replace w/ linear interpolation, + //until the Thrust issue gets resolved: + // + // obsIndex = (thrust::lower_bound( + // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), + // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - + // thrust::device_pointer_cast(distsCumSum)); + // + //linear interpolation logic: + //{ + ValueType_ minSum{0}; + CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(ValueType_), + cudaMemcpyDeviceToHost)); + if (distsSum > minSum) { + ValueType_ vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); + } else { + obsIndex = 0; + } + //} + CHECK_CUDA(stream); obsIndex = max(obsIndex, 0); obsIndex = min(obsIndex, n - 1); From 9fdf4d618cffa9d32be6a329bce0d7934ebded2b Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 1 Jul 2020 16:05:55 -0500 Subject: [PATCH 59/88] Removed useless graph.hpp dependency in spectral clustering. 
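sparse_matrix_t's view constructor becomes a template over any type that structurally provides the CSR members, so the heavyweight graph header no longer needs to be pulled in. A sketch of the minimal duck-typed view it accepts (mirroring the csr_view_t the test below defines):

    // Any aggregate exposing these five members, with these names, can
    // seed a sparse_matrix_t through the new templated constructor.
    template <typename index_type, typename value_type>
    struct csr_view_sketch {
      index_type* offsets;
      index_type* indices;
      value_type* edge_data;
      index_type number_of_vertices;
      index_type number_of_edges;
    };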
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 28 +++++++++---------- .../raft/spectral/modularity_maximization.hpp | 26 ++++++++--------- cpp/include/raft/spectral/partition.hpp | 26 ++++++++--------- cpp/test/cluster_solvers.cu | 10 +++---- cpp/test/eigen_solvers.cu | 14 +++++----- cpp/test/spectral_matrix.cu | 25 +++++++++++------ 6 files changed, 67 insertions(+), 62 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index bd03038373..1c78fd16fd 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -120,9 +119,8 @@ struct sparse_matrix_t { nrows_(nrows), nnz_(nnz) {} - sparse_matrix_t( - handle_t const& raft_handle, - GraphCSRView const& csr_view) + template + sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) : handle_(raft_handle), row_offsets_(csr_view.offsets), col_indices_(csr_view.indices), @@ -238,12 +236,14 @@ struct laplacian_matrix_t : sparse_matrix_t { } template - laplacian_matrix_t( - handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const& csr_view) - : sparse_matrix_t(raft_handle, csr_view), - diagonal_(raft_handle, csr_view.number_of_vertices) { - vector_t ones{raft_handle, csr_view.number_of_vertices}; + laplacian_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) + : sparse_matrix_t(raft_handle, csr_m.row_offsets_, + csr_m.col_indices_, csr_m.values_, + csr_m.nrows_, csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) { + vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(thrust_exec_policy, 1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); @@ -307,11 +307,11 @@ struct modularity_matrix_t : laplacian_matrix_t { } template - modularity_matrix_t( - handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const& csr_view) + modularity_matrix_t(handle_t const& raft_handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m) : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_view) { + thrust_exec_policy, csr_m) { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( thrust_exec_policy); } diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 679b5ae7df..5ac33eda43 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -82,12 +82,11 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); @@ -100,13 +99,13 @@ std::tuple modularity_maximization( std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t n = graph.number_of_vertices; + vertex_t n = csr_m.nrows_; // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, thrust_exec_policy, graph}; + //sparse_matrix_t A{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -143,18 +142,17 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, weight_t &modularity) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - edge_t i; - edge_t n = graph.number_of_vertices; + vertex_t i; + vertex_t n = csr_m.nrows_; weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); @@ -169,8 +167,8 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - sparse_matrix_t A{handle, graph}; - modularity_matrix_t B{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; // Initialize output modularity = 0; diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 0c72694f07..841fca04d9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,12 +62,11 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, cluster solver residual, and number of cluster solver iterations. 
*/ -template +template std::tuple partition( handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); @@ -80,7 +79,7 @@ std::tuple partition( std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver - edge_t n = graph.number_of_vertices; + vertex_t n = csr_m.nrows_; // ------------------------------------------------------- // Spectral partitioner @@ -89,8 +88,8 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -130,17 +129,16 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - GraphCSRView const &graph, + sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - edge_t i; - edge_t n = graph.number_of_vertices; + vertex_t i; + vertex_t n = csr_m.nrows_; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -156,8 +154,8 @@ void analyzePartition(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, graph}; + ///sparse_matrix_t A{handle, graph}; + laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; // Initialize output cost = 0; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index d3d6a04312..04a94fbf22 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -89,16 +89,16 @@ TEST(Raft, ModularitySolvers) { kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - GraphCSRView empty_graph; + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, empty_graph, eig_solver, cluster_solver, clusters, eigvals, - eigvecs)); + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, empty_graph, k, - clusters, modularity)); + EXPECT_ANY_THROW( + spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index b96bda0b34..e6ee09262e 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -101,17 +101,17 @@ TEST(Raft, SpectralSolvers) { kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - GraphCSRView empty_graph; - auto t_exe_p = thrust::cuda::par.on(stream); - EXPECT_ANY_THROW(spectral::partition(h, t_exe_p, empty_graph, eig_solver, - cluster_solver, clusters, eigvals, - eigvecs)); + auto t_exe_p = thrust::cuda::par.on(stream); + sparse_matrix_t sm{h, nullptr, nullptr, + 
nullptr, 0, 0}; + EXPECT_ANY_THROW(spectral::partition( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, empty_graph, k, - clusters, edgeCut, cost)); + EXPECT_ANY_THROW( + spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 1052911c4f..e5c2d52764 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -22,7 +22,16 @@ #include namespace raft { - +namespace { +template <typename index_type, typename value_type> +struct csr_view_t { + index_type* offsets; + index_type* indices; + value_type* edge_data; + index_type number_of_vertices; + index_type number_of_edges; +}; +} // namespace TEST(Raft, SpectralMatrices) { using namespace matrix; using index_type = int; @@ -32,18 +41,18 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); + csr_view_t<index_type, value_type> csr_v{nullptr, nullptr, nullptr, 0, 0}; + int const sz = 10; vector_t<value_type> d_v{h, sz}; - GraphCSRView<index_type, index_type, value_type> empty_graph; - index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz}; - sparse_matrix_t<index_type, value_type> sm2{h, empty_graph}; + sparse_matrix_t<index_type, value_type> sm2{h, csr_v}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); @@ -56,8 +65,8 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, t_exe_pol, &empty_graph](void) { - laplacian_matrix_t<index_type, value_type> lm2{h, t_exe_pol, empty_graph}; + auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { + laplacian_matrix_t<index_type, value_type> lm2{h, t_exe_pol, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args @@ -67,8 +76,8 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &empty_graph](void) { - modularity_matrix_t<index_type, value_type> mm2{h, t_exe_pol, empty_graph}; + auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { + modularity_matrix_t<index_type, value_type> mm2{h, t_exe_pol, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } From 28e9d4adfe604bac89c825d27cc5a545417d535f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Wed, 1 Jul 2020 16:10:32 -0500 Subject: [PATCH 60/88] Removed unnecessary graph.hpp. --- cpp/include/raft/graph.hpp | 550 ------------------------------------- 1 file changed, 550 deletions(-) delete mode 100644 cpp/include/raft/graph.hpp diff --git a/cpp/include/raft/graph.hpp b/cpp/include/raft/graph.hpp deleted file mode 100644 index 089decc8ee..0000000000 --- a/cpp/include/raft/graph.hpp +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -#pragma once -/// #include // TODO: clarify what must be done about `comm` -#include -#include -#include - -#include -#include - -namespace raft { -namespace matrix { - -enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; - -struct GraphProperties { - bool directed{false}; - bool weighted{false}; - bool multigraph{false}; - bool bipartite{false}; - bool tree{false}; - PropType has_negative_edges{PropType::PROP_UNDEF}; - GraphProperties() = default; -}; - -enum class DegreeDirection { - IN_PLUS_OUT = 0, ///> Compute sum of in and out degree - IN, ///> Compute in degree - OUT, ///> Compute out degree - DEGREE_DIRECTION_COUNT -}; - -/** - * @brief Base class graphs, all but vertices and edges - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphViewBase { - public: - WT *edge_data; ///< edge weight - /// Comm comm; // TODO: clarify what must be done about `comm` - - GraphProperties prop; - - VT number_of_vertices; - ET number_of_edges; - - /** - * @brief Fill the identifiers array with the vertex identifiers. - * - * @param[out] identifier Pointer to device memory to store the vertex - * identifiers - */ - void get_vertex_identifiers(VT *identifiers) const; - /// void set_communicator(Comm &comm_) { comm = comm_; } // TODO: see above - - GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : edge_data(edge_data_), - /// comm(), // TODO: see above - prop(), - number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) {} - bool has_data(void) const { return edge_data != nullptr; } -}; - -/** - * @brief A graph stored in COO (COOrdinate) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCOOView : public GraphViewBase { - public: - VT *src_indices{nullptr}; ///< rowInd - VT *dst_indices{nullptr}; ///< colInd - - /** - * @brief Computes degree(in, out, in+out) of all the nodes of a Graph - * - * @throws cugraph::logic_error when an error occurs. - * - * @param[out] degree Device array of size V (V is number of vertices) initialized - * to zeros. Will contain the computed degree of every vertex. - * @param[in] direction IN_PLUS_OUT, IN or OUT - */ - void degree(ET *degree, DegreeDirection direction) const; - - /** - * @brief Default constructor - */ - GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} - - /** - * @brief Wrap existing arrays representing an edge list in a Graph. - * - * GraphCOOView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param source_indices This array of size E (number of edges) contains the index of the - * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the - * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. 
- * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCOOView(VT *src_indices_, VT *dst_indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, - number_of_edges_), - src_indices(src_indices_), - dst_indices(dst_indices_) {} -}; - -/** - * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed - * Sparse Column) format - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCompressedSparseBaseView : public GraphViewBase { - public: - ET *offsets{nullptr}; ///< CSR offsets - VT *indices{nullptr}; ///< CSR indices - - /** - * @brief Fill the identifiers in the array with the source vertex - * identifiers - * - * @param[out] src_indices Pointer to device memory to store the - * source vertex identifiers - */ - void get_source_indices(VT *src_indices) const; - - /** - * @brief Computes degree(in, out, in+out) of all the nodes of a Graph - * - * @throws cugraph::logic_error when an error occurs. - * - * @param[out] degree Device array of size V (V is number of vertices) initialized - * to zeros. Will contain the computed degree of every vertex. - * @param[in] x Integer value indicating type of degree calculation - * 0 : in+out degree - * 1 : in-degree - * 2 : out-degree - */ - void degree(ET *degree, DegreeDirection direction) const; - - /** - * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCompressedSparseBaseView(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, - number_of_edges_), - offsets{offsets_}, - indices{indices_} {} -}; - -/** - * @brief A graph stored in CSR (Compressed Sparse Row) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSRView : public GraphCompressedSparseBaseView { - public: - /** - * @brief Default constructor - */ - GraphCSRView() - : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, - 0) {} - - /** - * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. 
- * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCSRView(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { - } -}; - -/** - * @brief A graph stored in CSC (Compressed Sparse Column) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSCView : public GraphCompressedSparseBaseView { - public: - /** - * @brief Default constructor - */ - GraphCSCView() - : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, - 0) {} - - /** - * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. - * GraphCSCView does not own the memory used to represent this graph. This - * function does not allocate memory. - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCSCView(ET *offsets_, VT *indices_, WT *edge_data_, - VT number_of_vertices_, ET number_of_edges_) - : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) { - } -}; - -/** - * @brief TODO : Change this Take ownership of the provided graph arrays in COO format - * - * @param source_indices This array of size E (number of edges) contains the index of the - * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the - * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ -template -struct GraphCOOContents { - VT number_of_vertices; - ET number_of_edges; - std::unique_ptr src_indices; - std::unique_ptr dst_indices; - std::unique_ptr edge_data; -}; - -/** - * @brief A constructed graph stored in COO (COOrdinate) format. - * - * This class will src_indices and dst_indicies (until moved) - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCOO { - VT number_of_vertices_; - ET number_of_edges_; - rmm::device_buffer src_indices_{}; ///< rowInd - rmm::device_buffer dst_indices_{}; ///< colInd - rmm::device_buffer edge_data_{}; ///< CSR data - - public: - /** - * @brief Take ownership of the provided graph arrays in COO format - * - * @param source_indices This array of size E (number of edges) contains the index of the - * source for each edge. 
Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the - * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCOO( - VT number_of_vertices, ET number_of_edges, bool has_data = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : number_of_vertices_(number_of_vertices), - number_of_edges_(number_of_edges), - src_indices_(sizeof(VT) * number_of_edges, stream, mr), - dst_indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) {} - - GraphCOO( - GraphCOOView const &graph, cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : number_of_vertices_(graph.number_of_vertices), - number_of_edges_(graph.number_of_edges), - src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), - stream, mr), - dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), - stream, mr) { - if (graph.has_data()) { - edge_data_ = rmm::device_buffer{ - graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; - } - } - - VT number_of_vertices(void) { return number_of_vertices_; } - ET number_of_edges(void) { return number_of_edges_; } - VT *src_indices(void) { return static_cast(src_indices_.data()); } - VT *dst_indices(void) { return static_cast(dst_indices_.data()); } - WT *edge_data(void) { return static_cast(edge_data_.data()); } - - GraphCOOContents release() noexcept { - VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; - return GraphCOOContents{ - number_of_vertices, number_of_edges, - std::make_unique(std::move(src_indices_)), - std::make_unique(std::move(dst_indices_)), - std::make_unique(std::move(edge_data_))}; - } - - GraphCOOView view(void) noexcept { - return GraphCOOView(src_indices(), dst_indices(), edge_data(), - number_of_vertices_, number_of_edges_); - } - - bool has_data(void) { return nullptr != edge_data_.data(); } -}; - -template -struct GraphSparseContents { - VT number_of_vertices; - ET number_of_edges; - std::unique_ptr offsets; - std::unique_ptr indices; - std::unique_ptr edge_data; -}; - -/** - * @brief Base class for constructted graphs stored in CSR (Compressed Sparse Row) format or - * CSC (Compressed Sparse Column) format - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCompressedSparseBase { - VT number_of_vertices_{0}; - ET number_of_edges_{0}; - rmm::device_buffer offsets_{}; ///< CSR offsets - rmm::device_buffer indices_{}; ///< CSR indices - rmm::device_buffer edge_data_{}; ///< CSR data - - bool has_data_{false}; - - public: - /** - * @brief Take ownership of the provided graph arrays in CSR/CSC format - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. 
Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCompressedSparseBase(VT number_of_vertices, ET number_of_edges, - bool has_data, cudaStream_t stream, - rmm::mr::device_memory_resource *mr) - : number_of_vertices_(number_of_vertices), - number_of_edges_(number_of_edges), - offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), - indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) {} - - GraphCompressedSparseBase(GraphSparseContents &&contents) - : number_of_vertices_(contents.number_of_vertices), - number_of_edges_(contents.number_of_edges), - offsets_(std::move(*contents.offsets.release())), - indices_(std::move(*contents.indices.release())), - edge_data_(std::move(*contents.edge_data.release())) {} - - VT number_of_vertices(void) { return number_of_vertices_; } - ET number_of_edges(void) { return number_of_edges_; } - ET *offsets(void) { return static_cast(offsets_.data()); } - VT *indices(void) { return static_cast(indices_.data()); } - WT *edge_data(void) { return static_cast(edge_data_.data()); } - - GraphSparseContents release() noexcept { - VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; - return GraphSparseContents{ - number_of_vertices, number_of_edges, - std::make_unique(std::move(offsets_)), - std::make_unique(std::move(indices_)), - std::make_unique(std::move(edge_data_))}; - } - - bool has_data(void) { return nullptr != edge_data_.data(); } -}; - -/** - * @brief A constructed graph stored in CSR (Compressed Sparse Row) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSR : public GraphCompressedSparseBase { - public: - /** - * @brief Default constructor - */ - GraphCSR() : GraphCompressedSparseBase() {} - - /** - * @brief Take ownership of the provided graph arrays in CSR format - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. 
- * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCSR( - VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} - - GraphCSR(GraphSparseContents &&contents) - : GraphCompressedSparseBase(std::move(contents)) {} - - GraphCSRView view(void) noexcept { - return GraphCSRView( - GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); - } -}; - -/** - * @brief A constructed graph stored in CSC (Compressed Sparse Column) format. - * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight - */ -template -class GraphCSC : public GraphCompressedSparseBase { - public: - /** - * @brief Default constructor - */ - GraphCSC() : GraphCompressedSparseBase() {} - - /** - * @brief Take ownership of the provided graph arrays in CSR format - * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. - * @param number_of_vertices The number of vertices in the graph - * @param number_of_edges The number of edges in the graph - */ - GraphCSC( - VT number_of_vertices_, ET number_of_edges_, bool has_data_ = false, - cudaStream_t stream = nullptr, - rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) {} - - GraphCSC(GraphSparseContents &&contents) - : GraphCompressedSparseBase(contents) {} - - GraphCSCView view(void) noexcept { - return GraphCSCView( - GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); - } -}; - -} // namespace matrix -} // namespace raft From 87f6315d1d22a7cd7f5db5da15303f4bc1438ef9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 12:04:35 -0500 Subject: [PATCH 61/88] Addressed code reviews on kmeans dox. --- cpp/include/raft/spectral/kmeans.hpp | 145 +++++++++++++++++---------- 1 file changed, 91 insertions(+), 54 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index ec5d1d67da..53a1b1278a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -52,12 +52,14 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // CUDA kernels // ========================================================= -/// Compute distances between observation vectors and centroids -/** Block dimensions should be (warpSize, 1, - * blockSize/warpSize). 
Ideally, the grid is large enough so there - * are d threads in the x-direction, k threads in the y-direction, - * and n threads in the z-direction. - * +/** + * @brief Compute distances between observation vectors and centroids + * Block dimensions should be (warpSize, 1, + * blockSize/warpSize). Ideally, the grid is large enough so there + * are d threads in the x-direction, k threads in the y-direction, + * and n threads in the z-direction. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -130,10 +132,12 @@ static __global__ void computeDistances( } } -/// Find closest centroid to observation vectors -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Find closest centroid to observation vectors. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. * @param centroids (Input, d*k entries) Centroid matrix. Matrix is @@ -188,10 +192,12 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, } } -/// Check if newly computed distances are smaller than old distances -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Check if newly computed distances are smaller than old distances. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between * observation vectors and closest centroids. On exit, entries @@ -236,10 +242,11 @@ static __global__ void minDistances2(IndexType_ n, } } -/// Compute size of k-means clusters -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * +/** + * @brief Compute size of k-means clusters. + * Block and grid dimensions should be 1-dimensional. Ideally the + * grid is large enough so there are n threads. + * @tparam Index_Type_ the type of data used for indexing. * @param n Number of observation vectors. * @param k Number of clusters. * @param codes (Input, n entries) Cluster assignments. @@ -257,15 +264,17 @@ static __global__ void computeClusterSizes( } } -/// Divide rows of centroid matrix by cluster sizes -/** Divides the ith column of the sum matrix by the size of the ith - * cluster. If the sum matrix has been initialized so that the ith - * row is the sum of all observation vectors in the ith cluster, - * this kernel produces cluster centroids. The grid and block - * dimensions should be 2-dimensional. Ideally the grid is large - * enough so there are d threads in the x-direction and k threads - * in the y-direction. - * +/** + * @brief Divide rows of centroid matrix by cluster sizes. + * Divides the ith column of the sum matrix by the size of the ith + * cluster. 
If the sum matrix has been initialized so that the ith + * row is the sum of all observation vectors in the ith cluster, + * this kernel produces cluster centroids. The grid and block + * dimensions should be 2-dimensional. Ideally the grid is large + * enough so there are d threads in the x-direction and k threads + * in the y-direction. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. * @param d Dimension of observation vectors. * @param k Number of clusters. * @param clusterSizes (Input, k entries) Number of points in each @@ -309,9 +318,13 @@ static __global__ void divideCentroids( // Helper functions // ========================================================= -/// Randomly choose new centroids -/** Centroid is randomly chosen with k-means++ algorithm. - * +/** + * @brief Randomly choose new centroids. + * Centroid is randomly chosen with k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -393,9 +406,14 @@ static int chooseNewCentroid(handle_t const& handle, return 0; } -/// Choose initial cluster centroids for k-means algorithm -/** Centroids are randomly chosen with k-means++ algorithm - * +/** + * @brief Choose initial cluster centroids for k-means algorithm. + * Centroids are randomly chosen with k-means++ algorithm + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -498,9 +516,14 @@ static int initializeCentroids( return 0; } -/// Find cluster centroids closest to observation vectors -/** Distance is measured with Euclidean norm. - * +/** + * @brief Find cluster centroids closest to observation vectors. + * Distance is measured with Euclidean norm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -566,9 +589,14 @@ static int assignCentroids( return 0; } -/// Update cluster centroids for k-means algorithm -/** All clusters are assumed to be non-empty. - * +/** + * @brief Update cluster centroids for k-means algorithm. + * All clusters are assumed to be non-empty. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. 
@@ -671,11 +699,16 @@ namespace raft { // k-means algorithm // ========================================================= -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -701,6 +734,7 @@ namespace raft { * vectors and centroids). * @param iters_host (Output, host memory, 1 entry) Number of * k-means iterations. + * @param seed random seed to be used. * @return error flag. */ template @@ -778,8 +812,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, // Initialize cuBLAS CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm @@ -844,13 +877,16 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, return 0; } -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam Index_Type_ the type of data used for indexing. + * @tparam ValueType_ the type of data used for weights, distances. + * @tparam ThrustExePolicy the type of thrust execution policy. + * @param handle the raft handle. + * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -864,7 +900,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * assignments. * @param residual On exit, residual sum of squares (sum of squares * of distances between observation vectors and centroids). - * @param On exit, number of k-means iterations. + * @param iters on exit, number of k-means iterations. + * @param seed random seed to be used. * @return error flag */ template From 98d7af62d07ff7a665ff4ddf90e813b293dc14fb Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 12:19:14 -0500 Subject: [PATCH 62/88] Addressed code reviews on kmeans lowercase_t types. 
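The lowercase, `_t`-suffixed names (index_type_t, value_type_t, thrust_exe_pol_t) match the convention already used by the other spectral headers (vertex_t, weight_t, sparse_matrix_t). A minimal sketch of the naming pattern this patch applies throughout kmeans.hpp (the kernel below is a made-up illustration for this note, not code from the patch):

    // Before: template <typename IndexType_, typename ValueType_>
    // After:  lowercase_t-style template parameter names.
    template <typename index_type_t, typename value_type_t>
    __global__ void axpy_kernel(index_type_t n, value_type_t alpha,
                                const value_type_t* __restrict__ x,
                                value_type_t* __restrict__ y) {
      index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
      if (i < n) y[i] += alpha * x[i];  // toy body, shown for naming only
    }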
--- cpp/include/raft/spectral/kmeans.hpp | 276 ++++++++++++++------------- 1 file changed, 143 insertions(+), 133 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 53a1b1278a..07c8748e1a 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -59,7 +59,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * are d threads in the x-direction, k threads in the y-direction, * and n threads in the z-direction. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -75,20 +75,22 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * centroid. Matrix dimensions are n x k. Entries must be * initialized to zero. */ -template +template static __global__ void computeDistances( - IndexType_ n, IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists) { + index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index - IndexType_ i; + index_type_t i; // Block indices - IndexType_ bidx; + index_type_t bidx; // Global indices - IndexType_ gidx, gidy, gidz; + index_type_t gidx, gidy, gidz; // Private memory - ValueType_ centroid_private, dist_private; + value_type_t centroid_private, dist_private; // Global x-index indicates index of vector entry bidx = blockIdx.x; @@ -137,7 +139,7 @@ static __global__ void computeDistances( * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. * @param centroids (Input, d*k entries) Centroid matrix. Matrix is @@ -153,20 +155,20 @@ static __global__ void computeDistances( * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ -template -static __global__ void minDistances(IndexType_ n, IndexType_ k, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) { +template +static __global__ void minDistances(index_type_t n, index_type_t k, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { // Loop index - IndexType_ i, j; + index_type_t i, j; // Current matrix entry - ValueType_ dist_curr; + value_type_t dist_curr; // Smallest entry in row - ValueType_ dist_min; - IndexType_ code_min; + value_type_t dist_min; + index_type_t code_min; // Each row in observation matrix is processed by a thread i = threadIdx.x + blockIdx.x * blockDim.x; @@ -197,7 +199,7 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. 
+ * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between * observation vectors and closest centroids. On exit, entries @@ -211,18 +213,18 @@ static __global__ void minDistances(IndexType_ n, IndexType_ k, * centroid. * @param code_new Index associated with new centroid. */ -template -static __global__ void minDistances2(IndexType_ n, - ValueType_* __restrict__ dists_old, - const ValueType_* __restrict__ dists_new, - IndexType_* __restrict__ codes_old, - IndexType_ code_new) { +template +static __global__ void minDistances2(index_type_t n, + value_type_t* __restrict__ dists_old, + const value_type_t* __restrict__ dists_new, + index_type_t* __restrict__ codes_old, + index_type_t code_new) { // Loop index - IndexType_ i; + index_type_t i; // Distances - ValueType_ dist_old_private; - ValueType_ dist_new_private; + value_type_t dist_old_private; + value_type_t dist_new_private; // Each row is processed by a thread i = threadIdx.x + blockIdx.x * blockDim.x; @@ -253,11 +255,11 @@ static __global__ void minDistances2(IndexType_ n, * @param clusterSizes (Output, k entries) Number of points in each * cluster. Entries must be initialized to zero. */ -template +template static __global__ void computeClusterSizes( - IndexType_ n, IndexType_ k, const IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) { - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; + index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); i += blockDim.x * gridDim.x; @@ -274,7 +276,7 @@ static __global__ void computeClusterSizes( * enough so there are d threads in the x-direction and k threads * in the y-direction. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. + * @tparam value_type_t the type of data used for weights, distances. * @param d Dimension of observation vectors. * @param k Number of clusters. * @param clusterSizes (Input, k entries) Number of points in each @@ -285,15 +287,15 @@ static __global__ void computeClusterSizes( * cluster. On exit, the matrix is the centroid matrix (each * column is the mean position of a cluster). */ -template +template static __global__ void divideCentroids( - IndexType_ d, IndexType_ k, const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids) { + index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices - IndexType_ gidx, gidy; + index_type_t gidx, gidy; // Current cluster size - IndexType_ clusterSize_private; + index_type_t clusterSize_private; // Observation vector is determined by global y-index gidy = threadIdx.y + blockIdx.y * blockDim.y; @@ -322,8 +324,8 @@ static __global__ void divideCentroids( * @brief Randomly choose new centroids. * Centroid is randomly chosen with k-means++ algorithm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. 
* @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -339,19 +341,21 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template +template static int chooseNewCentroid(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, ValueType_ rand, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ dists, - ValueType_* __restrict__ centroid) { + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, + value_type_t rand, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ dists, + value_type_t* __restrict__ centroid) { // Cumulative sum of distances - ValueType_* distsCumSum = dists + n; + value_type_t* distsCumSum = dists + n; // Residual sum of squares - ValueType_ distsSum{0}; + value_type_t distsSum{0}; // Observation vector that is chosen as new centroid - IndexType_ obsIndex; + index_type_t obsIndex; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -361,7 +365,7 @@ static int chooseNewCentroid(handle_t const& handle, thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), + CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost)); // Randomly choose observation vector @@ -382,13 +386,13 @@ static int chooseNewCentroid(handle_t const& handle, // //linear interpolation logic: //{ - ValueType_ minSum{0}; - CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(ValueType_), + value_type_t minSum{0}; + CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost)); if (distsSum > minSum) { - ValueType_ vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + value_type_t vIndex = static_cast(n - 1); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); } else { obsIndex = 0; } @@ -400,7 +404,7 @@ static int chooseNewCentroid(handle_t const& handle, // Record new centroid position CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(ValueType_), cudaMemcpyDeviceToDevice, + d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); return 0; @@ -410,8 +414,8 @@ static int chooseNewCentroid(handle_t const& handle, * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -432,26 +436,27 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template static int initializeCentroids( - handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - ValueType_* __restrict__ centroids, IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, ValueType_* __restrict__ dists, + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Loop index - IndexType_ i; + index_type_t i; // CUDA grid dimensions dim3 blockDim_warp, gridDim_warp, gridDim_block; // Random number generator thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); + thrust::uniform_real_distribution uniformDist(0, 1); auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -472,7 +477,7 @@ static int initializeCentroids( gridDim_block.z = 1; // Assign observation vectors to code 0 - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -483,7 +488,7 @@ static int initializeCentroids( WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid - CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); @@ -496,7 +501,7 @@ static int initializeCentroids( WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid - CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists + n, 0, n * sizeof(value_type_t), stream)); computeDistances<<>>( n, d, 1, obs, centroids + IDX(0, i, d), dists + n); CHECK_CUDA(stream); @@ -508,7 +513,7 @@ static int initializeCentroids( } // Compute cluster sizes - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); computeClusterSizes<<>>(n, k, codes, clusterSizes); CHECK_CUDA(stream); @@ -520,8 +525,8 @@ static int initializeCentroids( * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -544,13 +549,14 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template static int assignCentroids( - handle_t const& handle, ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) { + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { // CUDA grid dimensions dim3 blockDim, gridDim; @@ -558,7 +564,7 @@ static int assignCentroids( auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors - CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); blockDim.x = WARP_SIZE; blockDim.y = 1; blockDim.z = BLOCK_SIZE / WARP_SIZE; @@ -570,7 +576,7 @@ static int assignCentroids( CHECK_CUDA(stream); // Find centroid closest to each observation vector - CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_), stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; @@ -593,8 +599,8 @@ static int assignCentroids( * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -615,23 +621,24 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template static int updateCentroids(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, IndexType_ n, - IndexType_ d, IndexType_ k, - const ValueType_* __restrict__ obs, - const IndexType_* __restrict__ codes, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int) { + thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const index_type_t* __restrict__ codes, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -640,21 +647,21 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim, gridDim; // Device memory - thrust::device_ptr obs_copy(work); - thrust::device_ptr codes_copy(work_int); - thrust::device_ptr rows(work_int + d * n); + thrust::device_ptr obs_copy(work); + thrust::device_ptr codes_copy(work_int); + thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (ValueType_*)NULL, n, + d, &zero, (value_type_t*)NULL, n, thrust::raw_pointer_cast(obs_copy), n, stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, - thrust::modulus()); + thrust::make_constant_iterator(n), rows, + thrust::modulus()); CHECK_CUDA(stream); thrust::gather(thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); @@ -664,8 +671,8 @@ static int updateCentroids(handle_t const& handle, thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, - thrust::divides()); + thrust::make_constant_iterator(n), rows, + thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster @@ -705,8 +712,8 @@ namespace raft { * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -737,28 +744,30 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
*/ -template -int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, - IndexType_ maxiter, const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int, ValueType_* residual_host, - IndexType_* iters_host, unsigned long long seed) { +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids, + value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, + value_type_t* residual_host, index_type_t* iters_host, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Current iteration - IndexType_ iter; + index_type_t iter; // Residual sum of squares at previous iteration - ValueType_ residualPrev = 0; + value_type_t residualPrev = 0; // Random number generator thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution uniformDist(0, 1); + thrust::uniform_real_distribution uniformDist(0, 1); // ------------------------------------------------------- // Initialization @@ -769,8 +778,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, // Trivial cases if (k == 1) { - CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), + CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) @@ -783,7 +792,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, gridDim.y = 1; gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_), stream)); + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); @@ -803,8 +812,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, if (n < k) CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(IndexType_), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), + (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; @@ -837,7 +846,7 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - IndexType_ emptyCentroid = + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), thrust::device_pointer_cast(clusterSizes + k), 0) - @@ -883,8 +892,8 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. 
 * @tparam Index_Type_ the type of data used for indexing. - * @tparam ValueType_ the type of data used for weights, distances. - * @tparam ThrustExePolicy the type of thrust execution policy. + * @tparam value_type_t the type of data used for weights, distances. + * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy. * @param n Number of observation vectors. @@ -904,12 +913,13 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template <typename IndexType_, typename ValueType_, typename ThrustExePolicy> -int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - IndexType_ n, IndexType_ d, IndexType_ k, ValueType_ tol, - IndexType_ maxiter, const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, ValueType_& residual, - IndexType_& iters, unsigned long long seed = 123456) { +template <typename index_type_t, typename value_type_t, + typename thrust_exe_pol_t> +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, value_type_t& residual, + index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; // Check that parameters are valid @@ -920,13 +930,13 @@ int kmeans(handle_t const& handle, ThrustExePolicy thrust_exec_policy, RAFT_EXPECTS(maxiter >= 0, "invalid parameter (maxiter<0)"); // Allocate memory - vector_t<IndexType_> clusterSizes(handle, k); - vector_t<ValueType_> centroids(handle, d * k); - vector_t<ValueType_> work(handle, n * max(k, d)); - vector_t<IndexType_> work_int(handle, 2 * d * n); + vector_t<index_type_t> clusterSizes(handle, k); + vector_t<value_type_t> centroids(handle, d * k); + vector_t<value_type_t> work(handle, n * max(k, d)); + vector_t<index_type_t> work_int(handle, 2 * d * n); // Perform k-means - return kmeans<IndexType_, ValueType_>( + return kmeans<index_type_t, value_type_t>( handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); From 7c79256a614b1e4a7e72f9788404858dc99c6e6f Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 17:20:58 -0500 Subject: [PATCH 63/88] Addressed code reviews on kmeans cudaMemcpyAsync().
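Both device-to-host copies in chooseNewCentroid() are now ordered on the handle's stream, with an explicit synchronization before the copied values are read on the host. A minimal sketch of the pattern (h_sum and d_sum are placeholder names for this note, not identifiers from kmeans.hpp):

    // Stream-ordered copy: the host variable is only safe to read
    // after the stream has been synchronized.
    value_type_t h_sum{0};
    CUDA_TRY(cudaMemcpyAsync(&h_sum, d_sum, sizeof(value_type_t),
                             cudaMemcpyDeviceToHost, stream));
    CUDA_TRY(cudaStreamSynchronize(stream));
    // h_sum now holds the device-side value.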
--- cpp/include/raft/spectral/kmeans.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 07c8748e1a..e5c0876211 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -365,8 +365,8 @@ static int chooseNewCentroid(handle_t const& handle, thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest @@ -387,8 +387,10 @@ static int chooseNewCentroid(handle_t const& handle, //linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpy(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / From 283fa0b3c6c83604d2b31f953f2eda1e4820c5c9 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 17:20:58 -0500 Subject: [PATCH 64/88] Addressed code reviews on kmeans use of dim3{} cnstr. --- cpp/include/raft/spectral/kmeans.hpp | 72 +++++++++++++--------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index e5c0876211..c5c5e88b88 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -453,9 +453,6 @@ static int initializeCentroids( // Loop index index_type_t i; - // CUDA grid dimensions - dim3 blockDim_warp, gridDim_warp, gridDim_block; - // Random number generator thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); @@ -468,15 +465,14 @@ static int initializeCentroids( // ------------------------------------------------------- // Initialize grid dimensions - blockDim_warp.x = WARP_SIZE; - blockDim_warp.y = 1; - blockDim_warp.z = BSIZE_DIV_WSIZE; - gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim_warp.y = 1; - gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim_block.y = 1; - gridDim_block.z = 1; + dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; + + // CUDA grid dimensions + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)}; + + // CUDA grid dimensions + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); @@ -559,20 +555,22 @@ static int assignCentroids( const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* residual_host) { - // CUDA grid dimensions - dim3 blockDim, gridDim; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * 
sizeof(value_type_t), stream)); - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); + + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim; + constexpr index_type_t grid_lower_bound{65535}; + gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); + gridDim.y = min(k, grid_lower_bound); + gridDim.z = + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); @@ -645,9 +643,6 @@ static int updateCentroids(handle_t const& handle, auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); - // CUDA grid dimensions - dim3 blockDim, gridDim; - // Device memory thrust::device_ptr obs_copy(work); thrust::device_ptr codes_copy(work_int); @@ -687,12 +682,14 @@ static int updateCentroids(handle_t const& handle, CHECK_CUDA(stream); // Divide sums by cluster size to get centroid matrix - blockDim.x = WARP_SIZE; - blockDim.y = BLOCK_SIZE / WARP_SIZE; - blockDim.z = 1; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim.z = 1; + // + // CUDA grid dimensions + dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; + + // CUDA grid dimensions + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535), 1}; + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); @@ -786,14 +783,13 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); - dim3 blockDim, gridDim; - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = 1; - gridDim.z = - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); + + dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; + + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535)}; + CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); computeDistances<<>>(n, d, 1, obs, centroids, work); From 4ff50686fa50d4a4ba4dc0b225aca11a3eb20433 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 17:32:50 -0500 Subject: [PATCH 65/88] Addressed code reviews on lowercase_t type names. --- cpp/include/raft/spectral/lanczos.hpp | 376 +++++++++++++------------- 1 file changed, 189 insertions(+), 187 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8aa615c25d..796369abc0 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -81,29 +81,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * Workspace. Not needed if full reorthogonalization is disabled. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ *iter, IndexType_ maxIter, ValueType_ shift, ValueType_ tol, - bool reorthogonalize, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t *iter, index_type_t maxIter, value_type_t shift, + value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - const ValueType_ one = 1; - const ValueType_ negOne = -1; - const ValueType_ zero = 0; - ValueType_ alpha; + const value_type_t one = 1; + const value_type_t negOne = -1; + const value_type_t zero = 0; + value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // ------------------------------------------------------- // Compute second Lanczos vector @@ -114,8 +115,8 @@ int performLanczosIteration( // Apply matrix if (shift != 0) CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, - stream)); + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector @@ -149,7 +150,7 @@ int performLanczosIteration( if (shift != 0) CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); @@ -165,7 +166,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(ValueType_), cudaMemcpyDeviceToHost, + sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CUBLAS_CHECK(cublasgemv( @@ -228,8 +229,9 @@ int performLanczosIteration( * @param P (Output, host memory, 9 entries) Householder transform * matrix. Matrix dimensions are 3 x 3. 
*/ -template -static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { +template +static void findHouseholder3(value_type_t *v, value_type_t *Pv, + value_type_t *P) { // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -239,7 +241,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { v[0] -= *Pv; // Normalize Householder vector - ValueType_ normHouseholder = + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; @@ -252,7 +254,7 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { } // Construct Householder matrix - IndexType_ i, j; + index_type_t i, j; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; @@ -267,12 +269,12 @@ static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) { * @param v (Input, host memory, 3 entries) Householder vector. * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ -template -static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { +template +static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Loop indices - IndexType_ i, j; + index_type_t i, j; // Dot product between Householder vector and matrix row/column - ValueType_ vDotA; + value_type_t vDotA; // Pre-apply Householder transform for (j = 0; j < 4; ++j) { @@ -307,31 +309,31 @@ static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) { * @param work (Output, host memory, 3*n entries) Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int francisQRIteration(IndexType_ n, ValueType_ shift1, - ValueType_ shift2, ValueType_ *alpha, - ValueType_ *beta, ValueType_ *V, - ValueType_ *work) { +template +static int francisQRIteration(index_type_t n, value_type_t shift1, + value_type_t shift2, value_type_t *alpha, + value_type_t *beta, value_type_t *V, + value_type_t *work) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Temporary storage of 4x4 bulge and Householder vector - ValueType_ bulge[16]; + value_type_t bulge[16]; // Householder vector - ValueType_ householder[3]; + value_type_t householder[3]; // Householder matrix - ValueType_ householderMatrix[3 * 3]; + value_type_t householderMatrix[3 * 3]; // Shifts are roots of the polynomial p(x)=x^2+b*x+c - ValueType_ b = -shift1 - shift2; - ValueType_ c = shift1 * shift2; + value_type_t b = -shift1 - shift2; + value_type_t c = shift1 * shift2; // Loop indices - IndexType_ i, j, pos; + index_type_t i, j, pos; // Temporary variable - ValueType_ temp; + value_type_t temp; // ------------------------------------------------------- // Implementation @@ -341,20 +343,20 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, + householderMatrix); // Apply initial Householder transform to create bulge - memset(bulge, 0, 16 * sizeof(ValueType_)); + memset(bulge, 0, 16 * sizeof(value_type_t)); for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = 
beta[i]; } - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, - 0, work, n); - memcpy(V, work, 3 * n * sizeof(ValueType_)); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, + 3, 0, work, n); + memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { @@ -374,12 +376,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + pos, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform @@ -397,12 +399,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + n - 4, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero @@ -412,12 +414,12 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, householder[2] = 0; for (j = 0; j < 3; ++j) for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); + findHouseholder3(householder, beta + n - 3, + householderMatrix); + applyHouseholder3(householder, bulge); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); + memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; @@ -456,49 +458,49 @@ static int francisQRIteration(IndexType_ n, ValueType_ shift1, * @param work_dev (Output, device memory, (n+iter)*iter entries) * Workspace. 
*/ -template +template static int lanczosRestart( - handle_t const &handle, IndexType_ n, IndexType_ iter, IndexType_ iter_new, - ValueType_ *shiftUpper, ValueType_ *shiftLower, - ValueType_ *__restrict__ alpha_host, ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ V_host, ValueType_ *__restrict__ work_host, - ValueType_ *__restrict__ lanczosVecs_dev, ValueType_ *__restrict__ work_dev, - bool smallest_eig) { + handle_t const &handle, index_type_t n, index_type_t iter, + index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, bool smallest_eig) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants - const ValueType_ zero = 0; - const ValueType_ one = 1; + const value_type_t zero = 0; + const value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); // Loop index - IndexType_ i; + index_type_t i; // Number of implicit restart steps // Assumed to be even since each call to Francis algorithm is // equivalent to two calls of QR algorithm - IndexType_ restartSteps = iter - iter_new; + index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - ValueType_ *ritzVals_host = work_host + 3 * iter; + value_type_t *ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - ValueType_ *shifts_host; + value_type_t *shifts_host; // Orthonormal matrix for similarity transform - ValueType_ *V_dev = work_dev + n * iter; + value_type_t *V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation // ------------------------------------------------------- // Compute Ritz values - memcpy(ritzVals_host, alpha_host, iter * sizeof(ValueType_)); - memcpy(work_host, beta_host, (iter - 1) * sizeof(ValueType_)); - Lapack::sterf(iter, ritzVals_host, work_host); + memcpy(ritzVals_host, alpha_host, iter * sizeof(value_type_t)); + memcpy(work_host, beta_host, (iter - 1) * sizeof(value_type_t)); + Lapack::sterf(iter, ritzVals_host, work_host); // Debug: Print largest eigenvalues // for (int i = iter-iter_new; i < iter; ++i) @@ -506,7 +508,7 @@ static int lanczosRestart( // std::cout <(M_PI) / restartSteps); + cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } @@ -544,7 +546,7 @@ static int lanczosRestart( WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); beta_host[iter - 1] = @@ -560,13 +562,13 @@ static int lanczosRestart( work_dev, n, stream)); CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, - n * iter_new * sizeof(ValueType_), + n * iter_new * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector CUDA_TRY(cudaMemcpyAsync( lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(ValueType_), cudaMemcpyDeviceToDevice, stream)); + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); 
CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); @@ -630,45 +632,46 @@ static int lanczosRestart( * with dimensions n x nEigVecs. * @return error flag. */ -template +template int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, - IndexType_ *totalIter, ValueType_ *shift, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *shift, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; // Matrix dimension - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // Shift for implicit restart - ValueType_ shiftUpper; - ValueType_ shiftLower; + value_type_t shiftUpper; + value_type_t shiftLower; // Lanczos iteration counters - IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system // Status flags int status; // Loop index - IndexType_ i; + index_type_t i; // Host memory - ValueType_ *Z_host; // Eigenvectors in Lanczos basis - ValueType_ *work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); + // Lapack::check_lapack_enabled(); // ------------------------------------------------------- // Check that parameters are valid @@ -691,8 +694,8 @@ int computeSmallestEigenvectors( *totalIter = 0; // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); Z_host = Z_host_v.data(); work_host = work_host_v.data(); @@ -717,7 +720,7 @@ int computeSmallestEigenvectors( // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - ValueType_ normQ1; + value_type_t normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; @@ -725,22 +728,22 @@ int computeSmallestEigenvectors( // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
- // const ValueType_ relError = 0.25; // Relative error - // const ValueType_ failProb = 1e-4; // Probability of failure + // const value_type_t relError = 0.25; // Relative error + // const value_type_t failProb = 1e-4; // Probability of failure // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue - Lapack::sterf(*effIter, alpha_host, beta_host); + Lapack::sterf(*effIter, alpha_host, beta_host); *shift = -alpha_host[*effIter - 1]; // std::cout << *shift <( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -762,14 +765,14 @@ int computeSmallestEigenvectors( while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { // Determine number of restart steps // Number of steps must be even due to Francis algorithm - IndexType_ iter_new = nEigVecs + 1; + index_type_t iter_new = nEigVecs + 1; if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) iter_new = restartIter - (maxIter - *totalIter); if ((restartIter - iter_new) % 2) iter_new -= 1; if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( + status = lanczosRestart( handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); if (status) WARNING("error in Lanczos implicit restart"); @@ -780,7 +783,7 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -794,12 +797,12 @@ int computeSmallestEigenvectors( // Solve tridiagonal system memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(ValueType_)); + (*effIter) * sizeof(value_type_t)); memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, - work_host); + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); // Obtain desired eigenvalues by applying shift for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; @@ -807,13 +810,13 @@ int computeSmallestEigenvectors( // Copy results to device memory CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), - nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // for (int i = 0; i < nEigVecs; ++i) //{ // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; //} CUDA_TRY(cudaMemcpy(work_dev, Z_host, - (*effIter) * nEigVecs * sizeof(ValueType_), + (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis @@ -862,17 +865,17 
@@ int computeSmallestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. */ -template +template int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 1234567) { + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { using namespace spectral; // Matrix dimension - IndexType_ n = A.nrows_; + index_type_t n = A.nrows_; // Check that parameters are valid RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, @@ -883,19 +886,19 @@ int computeSmallestEigenvectors( RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); - ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method - IndexType_ effIter; - ValueType_ shift; + index_type_t effIter; + value_type_t shift; int status = computeSmallestEigenvectors( handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), @@ -950,41 +953,42 @@ int computeSmallestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. 
*/ -template +template int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - IndexType_ nEigVecs, IndexType_ maxIter, IndexType_ restartIter, - ValueType_ tol, bool reorthogonalize, IndexType_ *effIter, - IndexType_ *totalIter, ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev, ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, unsigned long long seed) { + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; + const value_type_t one = 1; + const value_type_t zero = 0; // Matrix dimension - IndexType_ n = A->nrows_; + index_type_t n = A->nrows_; // Lanczos iteration counters - IndexType_ maxIter_curr = restartIter; // Maximum size of Lanczos system + index_type_t maxIter_curr = restartIter; // Maximum size of Lanczos system // Status flags int status; // Loop index - IndexType_ i; + index_type_t i; // Host memory - ValueType_ *Z_host; // Eigenvectors in Lanczos basis - ValueType_ *work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); + // Lapack::check_lapack_enabled(); // ------------------------------------------------------- // Check that parameters are valid @@ -1007,8 +1011,8 @@ int computeLargestEigenvectors( *totalIter = 0; // Allocate host memory - std::vector Z_host_v(restartIter * restartIter); - std::vector work_host_v(4 * restartIter); + std::vector Z_host_v(restartIter * restartIter); + std::vector work_host_v(4 * restartIter); Z_host = Z_host_v.data(); work_host = work_host_v.data(); @@ -1028,7 +1032,7 @@ int computeLargestEigenvectors( curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); - ValueType_ normQ1; + value_type_t normQ1; CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev, 1, &normQ1, stream)); auto h_val = 1 / normQ1; @@ -1036,36 +1040,36 @@ int computeLargestEigenvectors( // Estimate number of Lanczos iterations // See bounds in Kuczynski and Wozniakowski (1992). 
- // const ValueType_ relError = 0.25; // Relative error - // const ValueType_ failProb = 1e-4; // Probability of failure + // const value_type_t relError = 0.25; // Relative error + // const value_type_t failProb = 1e-4; // Probability of failure // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; // maxIter_curr = min(maxIter_curr, restartIter); // Obtain tridiagonal matrix with Lanczos *effIter = 0; - ValueType_ shift_val = 0.0; - ValueType_ *shift = &shift_val; + value_type_t shift_val = 0.0; + value_type_t *shift = &shift_val; // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; // Apply Lanczos method until convergence - ValueType_ shiftLower = 1; - ValueType_ shiftUpper = -1; + value_type_t shiftLower = 1; + value_type_t shiftUpper = -1; while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { // Determine number of restart steps // Number of steps must be even due to Francis algorithm - IndexType_ iter_new = nEigVecs + 1; + index_type_t iter_new = nEigVecs + 1; if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) iter_new = restartIter - (maxIter - *totalIter); if ((restartIter - iter_new) % 2) iter_new -= 1; if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( + status = lanczosRestart( handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); if (status) WARNING("error in Lanczos implicit restart"); @@ -1076,7 +1080,7 @@ int computeLargestEigenvectors( // Proceed with Lanczos method // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration( + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); @@ -1092,12 +1096,12 @@ int computeLargestEigenvectors( } // Solve tridiagonal system memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(ValueType_)); + (*effIter) * sizeof(value_type_t)); memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, - work_host); + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, + work_host); // note: We need to pick the top nEigVecs eigenvalues // but effItter can be larger than nEigVecs @@ -1105,7 +1109,7 @@ int computeLargestEigenvectors( // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest // eigenvalues case because the first ones are the smallest ones - IndexType_ top_eigenparis_idx_offset = *effIter - nEigVecs; + index_type_t top_eigenparis_idx_offset = *effIter - nEigVecs; // Debug : print nEigVecs largest eigenvalues // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i) @@ -1130,12 +1134,12 @@ int computeLargestEigenvectors( // skip smallest eigenvalue if needed CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpy( work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice)); + (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, @@ -1183,17 +1187,15 @@ int computeLargestEigenvectors( * with dimensions n x nEigVecs. * @return error flag. */ -template -int computeLargestEigenvectors(handle_t const &handle, - sparse_matrix_t const &A, - IndexType_ nEigVecs, IndexType_ maxIter, - IndexType_ restartIter, ValueType_ tol, - bool reorthogonalize, IndexType_ &iter, - ValueType_ *__restrict__ eigVals_dev, - ValueType_ *__restrict__ eigVecs_dev, - unsigned long long seed = 123456) { +template +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { // Matrix dimension - IndexType_ n = A.nrows_; + index_type_t n = A.nrows_; // Check that parameters are valid RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, @@ -1204,18 +1206,18 @@ int computeLargestEigenvectors(handle_t const &handle, RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); // Allocate memory - std::vector alpha_host_v(restartIter); - std::vector beta_host_v(restartIter); + std::vector alpha_host_v(restartIter); + std::vector beta_host_v(restartIter); - ValueType_ *alpha_host = alpha_host_v.data(); - ValueType_ *beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); //TODO: replace and fix allocation via RAFT handle - vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); - vector_t work_dev(handle, (n + restartIter) * restartIter); + vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); + vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method - IndexType_ effIter; + index_type_t effIter; int status = computeLargestEigenvectors( handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), From ff5ad230d561033788ddeb79db93bfb339550699 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 6 Jul 2020 18:23:08 -0500 Subject: [PATCH 66/88] Addressed code reviews on replacing cudaDeviceSynchronize(). 
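The distinction behind this patch matters: cudaDeviceSynchronize() blocks until every stream on the device is idle, while cudaStreamSynchronize(stream) waits only for the work enqueued on the handle's stream, leaving other streams free to make progress. A small illustrative sketch (standalone CUDA C++, not RAFT code):

#include <cuda_runtime.h>

int main() {
  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);

  const size_t bytes = 1 << 20;
  char *a = nullptr, *b = nullptr;
  cudaMalloc(&a, bytes);
  cudaMalloc(&b, bytes);

  // Independent work on two streams.
  cudaMemsetAsync(a, 0, bytes, s1);
  cudaMemsetAsync(b, 0, bytes, s2);

  // Waits only for s1; the memset on s2 may still be in flight.
  // cudaDeviceSynchronize() here would needlessly block on both.
  cudaStreamSynchronize(s1);

  cudaFree(a);
  cudaFree(b);
  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  return 0;
}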
--- cpp/include/raft/spectral/lanczos.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 796369abc0..6a53879723 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -209,7 +209,7 @@ int performLanczosIteration( lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } - CUDA_TRY(cudaDeviceSynchronize()); + CUDA_TRY(cudaStreamSynchronize(stream)); return 0; } From 8b30cda3c512caae95ee85b70f5ea4a7484e9387 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 11:39:57 -0500 Subject: [PATCH 67/88] Addressed comments on @brief in lanczos. --- cpp/include/raft/spectral/kmeans.hpp | 22 ++-- cpp/include/raft/spectral/lanczos.hpp | 155 +++++++++++++++----------- 2 files changed, 100 insertions(+), 77 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index c5c5e88b88..9017d2b8d4 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -58,7 +58,7 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * blockSize/warpSize). Ideally, the grid is large enough so there * are d threads in the x-direction, k threads in the y-direction, * and n threads in the z-direction. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -138,7 +138,7 @@ static __global__ void computeDistances( * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param k Number of clusters. @@ -198,7 +198,7 @@ static __global__ void minDistances(index_type_t n, index_type_t k, * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @param n Number of observation vectors. * @param dists_old (Input/output, n entries) Distances between @@ -248,7 +248,7 @@ static __global__ void minDistances2(index_type_t n, * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @param n Number of observation vectors. * @param k Number of clusters. * @param codes (Input, n entries) Cluster assignments. @@ -275,7 +275,7 @@ static __global__ void computeClusterSizes( * dimensions should be 2-dimensional. Ideally the grid is large * enough so there are d threads in the x-direction and k threads * in the y-direction. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. 
* @param d Dimension of observation vectors. * @param k Number of clusters. @@ -323,7 +323,7 @@ static __global__ void divideCentroids( /** * @brief Randomly choose new centroids. * Centroid is randomly chosen with k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -415,7 +415,7 @@ static int chooseNewCentroid(handle_t const& handle, /** * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -522,7 +522,7 @@ static int initializeCentroids( /** * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -598,7 +598,7 @@ static int assignCentroids( /** * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -710,7 +710,7 @@ namespace raft { * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. @@ -889,7 +889,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. - * @tparam Index_Type_ the type of data used for indexing. + * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 6a53879723..8a80706f48 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -55,9 +55,12 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/// Perform Lanczos iteration -/** Lanczos iteration is performed on a shifted matrix A+shift*I. - * +/** + * @brief Perform Lanczos iteration + * Lanczos iteration is performed on a shifted matrix A+shift*I. + * @tparam index_type_t the type of data used for indexing. 
+ * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param iter Pointer to current Lanczos iteration. On exit, the * variable is set equal to the final Lanczos iteration. @@ -214,12 +217,14 @@ int performLanczosIteration( return 0; } -/// Find Householder transform for 3-dimensional system -/** Given an input vector v=[x,y,z]', this function finds a - * Householder transform P such that P*v is a multiple of - * e_1=[1,0,0]'. The input vector v is overwritten with the - * Householder vector such that P=I-2*v*v'. - * +/** + * @brief Find Householder transform for 3-dimensional system + * Given an input vector v=[x,y,z]', this function finds a + * Householder transform P such that P*v is a multiple of + * e_1=[1,0,0]'. The input vector v is overwritten with the + * Householder vector such that P=I-2*v*v'. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param v (Input/output, host memory, 3 entries) Input * 3-dimensional vector. On exit, the vector is set to the * Householder vector. @@ -260,12 +265,14 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; } -/// Apply 3-dimensional Householder transform to 4 x 4 matrix -/** The Householder transform is pre-applied to the top three rows +/** + * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix + * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The * 4 x 4 matrix is intended to contain the bulge that is produced * in the Francis QR algorithm. - * + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param v (Input, host memory, 3 entries) Householder vector. * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ @@ -291,10 +298,12 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { } } -/// Perform one step of Francis QR algorithm -/** Equivalent to two steps of the classical QR algorithm on a - * tridiagonal matrix. - * +/** + * @brief Perform one step of Francis QR algorithm + * Equivalent to two steps of the classical QR algorithm on a + * tridiagonal matrix. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. * @param n Matrix dimension. * @param shift1 QR algorithm shift. * @param shift2 QR algorithm shift. @@ -429,9 +438,12 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, return 0; } -/// Perform implicit restart of Lanczos algorithm -/** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. - * +/** + * @brief Perform implicit restart of Lanczos algorithm + * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param n Matrix dimension. * @param iter Current Lanczos iteration. * @param iter_new Lanczos iteration after restart. @@ -457,6 +469,9 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * column-major matrix with dimensions n x (iter+1). * @param work_dev (Output, device memory, (n+iter)*iter entries) * Workspace. 
+ * @param smallest_eig specifies whether smallest (true) or largest + * (false) eigenvalues are to be calculated. + * @return error flag. */ template static int lanczosRestart( @@ -586,17 +601,19 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/// Compute smallest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -630,6 +647,7 @@ static int lanczosRestart( * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template @@ -829,19 +847,19 @@ int computeSmallestEigenvectors( return 0; } -/// Compute smallest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are smallest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -863,6 +881,7 @@ int computeSmallestEigenvectors( * Eigenvectors corresponding to smallest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. 
*/ template @@ -912,16 +931,18 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/// Compute largest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied. - * +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. @@ -951,6 +972,7 @@ int computeSmallestEigenvectors( * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template @@ -1151,19 +1173,19 @@ int computeLargestEigenvectors( return 0; } -/// Compute largest eigenvectors of symmetric matrix -/** Computes eigenvalues and eigenvectors that are least - * positive. If matrix is positive definite or positive - * semidefinite, the computed eigenvalues are largest in - * magnitude. - * - * The largest eigenvalue is estimated by performing several - * Lanczos iterations. An implicitly restarted Lanczos method is - * then applied to A+s*I, where s is negative the largest - * eigenvalue. - * - * CNMEM must be initialized before calling this function. - * +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. * @param A Matrix. * @param nEigVecs Number of eigenvectors to compute. * @param maxIter Maximum number of Lanczos steps. Does not include @@ -1185,6 +1207,7 @@ int computeLargestEigenvectors( * Eigenvectors corresponding to largest eigenvalues of * matrix. Vectors are stored as columns of a column-major matrix * with dimensions n x nEigVecs. + * @param seed random seed. * @return error flag. */ template From 8bf8589504394afd73f92d1344c970454641e96c Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:06:52 -0500 Subject: [PATCH 68/88] Addressed comments on host memory pointers and constexpr one/zero. 
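Two small points behind this patch, for reviewers reading along: the doc comments now say explicitly which pointers live in host memory, and the one/negOne/zero scalars become constexpr. The latter matters because cuBLAS takes alpha/beta by pointer, and in the default host pointer mode the pointed-to value is read at call time; constexpr documents that these are fixed compile-time constants rather than mutable state. A minimal sketch against the raw cuBLAS API (illustrative only; the file itself goes through RAFT's cublas wrappers):

#include <cublas_v2.h>

// y := 1 * x + y, with the scalar held in a constexpr local whose
// address is handed to cuBLAS (valid in the default host pointer mode).
void add_into(cublasHandle_t cublas_h, int n, const float* x, float* y) {
  constexpr float one = 1;
  cublasSaxpy(cublas_h, n, &one, x, 1, y, 1);
}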
--- cpp/include/raft/spectral/lanczos.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 8a80706f48..94340fd5e6 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -97,9 +97,9 @@ int performLanczosIteration( // ------------------------------------------------------- // Useful variables - const value_type_t one = 1; - const value_type_t negOne = -1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t negOne = -1; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); @@ -447,11 +447,11 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @param n Matrix dimension. * @param iter Current Lanczos iteration. * @param iter_new Lanczos iteration after restart. - * @param shiftUpper Pointer to upper bound for unwanted + * @param shiftUpper Pointer (host memory) to upper bound for unwanted * region. Value is ignored if less than *shiftLower. If a * stronger upper bound has been found, the value is updated on * exit. - * @param shiftLower Pointer to lower bound for unwanted + * @param shiftLower Pointer (host memory) to lower bound for unwanted * region. Value is ignored if greater than *shiftUpper. If a * stronger lower bound has been found, the value is updated on * exit. @@ -486,8 +486,8 @@ static int lanczosRestart( // ------------------------------------------------------- // Useful constants - const value_type_t zero = 0; - const value_type_t one = 1; + constexpr value_type_t zero = 0; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -663,8 +663,8 @@ int computeSmallestEigenvectors( using namespace spectral; // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; // Matrix dimension index_type_t n = A->nrows_; @@ -988,8 +988,8 @@ int computeLargestEigenvectors( using namespace spectral; // Useful constants - const value_type_t one = 1; - const value_type_t zero = 0; + constexpr value_type_t one = 1; + constexpr value_type_t zero = 0; // Matrix dimension index_type_t n = A->nrows_; From f30e636f67e9eb222ec0539605e8896424bea649 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:15:22 -0500 Subject: [PATCH 69/88] Addressed comments on removing stale commented code. --- cpp/include/raft/spectral/lanczos.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 94340fd5e6..2513e55855 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -744,13 +744,6 @@ int computeSmallestEigenvectors( auto h_val = 1 / normQ1; CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - // Estimate number of Lanczos iterations - // See bounds in Kuczynski and Wozniakowski (1992). 
- // const value_type_t relError = 0.25; // Relative error - // const value_type_t failProb = 1e-4; // Probability of failure - // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; - // maxIter_curr = min(maxIter_curr, restartIter); - // Obtain tridiagonal matrix with Lanczos *effIter = 0; *shift = 0; @@ -1060,13 +1053,6 @@ int computeLargestEigenvectors( auto h_val = 1 / normQ1; CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); - // Estimate number of Lanczos iterations - // See bounds in Kuczynski and Wozniakowski (1992). - // const value_type_t relError = 0.25; // Relative error - // const value_type_t failProb = 1e-4; // Probability of failure - // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; - // maxIter_curr = min(maxIter_curr, restartIter); - // Obtain tridiagonal matrix with Lanczos *effIter = 0; value_type_t shift_val = 0.0; From 44ff8bf09a39b40809c88cf14b1be5a52f02d660 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:35:02 -0500 Subject: [PATCH 70/88] Addressed comments on removing stale commented code. --- cpp/include/raft/spectral/lanczos.hpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 2513e55855..e995b60778 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -756,14 +756,14 @@ int computeSmallestEigenvectors( Lapack::sterf(*effIter, alpha_host, beta_host); *shift = -alpha_host[*effIter - 1]; - // std::cout << *shift <( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -793,7 +793,7 @@ int computeSmallestEigenvectors( if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; // Proceed with Lanczos method - // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -822,10 +822,7 @@ int computeSmallestEigenvectors( // Copy results to device memory CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); - // for (int i = 0; i < nEigVecs; ++i) - //{ - // std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl; - //} + CUDA_TRY(cudaMemcpy(work_dev, Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); @@ -1057,7 +1054,7 @@ int computeLargestEigenvectors( *effIter = 0; value_type_t shift_val = 0.0; value_type_t *shift = &shift_val; - // maxIter_curr = min(maxIter, restartIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); @@ -1087,7 +1084,7 @@ int computeLargestEigenvectors( if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; // Proceed with Lanczos method - // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); + status = performLanczosIteration( handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); From 04e8790a14f11cf532196343c1ba2c760f1e7023 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 12:39:02 -0500 Subject: [PATCH 71/88] Addressed comments on removing stale (fixed) FIXME comment. 
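
The removed FIXME is stale: the seed is now a plain parameter, so tests can
pass a fixed value while real runs may pass e.g. time(NULL). A minimal
sketch of the resulting pattern (float overload shown for illustration; the
code dispatches via curandGenerateNormalX):

    #include <curand.h>

    // Seed the initial Lanczos vector; curandGenerateNormal emits values
    // in pairs, hence the n + n % 2 padding (the buffer must be that big).
    void init_lanczos_vector(float* d_v /* >= n + n % 2 elements */, int n,
                             unsigned long long seed) {
      curandGenerator_t gen;
      curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_PHILOX4_32_10);
      curandSetPseudoRandomGeneratorSeed(gen, seed);
      curandGenerateNormal(gen, d_v, n + n % 2, 0.0f, 1.0f);  // mean 0, sd 1
      curandDestroyGenerator(gen);
    }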
--- cpp/include/raft/spectral/lanczos.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index e995b60778..808f8a1e35 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -731,10 +731,7 @@ int computeSmallestEigenvectors( // Initialize random number generator curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10); - // FIXME: This is hard coded, which is good for unit testing... - // but should really be a parameter so it could be - // "random" for real runs and "fixed" for tests - curandSetPseudoRandomGeneratorSeed(randGen, seed /*time(NULL)*/); + curandSetPseudoRandomGeneratorSeed(randGen, seed); // Initialize initial Lanczos vector curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one); From 5de2234428ff499e6a5c4c57179ef82e8d787502 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:03:38 -0500 Subject: [PATCH 72/88] Addressed comment on async copies. --- cpp/include/raft/spectral/lanczos.hpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 808f8a1e35..f6c876a56e 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -817,12 +817,14 @@ int computeSmallestEigenvectors( for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpy(eigVals_dev, work_host + 2 * (*effIter), - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaMemcpy(work_dev, Z_host, - (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, @@ -1134,14 +1136,17 @@ int computeLargestEigenvectors( // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpy(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync( + eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); // skip smallest eigenvector if needed - CUDA_TRY(cudaMemcpy( - work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), - (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host + (top_eigenparis_idx_offset * (*effIter)), + (*effIter) * nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + CHECK_CUDA(cudaStreamSynchronize(stream)); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, From b21d5ebf5b23bfe9777a938aa6187f13c43835f3 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:29:08 -0500 Subject: [PATCH 73/88] Addressed comments on removing outdated TODOs. 
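
The removed TODOs asked for allocation via the RAFT handle; the vector_t
buffers in the diff already provide scoped, leak-free allocation. Roughly
the shape involved, as a simplified sketch (the real type allocates through
the handle's device allocator on its stream):

    #include <cuda_runtime.h>
    #include <cstddef>

    template <typename T>
    struct device_buffer_sketch {
      T* data_{nullptr};

      explicit device_buffer_sketch(std::size_t size) {
        cudaMalloc(reinterpret_cast<void**>(&data_), size * sizeof(T));
      }
      ~device_buffer_sketch() { cudaFree(data_); }  // released on every exit path

      device_buffer_sketch(device_buffer_sketch const&) = delete;
      device_buffer_sketch& operator=(device_buffer_sketch const&) = delete;

      T* raw() const { return data_; }
    };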
--- cpp/include/raft/spectral/lanczos.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index f6c876a56e..483d900b45 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -900,7 +900,6 @@ int computeSmallestEigenvectors( value_type_t *alpha_host = alpha_host_v.data(); value_type_t *beta_host = beta_host_v.data(); - //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -1029,8 +1028,8 @@ int computeLargestEigenvectors( work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, - stream)); // ????? TODO: check / remove + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1220,7 +1219,6 @@ int computeLargestEigenvectors( value_type_t *alpha_host = alpha_host_v.data(); value_type_t *beta_host = beta_host_v.data(); - //TODO: replace and fix allocation via RAFT handle vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); From 9df3f9aa1fccbca4d82c1262322fe8185a9ed188 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 13:57:45 -0500 Subject: [PATCH 74/88] Addressed comments on removing dead code. --- cpp/include/raft/spectral/lapack.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 4417640705..d14bf05f37 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -103,14 +103,11 @@ class Lapack { // computes the QR factorization of a general matrix static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. - //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork ); + // multiply C by implicit Q static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork); - //static void qrf (int n, T *H, T *Q, T *R); - //static void hseqr (T* Q, T* R, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq); static void geev(T *A, T *eigenvalues, int dim, int lda); static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr); From fb46b7711245fbc66389860821b5475ed0c2002e Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 14:13:46 -0500 Subject: [PATCH 75/88] Addressed comments on using dim3{} cnstr. 
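
For reference, the brace-initialized launch configuration this converts to,
in sketch form (kernel body assumed for illustration; BLOCK_SIZE and the
65535 grid-dimension clamp follow the surrounding code):

    #include <algorithm>
    #include <cuda_runtime.h>

    constexpr unsigned int kBlockSize = 1024;

    __global__ void diagmv_sketch(int n, double alpha, const double* d,
                                  const double* x, double* y) {
      // grid-stride loop, so a clamped grid still covers all n rows
      for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
           i += blockDim.x * gridDim.x)
        y[i] += alpha * d[i] * x[i];  // y += alpha * D * x (assumed semantics)
    }

    void launch_diagmv(int n, double alpha, const double* d, const double* x,
                       double* y, cudaStream_t stream) {
      dim3 gridDim{
        std::min<unsigned int>((n + kBlockSize - 1) / kBlockSize, 65535u), 1, 1};
      dim3 blockDim{kBlockSize, 1, 1};
      diagmv_sketch<<<gridDim, blockDim, 0, stream>>>(n, alpha, d, x, y);
    }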
--- cpp/include/raft/spectral/matrix_wrappers.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 1c78fd16fd..5f72da45b7 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -272,13 +272,10 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim, blockDim; - gridDim.x = std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; + dim3 gridDim{ + std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + + dim3 blockDim{BLOCK_SIZE, 1, 1}; utils::diagmv<<>>(n, alpha, diagonal_.raw(), x, y); CHECK_CUDA(stream); From d359d1acb4c94577d1f1529b1c317d0057098dd4 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 15:12:15 -0500 Subject: [PATCH 76/88] Addressed comments on using cleaning-up modularity header. --- cpp/include/raft/spectral/modularity_maximization.hpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index 5ac33eda43..a920eb39c9 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -31,9 +31,6 @@ #include #include -//#define COLLECT_TIME_STATISTICS 1 -//#undef COLLECT_TIME_STATISTICS - #ifdef COLLECT_TIME_STATISTICS #include #include @@ -97,14 +94,13 @@ std::tuple modularity_maximization( auto stream = handle.get_stream(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - //sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); @@ -167,7 +163,6 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - ///sparse_matrix_t A{handle, graph}; modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; // Initialize output @@ -183,10 +178,8 @@ void analyzeModularity(handle_t const &handle, // Record results modularity += partModularity; - // std::cout<< "partModularity " < Date: Tue, 7 Jul 2020 15:46:46 -0500 Subject: [PATCH 77/88] Addressed comments on cleaning-up spectral_util header. 
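
Related context: the indicator functor kept in this header (equal_to_i_op,
reworked in a later patch) is driven from thrust to expand the integer
cluster assignments into a 0/1 membership vector. A self-contained stand-in
of that usage:

    #include <thrust/device_vector.h>
    #include <thrust/transform.h>

    struct equal_to_i {  // stand-in for equal_to_i_op
      int i;
      __host__ __device__ double operator()(int code) const {
        return code == i ? 1.0 : 0.0;
      }
    };

    // indicator[j] = 1.0 where codes[j] == i, else 0.0
    void make_indicator(thrust::device_vector<int> const& codes,
                        thrust::device_vector<double>& indicator, int i) {
      thrust::transform(codes.begin(), codes.end(), indicator.begin(),
                        equal_to_i{i});
    }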
--- cpp/include/raft/spectral/spectral_util.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 8f8eb3ad8b..acf59f9d63 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -40,8 +40,7 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { From e5ef1a71dd01efffe824188d58f1fb62014fcc48 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:04:52 -0500 Subject: [PATCH 78/88] Addressed comments on cleaning-up cluster_solvers.cu. --- cpp/test/cluster_solvers.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 04a94fbf22..4ff6cdf5fa 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -29,8 +29,6 @@ TEST(Raft, ClusterSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); - ASSERT_EQ(0, h.get_device()); index_type maxiter{100}; value_type tol{1.0e-10}; From beaa4991a1bf897f7fedfdb1dcdd7bf0f6e98760 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:16:19 -0500 Subject: [PATCH 79/88] Addressed comments on more clean-up in lanczos. --- cpp/include/raft/spectral/lanczos.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/spectral/lanczos.hpp index 483d900b45..37719579cb 100644 --- a/cpp/include/raft/spectral/lanczos.hpp +++ b/cpp/include/raft/spectral/lanczos.hpp @@ -686,11 +686,6 @@ int computeSmallestEigenvectors( value_type_t *Z_host; // Eigenvectors in Lanczos basis value_type_t *work_host; // Workspace - // ------------------------------------------------------- - // Check that LAPACK is enabled - // ------------------------------------------------------- - // Lapack::check_lapack_enabled(); - // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- From 52764ef97e3492d23c68a79532913645e27e0b38 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:35:32 -0500 Subject: [PATCH 80/88] Addressed comments on dim3{} and type lowercase_t in spectral_util. 
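
For reference, what the reworked scale_obs below computes, as a CPU sketch
(per the kernel's comment, each column of the column-major m x n
observation matrix is divided by its Euclidean norm; the zero-norm guard is
an assumption of this sketch):

    #include <cmath>

    void scale_obs_reference(int m, int n, double* obs) {
      for (int j = 0; j < n; ++j) {
        double sumsq = 0;
        for (int i = 0; i < m; ++i) sumsq += obs[i + j * m] * obs[i + j * m];
        double const norm = std::sqrt(sumsq);
        if (norm > 0)
          for (int i = 0; i < m; ++i) obs[i + j * m] /= norm;  // unit column
      }
    }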
--- cpp/include/raft/spectral/spectral_util.hpp | 48 ++++++++++----------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index acf59f9d63..1b90ab959f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -28,11 +28,11 @@ namespace raft { namespace spectral { -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, - ValueType_* obs) { - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; +template +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, + value_type_t* obs) { + index_type_t i, j, k, index, mm; + value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension @@ -76,9 +76,9 @@ static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, } } -template -IndexType_ next_pow2(IndexType_ n) { - IndexType_ v; +template +index_type_t next_pow2(index_type_t n) { + index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float v = n - 1; @@ -90,25 +90,21 @@ IndexType_ next_pow2(IndexType_ n) { return v + 1; } -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_* obs) { - IndexType_ p2m; - dim3 nthreads, nblocks; +template +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { + index_type_t p2m; // find next power of 2 - p2m = next_pow2(m); + p2m = next_pow2(m); // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); + unsigned int xsize = max(2, min(p2m, 32)); + dim3 nthreads{xsize, 256 / xsize, 1}; + + dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); + scale_obs_kernel + <<>>(m, n, obs); return cudaSuccess; } @@ -176,16 +172,16 @@ namespace { /// Functor to generate indicator vectors /** For use in Thrust transform */ -template +template struct equal_to_i_op { - const IndexType_ i; + const index_type_t i; public: - equal_to_i_op(IndexType_ _i) : i(_i) {} + equal_to_i_op(index_type_t _i) : i(_i) {} template __host__ __device__ void operator()(Tuple_ t) { thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; + (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace From e8fc0fb081e5acb849037c4161d7ddb1da774139 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Tue, 7 Jul 2020 16:49:24 -0500 Subject: [PATCH 81/88] Fixed duplicate in CHANGELOG.md. --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4695742c89..0e376f5f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,6 @@ ## New Features - Initial RAFT version - PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes -- PR #12: Spectral Clustering ## Bug Fixes - PR #5: Small build.sh fixes From aeb2e5617b301d8ded8dbde14f921b69557f5d5d Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 14:55:34 -0500 Subject: [PATCH 82/88] Moved lanczos.hpp to linalg. 
--- cpp/include/raft/{spectral => linalg}/lanczos.hpp | 0
 cpp/include/raft/spectral/eigen_solvers.hpp        | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/include/raft/{spectral => linalg}/lanczos.hpp (100%)

diff --git a/cpp/include/raft/spectral/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
similarity index 100%
rename from cpp/include/raft/spectral/lanczos.hpp
rename to cpp/include/raft/linalg/lanczos.hpp
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index 056189dcba..e36dca2e0c 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once

-#include <raft/spectral/lanczos.hpp>
+#include <raft/linalg/lanczos.hpp>

 namespace raft {

From 1502821cbc747436bcd07bd5ddec54160a320259 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 9 Jul 2020 15:20:22 -0500
Subject: [PATCH 83/88] Moved sm_utils.hpp to utils.

---
 cpp/include/raft/spectral/kmeans.hpp              | 2 +-
 cpp/include/raft/spectral/matrix_wrappers.hpp     | 2 +-
 cpp/include/raft/spectral/spectral_util.hpp       | 2 +-
 cpp/include/raft/{spectral => utils}/sm_utils.hpp | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename cpp/include/raft/{spectral => utils}/sm_utils.hpp (100%)

diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index 9017d2b8d4..13f4d2c82a 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -33,7 +33,7 @@
 #include
 #include
 #include
-#include <raft/spectral/sm_utils.hpp>
+#include <raft/utils/sm_utils.hpp>
 #include

 namespace {
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 5f72da45b7..08f213cd3a 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -19,7 +19,7 @@
 #include
 #include
 #include
-#include <raft/spectral/sm_utils.hpp>
+#include <raft/utils/sm_utils.hpp>
 #include
 #include

diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp
index 1b90ab959f..b77375b33b 100644
--- a/cpp/include/raft/spectral/spectral_util.hpp
+++ b/cpp/include/raft/spectral/spectral_util.hpp
@@ -18,7 +18,7 @@
 #include
 #include
-#include <raft/spectral/sm_utils.hpp>
+#include <raft/utils/sm_utils.hpp>
 #include
 #include

diff --git a/cpp/include/raft/spectral/sm_utils.hpp b/cpp/include/raft/utils/sm_utils.hpp
similarity index 100%
rename from cpp/include/raft/spectral/sm_utils.hpp
rename to cpp/include/raft/utils/sm_utils.hpp

From 2b5a6cd0e32e9fd36a8ff053ce14c0f3abd32184 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Thu, 9 Jul 2020 17:16:14 -0500
Subject: [PATCH 84/88] Fixed CHECK_CUDA() calls.
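
The call sites below queue host<->device copies on the handle's stream and
then hand the stream itself to CHECK_CUDA, which is assumed to perform the
synchronize-and-check step. Stripped of macros, the pattern is:

    #include <cuda_runtime.h>
    #include <cstddef>

    // Queue the copy on the stream, then synchronize that one stream
    // before the host-produced results are consumed on the device.
    void copy_then_sync(double* d_out, const double* h_in, std::size_t n,
                        cudaStream_t stream) {
      cudaMemcpyAsync(d_out, h_in, n * sizeof(double),
                      cudaMemcpyHostToDevice, stream);
      cudaStreamSynchronize(stream);
    }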
--- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 37719579cb..f5967b556c 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -819,7 +819,7 @@ int computeSmallestEigenvectors( CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 13f4d2c82a..10357671d6 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -389,7 +389,7 @@ static int chooseNewCentroid(handle_t const& handle, value_type_t minSum{0}; CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream);//cudaStreamSynchronize(stream)); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); From e2873503a5baf3e780bc18702217759506d1306b Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Thu, 9 Jul 2020 17:25:56 -0500 Subject: [PATCH 85/88] Fixed CHECK_CUDA() redux. --- cpp/include/raft/linalg/lanczos.hpp | 2 +- cpp/include/raft/spectral/kmeans.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index f5967b556c..b775a1f696 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -1140,7 +1140,7 @@ int computeLargestEigenvectors( (*effIter) * nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 10357671d6..9e31c1ef5b 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -33,8 +33,8 @@ #include #include #include -#include #include +#include namespace { @@ -389,7 +389,7 @@ static int chooseNewCentroid(handle_t const& handle, value_type_t minSum{0}; CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CHECK_CUDA(stream);//cudaStreamSynchronize(stream)); + CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); From 13aa96e6ead7def67d058612e859fce854d8ae03 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 13 Jul 2020 10:51:41 -0500 Subject: [PATCH 86/88] Addressed comments on in-place initializers and thrust exe policy dox. 
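
The dox change spells out the invariant that the thrust execution policy
must be bound to the same stream the handle uses. A minimal sketch of
constructing such a policy:

    #include <thrust/system/cuda/execution_policy.h>
    #include <thrust/reduce.h>

    double sum_on_stream(const double* d_v, int n, cudaStream_t stream) {
      // stream should be the same one returned by handle.get_stream()
      auto policy = thrust::cuda::par.on(stream);
      return thrust::reduce(policy, d_v, d_v + n, 0.0);
    }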
--- cpp/include/raft/spectral/kmeans.hpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 9e31c1ef5b..10670c2721 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -220,14 +220,13 @@ static __global__ void minDistances2(index_type_t n, index_type_t* __restrict__ codes_old, index_type_t code_new) { // Loop index - index_type_t i; + index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; // Distances value_type_t dist_old_private; value_type_t dist_new_private; // Each row is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { // Get old and new distances dist_old_private = dists_old[i]; @@ -419,7 +418,8 @@ static int chooseNewCentroid(handle_t const& handle, * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -526,7 +526,8 @@ static int initializeCentroids( * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -602,7 +603,8 @@ static int assignCentroids( * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -714,7 +716,8 @@ namespace raft { * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -893,7 +896,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy. + * @param thrust_exec_policy thrust execution policy + * (assumed to be same as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. From d513d21474f851db64c68095db1ea47eeeb1c740 Mon Sep 17 00:00:00 2001 From: Andrei Schaffer Date: Mon, 13 Jul 2020 11:15:38 -0500 Subject: [PATCH 87/88] Addressed comment on replacing 65535 by named constant. 
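
The named constant encodes the CUDA limit of 65535 blocks along
gridDim.y/z (and along gridDim.x on older hardware); block counts are
clamped to it and the kernels cover any remainder with grid-stride loops.
The clamp, as a sketch:

    #include <algorithm>

    constexpr unsigned int kMaxGridCount = 65535;

    inline unsigned int clamped_grid_dim(unsigned int work_items,
                                         unsigned int items_per_block) {
      return std::min((work_items + items_per_block - 1) / items_per_block,
                      kMaxGridCount);
    }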
--- cpp/include/raft/spectral/kmeans.hpp | 36 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index 10670c2721..08913d41e7 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -419,7 +419,7 @@ static int chooseNewCentroid(handle_t const& handle, * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -460,6 +460,8 @@ static int initializeCentroids( auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + constexpr index_type_t grid_lower_bound{65535}; + // ------------------------------------------------------- // Implementation // ------------------------------------------------------- @@ -468,11 +470,13 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535)}; + dim3 gridDim_warp{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), + 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); @@ -527,7 +531,7 @@ static int initializeCentroids( * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -581,7 +585,7 @@ static int assignCentroids( blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); gridDim.y = 1; gridDim.z = 1; minDistances<<>>(n, k, dists, codes, @@ -604,7 +608,7 @@ static int assignCentroids( * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param thrust_exec_policy thrust execution policy - * (assumed to be same as handle.stream). + * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. 
@@ -642,6 +646,8 @@ static int updateCentroids(handle_t const& handle,
   const value_type_t one = 1;
   const value_type_t zero = 0;
 
+  constexpr index_type_t grid_lower_bound{65535};
+
   auto cublas_h = handle.get_cublas_handle();
   auto stream = handle.get_stream();
 
@@ -689,8 +695,9 @@ static int updateCentroids(handle_t const& handle,
   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
   // CUDA grid dimensions
-  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, 65535),
-               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535), 1};
+  dim3 gridDim{
+    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+    min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1};
 
   divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
 
@@ -717,7 +724,7 @@ namespace raft {
  * @tparam thrust_exe_pol_t the type of thrust execution policy.
  * @param handle the raft handle.
  * @param thrust_exec_policy thrust execution policy
- * (assumed to be same as handle.stream).
+ * (assumed to have same stream as handle.stream).
  * @param n Number of observation vectors.
  * @param d Dimension of observation vectors.
  * @param k Number of clusters.
@@ -764,6 +771,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
   // Current iteration
   index_type_t iter;
 
+  constexpr index_type_t grid_lower_bound{65535};
+
   // Residual sum of squares at previous iteration
   value_type_t residualPrev = 0;
 
@@ -790,8 +799,9 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
     dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
     dim3 gridDim{
-      min((d + WARP_SIZE - 1) / WARP_SIZE, 65535), 1,
-      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535)};
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE),
+          grid_lower_bound)};
 
     CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
     computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
@@ -897,7 +907,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
  * @tparam thrust_exe_pol_t the type of thrust execution policy.
  * @param handle the raft handle.
  * @param thrust_exec_policy thrust execution policy
- * (assumed to be same as handle.stream).
+ * (assumed to have same stream as handle.stream).
  * @param n Number of observation vectors.
  * @param d Dimension of observation vectors.
  * @param k Number of clusters.

From e16b9c413511913a112179c63ce637db5ed83459 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer
Date: Mon, 13 Jul 2020 19:21:17 -0500
Subject: [PATCH 88/88] Fixed a file inclusion style rejected in CI: use angle
 brackets instead of quotes.

---
 cpp/include/raft/spectral/modularity_maximization.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index a920eb39c9..f8dfe5daa3 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -32,11 +32,11 @@
 #include <vector>
 
 #ifdef COLLECT_TIME_STATISTICS
+#include <cuda_profiler_api.h>
 #include <sys/resource.h>
 #include <sys/sysinfo.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#include "cuda_profiler_api.h"
 #endif
 
 #ifdef COLLECT_TIME_STATISTICS
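
For reference, the kind of wall-clock helper the COLLECT_TIME_STATISTICS
includes above exist to support (hypothetical helper; the actual
instrumentation code is not part of this patch):

    #ifdef COLLECT_TIME_STATISTICS
    #include <sys/time.h>

    static double timer() {
      struct timeval tv;
      gettimeofday(&tv, nullptr);
      return tv.tv_sec + tv.tv_usec * 1e-6;
    }
    #endif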