From 9aface197eec36880a0d01aafedb46d23b611375 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Fri, 10 May 2019 11:46:50 -0700
Subject: [PATCH] fixed issues after rebasing

---
 cpp/src/COOtoCSR.cuh         |  4 +--
 cpp/src/cugraph.cu           |  4 +--
 cpp/src/graph_utils.cuh      | 60 +++++++++++++++---------------------
 cpp/src/grmat.cu             |  2 +-
 cpp/src/renumber.cuh         |  2 +-
 cpp/src/snmg/pagerank.cuh    |  6 ++--
 cpp/src/snmg/spmv.cuh        |  4 +--
 cpp/src/two_hop_neighbors.cu |  2 +-
 8 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/cpp/src/COOtoCSR.cuh b/cpp/src/COOtoCSR.cuh
index 0324062db32..f00b352f0e4 100644
--- a/cpp/src/COOtoCSR.cuh
+++ b/cpp/src/COOtoCSR.cuh
@@ -74,7 +74,7 @@ gdf_error ConvertCOOtoCSR(T* sources, T* destinations, int64_t nnz, CSR_Result
diff --git a/cpp/src/cugraph.cu b/cpp/src/cugraph.cu
--- a/cpp/src/cugraph.cu
+++ b/cpp/src/cugraph.cu
     if (col->data != nullptr) {
       ALLOC_FREE_TRY(col->data, stream);
     }
@@ -599,7 +599,7 @@ gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level,
     value_ptr = graph->adjList->edge_data->data;
   }
   else {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     d_values.resize(graph->adjList->indices->size);
     thrust::fill(rmm::exec_policy(stream)->on(stream), d_values.begin(), d_values.end(), 1.0);
     value_ptr = (void * ) thrust::raw_pointer_cast(d_values.data());
diff --git a/cpp/src/graph_utils.cuh b/cpp/src/graph_utils.cuh
index ea30cfc1857..b173cb61e8e 100644
--- a/cpp/src/graph_utils.cuh
+++ b/cpp/src/graph_utils.cuh
@@ -39,20 +39,6 @@ namespace cugraph
 #define DEFAULT_MASK 0xffffffff
 #define US
 
-//error check
-#ifdef DEBUG
-#define WHERE " at: " << __FILE__ << ':' << __LINE__
-#define cudaCheckError() { \
-  cudaError_t e=cudaGetLastError(); \
-  if(e!=cudaSuccess) { \
-    std::cerr << "Cuda failure: " << cudaGetErrorString(e) << WHERE << std::endl; \
-  } \
- }
-#else
-#define cudaCheckError()
-#define WHERE ""
-#endif
-
   template <typename T>
   static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) {
 #if __CUDA_ARCH__ >= 300
@@ -128,7 +114,7 @@ namespace cugraph
   //dot
   template <typename T>
   T dot(size_t n, T* x, T* y) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     T result = thrust::inner_product(rmm::exec_policy(stream)->on(stream),
                                      thrust::device_pointer_cast(x),
                                      thrust::device_pointer_cast(x + n),
@@ -153,7 +139,7 @@ namespace cugraph
 
   template <typename T>
   void axpy(size_t n, T a, T* x, T* y) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     thrust::transform(rmm::exec_policy(stream)->on(stream),
                       thrust::device_pointer_cast(x),
                       thrust::device_pointer_cast(x + n),
@@ -174,7 +160,7 @@ namespace cugraph
 
   template <typename T>
   T nrm2(size_t n, T* x) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     T init = 0;
     T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream)->on(stream),
                                                   thrust::device_pointer_cast(x),
@@ -188,7 +174,7 @@ namespace cugraph
 
   template <typename T>
   T nrm1(size_t n, T* x) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     T result = thrust::reduce(rmm::exec_policy(stream)->on(stream),
                               thrust::device_pointer_cast(x),
                               thrust::device_pointer_cast(x + n));
@@ -198,7 +184,7 @@ namespace cugraph
 
   template <typename T>
   void scal(size_t n, T val, T* x) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     thrust::transform(rmm::exec_policy(stream)->on(stream),
                       thrust::device_pointer_cast(x),
                       thrust::device_pointer_cast(x + n),
@@ -208,9 +194,21 @@ namespace cugraph
     cudaCheckError();
   }
 
+  template <typename T>
+  void addv(size_t n, T val, T* x) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::make_constant_iterator(val),
+                      thrust::device_pointer_cast(x),
+                      thrust::plus<T>());
+    cudaCheckError();
+  }
+
   template <typename T>
   void fill(size_t n, T* x, T value) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     thrust::fill(rmm::exec_policy(stream)->on(stream),
                  thrust::device_pointer_cast(x),
                  thrust::device_pointer_cast(x + n), value);
@@ -231,7 +229,7 @@ namespace cugraph
   void copy(size_t n, T *x, T *res) {
     thrust::device_ptr<T> dev_ptr(x);
     thrust::device_ptr<T> res_ptr(res);
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr);
     cudaCheckError();
   }
@@ -258,7 +256,7 @@ namespace cugraph
 
   template <typename T>
   void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     thrust::transform_if(rmm::exec_policy(stream)->on(stream),
                          thrust::device_pointer_cast(dangling_nodes),
                          thrust::device_pointer_cast(dangling_nodes + n),
@@ -275,21 +273,11 @@ namespace cugraph
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
       atomicAdd(&degree[ind[i]], 1.0);
   }
-  template <typename IndexType, typename ValueType>
-  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-  equi_prob(const IndexType n,
-            const IndexType e,
-            const IndexType *ind,
-            ValueType *val,
-            IndexType *degree) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
-      val[i] = 1.0 / degree[ind[i]];
-  }
 
   template <typename IndexType, typename ValueType>
   __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-  flag_leafs(const IndexType n, IndexType *degree, ValueType *bookmark) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
+  flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark) {
+    for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
       if (degree[i] == 0)
         bookmark[i] = 1.0;
   }
@@ -383,7 +371,7 @@ namespace cugraph
     nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
     nblocks.y = 1;
     nblocks.z = 1;
-    flag_leafs <<<nblocks, nthreads>>>(n, degree, bookmark);
+    flag_leafs_kernel <<<nblocks, nthreads>>>(n, degree, bookmark);
     cudaCheckError();
     ALLOC_FREE_TRY(degree, stream);
   }
@@ -406,7 +394,7 @@ namespace cugraph
   // This will sort the COO Matrix, row will be sorted and each column of same row will be sorted.
   template <typename IndexType, typename ValueType, typename SizeT>
   void remove_duplicate(IndexType* src, IndexType* dest, ValueType* val, SizeT &nnz) {
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     if (val != NULL) {
       thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
                                  thrust::raw_pointer_cast(val),
diff --git a/cpp/src/grmat.cu b/cpp/src/grmat.cu
index cd4523b5da6..8b5a50aacd7 100644
--- a/cpp/src/grmat.cu
+++ b/cpp/src/grmat.cu
@@ -177,7 +177,7 @@ gdf_error main_(gdf_column *src, gdf_column *dest, gdf_column *val, CommandLine
     if (util::SetDevice(gpu_idx[0]))
       return GDF_CUDA_ERROR;
 
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
     ALLOC_TRY((void**)&coo.row, sizeof(VertexId) * rmat_all_edges, stream);
     ALLOC_TRY((void**)&coo.col, sizeof(VertexId) * rmat_all_edges, stream);
     if (val != nullptr)
diff --git a/cpp/src/renumber.cuh b/cpp/src/renumber.cuh
index d59d3df4e50..b5681491776 100644
--- a/cpp/src/renumber.cuh
+++ b/cpp/src/renumber.cuh
@@ -203,7 +203,7 @@ namespace cugraph {
       //  We need 3 for hashing, and one array for data
       //
-      auto stream = cudaStream_t{nullptr};
+      cudaStream_t stream {nullptr};
 
       T_in *hash_data;
diff --git a/cpp/src/snmg/pagerank.cuh b/cpp/src/snmg/pagerank.cuh
index e19f3a855de..e8c4127ed5b 100644
--- a/cpp/src/snmg/pagerank.cuh
+++ b/cpp/src/snmg/pagerank.cuh
@@ -84,8 +84,8 @@ class SNMGpagerank
     e_loc = tmp_e;
     stream = nullptr;
     is_setup = false;
-    ALLOC_MANAGED_TRY ((void**)&bookmark, sizeof(ValueType) * v_glob, stream);
-    ALLOC_MANAGED_TRY ((void**)&val, sizeof(ValueType) * e_loc, stream);
+    ALLOC_TRY ((void**)&bookmark, sizeof(ValueType) * v_glob, stream);
+    ALLOC_TRY ((void**)&val, sizeof(ValueType) * e_loc, stream);
   }
   ~SNMGpagerank() {
     ALLOC_FREE_TRY(bookmark, stream);
@@ -113,7 +113,7 @@ class SNMGpagerank
     alpha=_alpha;
     ValueType zero = 0.0;
     IndexType *degree;
-    ALLOC_MANAGED_TRY ((void**)&degree, sizeof(IndexType) * v_glob, stream);
+    ALLOC_TRY ((void**)&degree, sizeof(IndexType) * v_glob, stream);
 
     // TODO snmg degree
     int nthreads = min(static_cast<int>(e_loc), 256);
diff --git a/cpp/src/snmg/spmv.cuh b/cpp/src/snmg/spmv.cuh
index 5d042a0a1c3..27e15a1b1e3 100644
--- a/cpp/src/snmg/spmv.cuh
+++ b/cpp/src/snmg/spmv.cuh
@@ -65,14 +65,14 @@ class SNMGcsrmv
     e_loc = tmp;
 
     // Allocate the local result
-    ALLOC_MANAGED_TRY ((void**)&y_loc, v_loc*sizeof(ValueType), stream);
+    ALLOC_TRY ((void**)&y_loc, v_loc*sizeof(ValueType), stream);
 
     // get temporary storage size for CUB
     cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes,
                            val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc);
     cudaCheckError();
 
     // Allocate CUB's temporary storage
-    ALLOC_MANAGED_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream);
+    ALLOC_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream);
   }
 
   ~SNMGcsrmv() {
diff --git a/cpp/src/two_hop_neighbors.cu b/cpp/src/two_hop_neighbors.cu
index 2f5a576c096..de8bd9bfb0c 100644
--- a/cpp/src/two_hop_neighbors.cu
+++ b/cpp/src/two_hop_neighbors.cu
@@ -39,7 +39,7 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     IndexType num_edges;
     cudaMemcpy(&num_edges, &offsets[num_verts], sizeof(IndexType), cudaMemcpyDefault);
 
-    auto stream = cudaStream_t{nullptr};
+    cudaStream_t stream {nullptr};
 
     // Allocate memory for temporary stuff
    IndexType *exsum_degree = nullptr;
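--

Notes (illustrative sketches appended for review; not part of the patch):

The new addv() added to graph_utils.cuh is an element-wise scalar add
(x[i] += val for i in [0, n)), built from the same thrust::transform pattern
as scal() but with thrust::plus in place of thrust::multiplies. A minimal
standalone equivalent on the default stream with plain Thrust (the in-tree
version runs through rmm::exec_policy; the _sketch name and the demo main()
below are assumptions for illustration only):

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform.h>

template <typename T>
void addv_sketch(size_t n, T val, T* x) {
  // x[i] += val: read x, pair each element with the constant val,
  // add, and write the sum back into x in place.
  thrust::transform(thrust::device_pointer_cast(x),
                    thrust::device_pointer_cast(x + n),
                    thrust::make_constant_iterator(val),
                    thrust::device_pointer_cast(x),
                    thrust::plus<T>());
}

int main() {
  thrust::device_vector<float> v(8, 1.0f);      // eight 1.0f values on the GPU
  addv_sketch(v.size(), 2.0f, v.data().get());  // every element becomes 3.0f
  return 0;
}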
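The renamed flag_leafs_kernel() keeps the usual grid-stride-loop shape: each
thread starts at its global index and strides by the total thread count,
flagging zero-degree (dangling) vertices in the bookmark array. A
self-contained demo of that kernel and its <<<nblocks, nthreads>>> launch
(the kernel body mirrors the patch; the host-side setup is an assumption,
not taken from the patch):

#include <cstdio>
#include <cuda_runtime.h>

template <typename IndexType, typename ValueType>
__global__ void flag_leafs_kernel(const size_t n, const IndexType *degree,
                                  ValueType *bookmark) {
  // Grid-stride loop: covers all n entries regardless of grid size.
  for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += gridDim.x * blockDim.x)
    if (degree[i] == 0)
      bookmark[i] = 1.0;  // mark dangling vertex
}

int main() {
  const size_t n = 5;
  int h_degree[n] = {2, 0, 1, 0, 3};
  int *d_degree;
  float *d_bookmark;
  cudaMalloc(&d_degree, n * sizeof(int));
  cudaMalloc(&d_bookmark, n * sizeof(float));
  cudaMemcpy(d_degree, h_degree, n * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_bookmark, 0, n * sizeof(float));

  dim3 nthreads(256);
  dim3 nblocks((n + nthreads.x - 1) / nthreads.x);
  flag_leafs_kernel<<<nblocks, nthreads>>>(n, d_degree, d_bookmark);
  cudaDeviceSynchronize();

  float h_bookmark[n];
  cudaMemcpy(h_bookmark, d_bookmark, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < n; ++i)
    printf("vertex %zu: %.1f\n", i, h_bookmark[i]);  // 1.0 at vertices 1 and 3
  cudaFree(d_degree);
  cudaFree(d_bookmark);
  return 0;
}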
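The ALLOC_MANAGED_TRY -> ALLOC_TRY changes in snmg/pagerank.cuh and
snmg/spmv.cuh move the PageRank bookmark/val/degree buffers and CUB's scratch
space from CUDA managed memory to ordinary device memory, as the macro names
suggest (the RMM macros add pooling and error handling on top and are not
reproduced here). In bare CUDA the difference amounts to:

#include <cuda_runtime.h>

int main() {
  const size_t n = 1 << 20;
  float *p_managed = nullptr;
  float *p_device = nullptr;

  // Managed: one pointer usable on host and device, migrated on demand.
  cudaMallocManaged(&p_managed, n * sizeof(float));
  // Device-only: GPU-resident, not host-dereferenceable, no migration cost.
  cudaMalloc(&p_device, n * sizeof(float));

  cudaFree(p_managed);
  cudaFree(p_device);
  return 0;
}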