diff --git a/README.md b/README.md index 2f26a7511aa..7594e359c74 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,23 @@ There are 3 ways to get cuGraph :

--- +# cuGraph News + +### Scaling to 1 Trillion Edges +cuGraph was recently tested on the Selene supercomputer using 2,048 GPUs and processing a graph with `1.1 Trillion edges`. + +
 
cuGraph Scaling
+

+ +### cuGraph Software Stack +cuGraph has a new multi-layer software stack that allows users and system integrators to access cuGraph at different layers. + +
 
cuGraph Software Stack
+

+ + + + # Currently Supported Features As of Release 21.08 - including 21.08 nightly @@ -50,24 +67,24 @@ _Italic_ algorithms are planned for future releases. | ------------ | -------------------------------------- | ------------ | ------------------- | | Centrality | | | | | | Katz | Multi-GPU | | -| | Betweenness Centrality | Single-GPU | | +| | Betweenness Centrality | Single-GPU | MG planned for 22.08 | | | Edge Betweenness Centrality | Single-GPU | | +| | _Eigenvector Centrality_ | | _MG planned for 22.06_ | | Community | | | | -| | EgoNet | Single-GPU | | | | Leiden | Single-GPU | | | | Louvain | Multi-GPU | [C++ README](cpp/src/community/README.md#Louvain) | | | Ensemble Clustering for Graphs | Single-GPU | | | | Spectral-Clustering - Balanced Cut | Single-GPU | | | | Spectral-Clustering - Modularity | Single-GPU | | | | Subgraph Extraction | Single-GPU | | -| | Triangle Counting | Single-GPU | | -| | K-Truss | Single-GPU | | +| | Triangle Counting | Single-GPU | MG planned for 22.06 | +| | K-Truss | Single-GPU | MG planned for 22.10 | | Components | | | | | | Weakly Connected Components | Multi-GPU | | -| | Strongly Connected Components | Single-GPU | | +| | Strongly Connected Components | Single-GPU | MG planned for 22.06 | | Core | | | | -| | K-Core | Single-GPU | | -| | Core Number | Single-GPU | | +| | K-Core | Single-GPU | MG planned for 22.10 | +| | Core Number | Single-GPU | MG planned for 22.08 | | _Flow_ | | | | | | _MaxFlow_ | --- | | | _Influence_ | | | | @@ -79,7 +96,7 @@ _Italic_ algorithms are planned for future releases. 
| Link Analysis| | | | | | Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Pagerank) | | | Personal Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank) | -| | HITS | Single-GPU | Multi-GPU C code is ready, Python wrapper in 22.04 | +| | HITS | Multi-GPU | | | Link Prediction | | | | | | Jaccard Similarity | Single-GPU | | | | Weighted Jaccard Similarity | Single-GPU | | @@ -89,10 +106,12 @@ _Italic_ algorithms are planned for future releases. | Sampling | | | | | | Random Walks (RW) | Single-GPU | Biased and Uniform | | | Egonet | Single-GPU | multi-seed | -| | _node2vec_ | --- | C code is ready, Python wrapper coming in 22.04 | +| | Node2Vec | Single-GPU | | +| | Neighborhood sampling | Multi-GPU | | | Traversal | | | | | | Breadth First Search (BFS) | Multi-GPU | with cutoff support
[C++ README](cpp/src/traversal/README.md#BFS) | | | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) | +| | _ASSP / APSP_ | | | | Tree | | | | | | Minimum Spanning Tree | Single-GPU | | | | Maximum Spanning Tree | Single-GPU | | @@ -164,20 +183,20 @@ Install and update cuGraph using the conda command: ```bash -# CUDA 11.0 -conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.0 -# CUDA 11.2 -conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.2 + + # CUDA 11.4 conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.4 # CUDA 11.5 conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.5 + +For CUDA > 11.5, please use the 11.5 environment ``` -Note: This conda installation only applies to Linux and Python versions 3.7/3.8. +Note: This conda installation only applies to Linux and Python versions 3.8/3.9. ## Build from Source and Contributing diff --git a/ci/test.sh b/ci/test.sh index c4b64eff852..8f4c88c6291 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -96,9 +96,9 @@ cd ${CUGRAPH_ROOT}/python/pylibcugraph/pylibcugraph pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-pylibcugraph-pytests.xml -v --cov-config=.coveragerc --cov=pylibcugraph --cov-report=xml:${WORKSPACE}/python/pylibcugraph/pylibcugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable echo "Ran Python pytest for pylibcugraph : return code was: $?, test script exit code is now: $EXITCODE" -echo "Python pytest for cuGraph..." +echo "Python pytest for cuGraph (single-GPU only)..." 
cd ${CUGRAPH_ROOT}/python/cugraph/cugraph -pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable +pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=raft --ignore=tests/dask --benchmark-disable echo "Ran Python pytest for cugraph : return code was: $?, test script exit code is now: $EXITCODE" echo "Python benchmarks for cuGraph (running as tests)..." diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml deleted file mode 100644 index 2bf1c588217..00000000000 --- a/conda/environments/cugraph_dev_cuda11.0.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: cugraph_dev -channels: -- rapidsai -- nvidia -- rapidsai-nightly -- conda-forge -dependencies: -- cudatoolkit=11.0 -- libcugraphops=22.06.* -- cudf=22.06.* -- libcudf=22.06.* -- rmm=22.06.* -- librmm=22.06.* -- libraft-headers=22.06.* -- pyraft=22.06.* -- cuda-python>=11.5,<12.0 -- dask==2022.03.0 -- distributed==2022.03.0 -- dask-cuda=22.06.* -- dask-cudf=22.06.* -- nccl>=2.9.9 -- ucx-py=0.26.* -- ucx-proc=*=gpu -- scipy -- networkx>=2.5.1 -- clang=11.1.0 -- clang-tools=11.1.0 -- cmake>=3.20.1 -- python>=3.6,<3.9 -- notebook>=0.5.0 -- boost -- cython>=0.29,<0.30 -- pytest -- scikit-learn>=0.23.1 -- sphinx -- pydata-sphinx-theme -- sphinxcontrib-websupport -- sphinx-markdown-tables -- sphinx-copybutton -- nbsphinx -- numpydoc -- ipython -- recommonmark -- pip -- rapids-pytest-benchmark -- doxygen -- pytest-cov -- gtest=1.10.0 -- gmock=1.10.0 diff --git a/conda/environments/cugraph_dev_cuda11.2.yml b/conda/environments/cugraph_dev_cuda11.2.yml index bf0203c1702..35d3dc5bd36 100644 --- a/conda/environments/cugraph_dev_cuda11.2.yml +++ 
b/conda/environments/cugraph_dev_cuda11.2.yml @@ -25,7 +25,7 @@ dependencies: - networkx>=2.5.1 - clang=11.1.0 - clang-tools=11.1.0 -- cmake>=3.20.1 +- cmake>=3.20.1,<3.23 - python>=3.6,<3.9 - notebook>=0.5.0 - boost diff --git a/conda/environments/cugraph_dev_cuda11.4.yml b/conda/environments/cugraph_dev_cuda11.4.yml index 21ac68a79e2..e1f65a33992 100644 --- a/conda/environments/cugraph_dev_cuda11.4.yml +++ b/conda/environments/cugraph_dev_cuda11.4.yml @@ -25,7 +25,7 @@ dependencies: - networkx>=2.5.1 - clang=11.1.0 - clang-tools=11.1.0 -- cmake>=3.20.1 +- cmake>=3.20.1,<3.23 - python>=3.6,<3.9 - notebook>=0.5.0 - boost diff --git a/conda/environments/cugraph_dev_cuda11.5.yml b/conda/environments/cugraph_dev_cuda11.5.yml index 955eb591a7d..7a1add74dba 100644 --- a/conda/environments/cugraph_dev_cuda11.5.yml +++ b/conda/environments/cugraph_dev_cuda11.5.yml @@ -25,7 +25,7 @@ dependencies: - networkx>=2.5.1 - clang=11.1.0 - clang-tools=11.1.0 -- cmake>=3.20.1 +- cmake>=3.20.1,<3.23 - python>=3.6,<3.9 - notebook>=0.5.0 - boost diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index 68d4e626e29..edc144b111a 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -34,7 +34,7 @@ build: requirements: build: - - cmake>=3.20.1 + - cmake>=3.20.1,<3.23 - doxygen>=1.8.11 - cudatoolkit {{ cuda_version }}.* - libraft-headers {{ minor_version }} diff --git a/conda/recipes/libcugraph_etl/meta.yaml b/conda/recipes/libcugraph_etl/meta.yaml index d039f30fb4a..3334186ebfa 100644 --- a/conda/recipes/libcugraph_etl/meta.yaml +++ b/conda/recipes/libcugraph_etl/meta.yaml @@ -34,7 +34,7 @@ build: requirements: build: - - cmake>=3.20.1 + - cmake>=3.20.1,<3.23 - doxygen>=1.8.11 - cudatoolkit {{ cuda_version }}.* - libcudf {{ minor_version }}.* diff --git a/cpp/include/cugraph_c/algorithms.h b/cpp/include/cugraph_c/algorithms.h index c156a72ec35..cfaa89bb490 100644 --- a/cpp/include/cugraph_c/algorithms.h +++ 
b/cpp/include/cugraph_c/algorithms.h @@ -514,20 +514,21 @@ typedef struct { * replacement. If false selection is done without replacement. * @param [in] do_expensive_check * A flag to run expensive checks for input arguments (if set to true) - * @param [in] result Output from the uniform_nbr_sample call + * @param [in] result Output from the uniform_neighbor_sample call * @param [out] error Pointer to an error object storing details of any error. Will * be populated if error code is not CUGRAPH_SUCCESS * @return error code */ -cugraph_error_code_t uniform_nbr_sample(const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start, - const cugraph_type_erased_device_array_view_t* start_label, - const cugraph_type_erased_host_array_view_t* fan_out, - bool_t without_replacement, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error); +cugraph_error_code_t cugraph_uniform_neighbor_sample( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* start, + const cugraph_type_erased_device_array_view_t* start_label, + const cugraph_type_erased_host_array_view_t* fan_out, + bool_t with_replacement, + bool_t do_expensive_check, + cugraph_sample_result_t** result, + cugraph_error_t** error); /** * @brief Get the source vertices from the sampling algorithm result diff --git a/cpp/include/cugraph_c/array.h b/cpp/include/cugraph_c/array.h index 925d2f34ea5..273225dcc86 100644 --- a/cpp/include/cugraph_c/array.h +++ b/cpp/include/cugraph_c/array.h @@ -223,6 +223,22 @@ data_type_id_t cugraph_type_erased_host_array_type(const cugraph_type_erased_hos */ void* cugraph_type_erased_host_array_pointer(const cugraph_type_erased_host_array_view_t* p); +/** + * @brief Copy data between two type erased device array views + * + * @param [in] handle Handle for accessing resources + * @param [out] dst Pointer to type erased host array view 
destination + * @param [in] src Pointer to type erased host array view source + * @param [out] error Pointer to an error object storing details of any error. Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_type_erased_host_array_view_copy( + const cugraph_resource_handle_t* handle, + cugraph_type_erased_host_array_view_t* dst, + const cugraph_type_erased_host_array_view_t* src, + cugraph_error_t** error); + /** * @brief Copy data from host to a type erased device array view * diff --git a/cpp/src/c_api/array.cpp b/cpp/src/c_api/array.cpp index 3d5671143dd..760a68d95fe 100644 --- a/cpp/src/c_api/array.cpp +++ b/cpp/src/c_api/array.cpp @@ -150,8 +150,7 @@ extern "C" cugraph_error_code_t cugraph_type_erased_host_array_create( size_t n_bytes = n_elems * (::data_type_sz[dtype]); *array = reinterpret_cast( - new cugraph::c_api::cugraph_type_erased_host_array_t{ - std::make_unique(n_bytes), n_elems, n_bytes, dtype}); + new cugraph::c_api::cugraph_type_erased_host_array_t{n_elems, n_bytes, dtype}); return CUGRAPH_SUCCESS; } catch (std::exception const& ex) { @@ -223,6 +222,46 @@ extern "C" void* cugraph_type_erased_host_array_pointer( return internal_pointer->data_; } +extern "C" cugraph_error_code_t cugraph_type_erased_host_array_view_copy( + const cugraph_resource_handle_t* handle, + cugraph_type_erased_host_array_view_t* dst, + const cugraph_type_erased_host_array_view_t* src, + cugraph_error_t** error) +{ + *error = nullptr; + + try { + auto p_handle = reinterpret_cast(handle); + auto internal_pointer_dst = + reinterpret_cast(dst); + auto internal_pointer_src = + reinterpret_cast(src); + + if (!handle) { + *error = reinterpret_cast( + new cugraph::c_api::cugraph_error_t{"invalid resource handle"}); + return CUGRAPH_INVALID_HANDLE; + } + + if (internal_pointer_src->num_bytes() != internal_pointer_dst->num_bytes()) { + *error = reinterpret_cast( + new cugraph::c_api::cugraph_error_t{"source and 
destination arrays are different sizes"}); + return CUGRAPH_INVALID_INPUT; + } + + raft::copy(reinterpret_cast(internal_pointer_dst->data_), + reinterpret_cast(internal_pointer_src->data_), + internal_pointer_src->num_bytes(), + p_handle->handle_->get_stream()); + + return CUGRAPH_SUCCESS; + } catch (std::exception const& ex) { + auto tmp_error = new cugraph::c_api::cugraph_error_t{ex.what()}; + *error = reinterpret_cast(tmp_error); + return CUGRAPH_UNKNOWN_ERROR; + } +} + extern "C" cugraph_error_code_t cugraph_type_erased_device_array_view_copy_from_host( const cugraph_resource_handle_t* handle, cugraph_type_erased_device_array_view_t* dst, @@ -286,7 +325,6 @@ extern "C" cugraph_error_code_t cugraph_type_erased_device_array_view_copy_to_ho return CUGRAPH_UNKNOWN_ERROR; } } - extern "C" cugraph_error_code_t cugraph_type_erased_device_array_view_copy( const cugraph_resource_handle_t* handle, cugraph_type_erased_device_array_view_t* dst, diff --git a/cpp/src/c_api/array.hpp b/cpp/src/c_api/array.hpp index 26465e05d3b..a309f39f685 100644 --- a/cpp/src/c_api/array.hpp +++ b/cpp/src/c_api/array.hpp @@ -51,7 +51,6 @@ struct cugraph_type_erased_device_array_view_t { struct cugraph_type_erased_device_array_t { // NOTE: size must be first here because the device buffer is released size_t size_; - // Why doesn't rmm::device_buffer support release? 
rmm::device_buffer data_; data_type_id_t type_; @@ -87,15 +86,37 @@ struct cugraph_type_erased_host_array_view_t { return reinterpret_cast(data_); } + template + T const* as_type() const + { + return reinterpret_cast(data_); + } + size_t num_bytes() const { return num_bytes_; } }; struct cugraph_type_erased_host_array_t { - std::unique_ptr data_; - size_t size_; - size_t num_bytes_; + std::unique_ptr data_{nullptr}; + size_t size_{0}; + size_t num_bytes_{0}; data_type_id_t type_; + cugraph_type_erased_host_array_t(size_t size, size_t num_bytes, data_type_id_t type) + : data_(std::make_unique(num_bytes)), + size_(size), + num_bytes_(num_bytes), + type_(type) + { + } + + template + cugraph_type_erased_host_array_t(std::vector& vec, data_type_id_t type) + : size_(vec.size()), num_bytes_(vec.size() * sizeof(T)), type_(type) + { + data_ = std::make_unique(num_bytes_); + std::copy(vec.begin(), vec.end(), reinterpret_cast(data_.get())); + } + auto view() { return new cugraph_type_erased_host_array_view_t{data_.get(), size_, num_bytes_, type_}; diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index 288fe00d2a7..a503506be21 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -123,6 +123,12 @@ struct node2vec_functor : public abstract_functor { // std::make_unique(2, p_, q_, false)); std::make_unique(cugraph::sampling_strategy_t::NODE2VEC, p_, q_)); + // + // Need to unrenumber the vertices in the resulting paths + // + unrenumber_local_int_vertices( + handle_, paths.data(), paths.size(), number_map->data(), 0, paths.size() - 1, false); + result_ = new cugraph_random_walk_result_t{ compress_result_, max_depth_, diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index ccb0f94f54f..a1537bc7510 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ 
-43,24 +44,21 @@ struct cugraph_sample_result_t { namespace { -#if 0 -// FIXME: ifdef this out for now. Can't be implemented until PR 2073 is merged - - struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor { +struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; - cugraph_graph_t* graph_{nullptr}; - cugraph_type_erased_device_array_view_t const* start_{nullptr}; - cugraph_type_erased_device_array_view_t const* start_label_{nullptr}; - cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; + cugraph::c_api::cugraph_graph_t* graph_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_label_{nullptr}; + cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; bool with_replacement_{false}; bool do_expensive_check_{false}; - cugraph_sample_result_t* result_{nullptr}; + cugraph::c_api::cugraph_sample_result_t* result_{nullptr}; - uniform_neighbor_sampling_functor(::cugraph_resource_handle_t const* handle, - ::cugraph_graph_t* graph, - ::cugraph_type_erased_device_array_view_t const* start, - ::cugraph_type_erased_device_array_view_t const* start_label, - ::cugraph_type_erased_host_array_view_t const* fan_out, + uniform_neighbor_sampling_functor(cugraph_resource_handle_t const* handle, + cugraph_graph_t* graph, + cugraph_type_erased_device_array_view_t const* start, + cugraph_type_erased_device_array_view_t const* start_label, + cugraph_type_erased_host_array_view_t const* fan_out, bool with_replacement, bool do_expensive_check) : abstract_functor(), @@ -70,8 +68,8 @@ namespace { reinterpret_cast(start)), start_label_(reinterpret_cast( start_label)), - fan_out_(reinterpret_cast( - fan_out)), + fan_out_( + reinterpret_cast(fan_out)), with_replacement_(with_replacement), do_expensive_check_(do_expensive_check) { @@ -87,6 +85,8 @@ namespace { 
// FIXME: Think about how to handle SG vice MG if constexpr (!cugraph::is_candidate::value) { unsupported(); + } else if constexpr (!multi_gpu) { + unsupported(); } else { // uniform_nbr_sample expects store_transposed == false if constexpr (store_transposed) { @@ -110,55 +110,57 @@ namespace { // // Need to renumber sources // - renumber_ext_vertices(handle_, - start.data(), - start.size(), - number_map->data(), - graph_view.get_local_vertex_first(), - graph_view.get_local_vertex_last(), - false); - - // TODO: How can I do this? - auto [(srcs, dsts, labels, indices), counts] = cugraph::uniform_nbr_sample( - handle_, - graph_view, - start.data(), - start_label_.as_type(), - start.size(), - fanout_, - with_replacement_); - - result_ = new cugraph_sample_result_t{ - new cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_), - new cugraph_type_erased_device_array_t(dsts, graph_->weight_type_), - new cugraph_type_erased_device_array_t(labels, label_type), - new cugraph_type_erased_device_array_t(indices, graph_->edge_type_), - new cugraph_type_erased_host_array_t(counts, graph_->vertex_type_)}; + cugraph::renumber_ext_vertices(handle_, + start.data(), + start.size(), + number_map->data(), + graph_view.get_local_vertex_first(), + graph_view.get_local_vertex_last(), + false); + + // C++ API wants an std::vector + std::vector fan_out(fan_out_->size_); + std::copy_n(fan_out_->as_type(), fan_out_->size_, fan_out.data()); + + auto&& [tmp_tuple, counts] = cugraph::uniform_nbr_sample(handle_, + graph_view, + start.data(), + start_label_->as_type(), + start.size(), + fan_out, + with_replacement_); + + auto&& [srcs, dsts, labels, indices] = tmp_tuple; + + std::vector vertex_partition_lasts = graph_view.get_vertex_partition_lasts(); + + cugraph::unrenumber_int_vertices(handle_, + srcs.data(), + srcs.size(), + number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + cugraph::unrenumber_int_vertices(handle_, + dsts.data(), + dsts.size(), + 
number_map->data(), + vertex_partition_lasts, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_sample_result_t{ + new cugraph::c_api::cugraph_type_erased_device_array_t(srcs, graph_->vertex_type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(dsts, graph_->vertex_type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(labels, start_label_->type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(indices, graph_->edge_type_), + new cugraph::c_api::cugraph_type_erased_host_array_t(counts, graph_->vertex_type_)}; } } }; -#else - -struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_functor { - cugraph::c_api::cugraph_sample_result_t* result_{nullptr}; - - uniform_neighbor_sampling_functor() : abstract_functor() {} - - template - void operator()() - { - unsupported(); - } -}; -#endif } // namespace -extern "C" cugraph_error_code_t uniform_nbr_sample( +extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start, @@ -169,8 +171,8 @@ extern "C" cugraph_error_code_t uniform_nbr_sample( cugraph_sample_result_t** result, cugraph_error_t** error) { - uniform_neighbor_sampling_functor functor; - + uniform_neighbor_sampling_functor functor{ + handle, graph, start, start_labels, fan_out, with_replacement, do_expensive_check}; return cugraph::c_api::run_algorithm(graph, functor, result, error); } diff --git a/cpp/src/sampling/nbr_sampling_impl.cuh b/cpp/src/sampling/nbr_sampling_impl.cuh index 188603324b4..863407eb5b1 100644 --- a/cpp/src/sampling/nbr_sampling_impl.cuh +++ b/cpp/src/sampling/nbr_sampling_impl.cuh @@ -231,6 +231,9 @@ shuffle_to_target_gpu_ids(raft::handle_t const& handle, thrust::upper_bound(thrust::seq, gpu_id_first, gpu_id_last, static_cast(i)))); }); + thrust::adjacent_difference( + handle.get_thrust_policy(), tx_counts.begin(), tx_counts.end(), tx_counts.begin()); + 
std::vector h_tx_counts(tx_counts.size()); raft::update_host(h_tx_counts.data(), tx_counts.data(), tx_counts.size(), handle.get_stream()); @@ -332,8 +335,6 @@ uniform_nbr_sample_impl( if constexpr (graph_view_t::is_multi_gpu) { size_t num_starting_vs = d_in.size(); - CUGRAPH_EXPECTS(num_starting_vs > 0, - "Invalid input argument: starting vertex set cannot be null."); CUGRAPH_EXPECTS(num_starting_vs == d_ranks.size(), "Sets of input vertices and ranks must have same sizes."); @@ -360,112 +361,107 @@ uniform_nbr_sample_impl( size_t level{0l}; for (auto&& k_level : h_fan_out) { - // main body: - //{ // prep step for extracting out-degs(sources): // auto&& [d_new_in, d_new_rank] = gather_active_majors(handle, graph_view, d_in.cbegin(), d_in.cend(), d_ranks.cbegin()); - auto in_sz = d_in.size(); - if (in_sz > 0) { - rmm::device_uvector d_out_src(0, handle.get_stream()); - rmm::device_uvector d_out_dst(0, handle.get_stream()); - rmm::device_uvector d_out_ranks(0, handle.get_stream()); - rmm::device_uvector d_indices(0, handle.get_stream()); - if (k_level != 0) { - // extract out-degs(sources): - // - auto&& d_out_degs = - get_active_major_global_degrees(handle, graph_view, d_new_in, global_out_degrees); - - // segemented-random-generation of indices: - // - device_vec_t d_rnd_indices(d_new_in.size() * k_level, handle.get_stream()); - - cugraph_ops::Rng rng(row_rank + level); - cugraph_ops::get_sampling_index(detail::raw_ptr(d_rnd_indices), - rng, - detail::raw_const_ptr(d_out_degs), - static_cast(d_out_degs.size()), - static_cast(k_level), - flag_replacement, - handle.get_stream()); - - // gather edges step: - // invalid entries (not found, etc.) 
filtered out in result; - // d_indices[] filtered out in-place (to avoid copies+moves); - // - auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = - gather_local_edges(handle, - graph_view, - d_new_in, - d_new_rank, - std::move(d_rnd_indices), - static_cast(k_level), - global_degree_offsets, - global_adjacency_list_offsets); - d_out_src = std::move(temp_d_out_src); - d_out_dst = std::move(temp_d_out_dst); - d_out_ranks = std::move(temp_d_out_ranks); - d_indices = std::move(temp_d_indices); - } else { - auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = - gather_one_hop_edgelist( - handle, graph_view, d_new_in, d_new_rank, global_adjacency_list_offsets); - d_out_src = std::move(temp_d_out_src); - d_out_dst = std::move(temp_d_out_dst); - d_out_ranks = std::move(temp_d_out_ranks); - d_indices = std::move(temp_d_indices); - } - - // resize accumulators: - // - auto old_sz = d_acc_dst.size(); - auto add_sz = d_out_dst.size(); - auto new_sz = old_sz + add_sz; - - d_acc_src.resize(new_sz, handle.get_stream()); - d_acc_dst.resize(new_sz, handle.get_stream()); - d_acc_ranks.resize(new_sz, handle.get_stream()); - d_acc_indices.resize(new_sz, handle.get_stream()); - - // zip quad; must be done after resizing, - // because they grow from one iteration to another, - // so iterators could be invalidated: - // - auto acc_zip_it = - thrust::make_zip_iterator(thrust::make_tuple(d_acc_src.begin() + old_sz, - d_acc_dst.begin() + old_sz, - d_acc_ranks.begin() + old_sz, - d_acc_indices.begin() + old_sz)); + rmm::device_uvector d_out_src(0, handle.get_stream()); + rmm::device_uvector d_out_dst(0, handle.get_stream()); + rmm::device_uvector d_out_ranks(0, handle.get_stream()); + rmm::device_uvector d_indices(0, handle.get_stream()); - // union step: + if (k_level != 0) { + // extract out-degs(sources): // - auto out_zip_it = thrust::make_zip_iterator(thrust::make_tuple( - d_out_src.begin(), d_out_dst.begin(), d_out_ranks.begin(), 
d_indices.begin())); + auto&& d_out_degs = + get_active_major_global_degrees(handle, graph_view, d_new_in, global_out_degrees); - thrust::copy_n(handle.get_thrust_policy(), out_zip_it, add_sz, acc_zip_it); - - // shuffle step: update input for self_rank - // zipping is necessary to preserve rank info during shuffle! + // segemented-random-generation of indices: // - auto next_in_zip_begin = - thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.begin(), d_out_ranks.begin())); - auto next_in_zip_end = - thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.end(), d_out_ranks.end())); - - update_input_by_rank(handle, + device_vec_t d_rnd_indices(d_new_in.size() * k_level, handle.get_stream()); + + cugraph_ops::Rng rng(row_rank + level); + cugraph_ops::get_sampling_index(detail::raw_ptr(d_rnd_indices), + rng, + detail::raw_const_ptr(d_out_degs), + static_cast(d_out_degs.size()), + static_cast(k_level), + flag_replacement, + handle.get_stream()); + + // gather edges step: + // invalid entries (not found, etc.) 
filtered out in result; + // d_indices[] filtered out in-place (to avoid copies+moves); + // + auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = + gather_local_edges(handle, graph_view, - next_in_zip_begin, - next_in_zip_end, - static_cast(self_rank), - d_in, - d_ranks, - gpu_t{}); + d_new_in, + d_new_rank, + std::move(d_rnd_indices), + static_cast(k_level), + global_degree_offsets, + global_adjacency_list_offsets); + d_out_src = std::move(temp_d_out_src); + d_out_dst = std::move(temp_d_out_dst); + d_out_ranks = std::move(temp_d_out_ranks); + d_indices = std::move(temp_d_indices); + } else { + auto&& [temp_d_out_src, temp_d_out_dst, temp_d_out_ranks, temp_d_indices] = + gather_one_hop_edgelist( + handle, graph_view, d_new_in, d_new_rank, global_adjacency_list_offsets); + d_out_src = std::move(temp_d_out_src); + d_out_dst = std::move(temp_d_out_dst); + d_out_ranks = std::move(temp_d_out_ranks); + d_indices = std::move(temp_d_indices); } - //} + // resize accumulators: + // + auto old_sz = d_acc_dst.size(); + auto add_sz = d_out_dst.size(); + auto new_sz = old_sz + add_sz; + + d_acc_src.resize(new_sz, handle.get_stream()); + d_acc_dst.resize(new_sz, handle.get_stream()); + d_acc_ranks.resize(new_sz, handle.get_stream()); + d_acc_indices.resize(new_sz, handle.get_stream()); + + // zip quad; must be done after resizing, + // because they grow from one iteration to another, + // so iterators could be invalidated: + // + auto acc_zip_it = + thrust::make_zip_iterator(thrust::make_tuple(d_acc_src.begin() + old_sz, + d_acc_dst.begin() + old_sz, + d_acc_ranks.begin() + old_sz, + d_acc_indices.begin() + old_sz)); + + // union step: + // + auto out_zip_it = thrust::make_zip_iterator(thrust::make_tuple( + d_out_src.begin(), d_out_dst.begin(), d_out_ranks.begin(), d_indices.begin())); + + thrust::copy_n(handle.get_thrust_policy(), out_zip_it, add_sz, acc_zip_it); + + // shuffle step: update input for self_rank + // zipping is necessary to preserve rank 
info during shuffle! + // + auto next_in_zip_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.begin(), d_out_ranks.begin())); + auto next_in_zip_end = + thrust::make_zip_iterator(thrust::make_tuple(d_out_dst.end(), d_out_ranks.end())); + + update_input_by_rank(handle, + graph_view, + next_in_zip_begin, + next_in_zip_end, + static_cast(self_rank), + d_in, + d_ranks, + gpu_t{}); + ++level; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1482f4f810f..d79bbab5b29 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -621,6 +621,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_CREATE_GRAPH c_api/mg_create_graph_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_PAGERANK c_api/mg_pagerank_test.c c_api/mg_test_utils.cpp) ConfigureCTestMG(MG_CAPI_HITS c_api/mg_hits_test.c c_api/mg_test_utils.cpp) + ConfigureCTestMG(MG_CAPI_UNIFORM_NEIGHBOR_SAMPLE c_api/mg_uniform_neighbor_sample_test.c c_api/mg_test_utils.cpp) else() message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") endif() @@ -666,6 +667,7 @@ ConfigureCTest(CAPI_BFS_TEST c_api/bfs_test.c) ConfigureCTest(CAPI_SSSP_TEST c_api/sssp_test.c) ConfigureCTest(CAPI_EXTRACT_PATHS_TEST c_api/extract_paths_test.c) ConfigureCTest(CAPI_NODE2VEC_TEST c_api/node2vec_test.c) +ConfigureCTest(CAPI_UNIFORM_NEIGHBOR_SAMPLE c_api/uniform_neighbor_sample_test.c) ################################################################################################### ### enable testing ################################################################################ diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c new file mode 100644 index 00000000000..6b867652801 --- /dev/null +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_MG_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_vertices, + size_t num_edges, + vertex_t* h_start, + int* h_start_label, + size_t num_starts, + int* fan_out, + size_t max_depth, + bool_t with_replacement, + bool_t store_transposed) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + + cugraph_graph_t* graph = NULL; + cugraph_sample_result_t* result = NULL; + + cugraph_type_erased_device_array_t* d_start = NULL; + cugraph_type_erased_device_array_view_t* d_start_view = NULL; + cugraph_type_erased_device_array_t* d_start_label = NULL; + cugraph_type_erased_device_array_view_t* d_start_label_view = NULL; + cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; + + ret_code = create_mg_test_graph( + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, &graph, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed."); + + d_start_view = 
cugraph_type_erased_device_array_view(d_start); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_start_view, (byte_t*)h_start, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed."); + + d_start_label_view = cugraph_type_erased_device_array_view(d_start_label); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_start_label_view, (byte_t*)h_start_label, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + + h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_start_label_view, + h_fan_out_view, + with_replacement, + FALSE, + &result, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); + + cugraph_type_erased_device_array_view_t* srcs; + cugraph_type_erased_device_array_view_t* dsts; + cugraph_type_erased_device_array_view_t* labels; + cugraph_type_erased_device_array_view_t* index; + cugraph_type_erased_host_array_view_t* counts; + + srcs = cugraph_sample_result_get_sources(result); + dsts = cugraph_sample_result_get_destinations(result); + labels = cugraph_sample_result_get_start_labels(result); + index = cugraph_sample_result_get_index(result); + counts = cugraph_sample_result_get_counts(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(srcs); + + vertex_t h_srcs[result_size]; + vertex_t h_dsts[result_size]; + int h_labels[result_size]; + edge_t h_index[result_size]; + size_t* h_counts; + + ret_code = + 
cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_labels, labels, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts); + + // NOTE: The C++ tester does a more thorough validation. For our purposes + // here we will do a simpler validation, merely checking that all edges + // are actually part of the graph + weight_t M[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) + M[i][j] = 0.0; + + for (int i = 0; i < num_edges; ++i) + M[h_src[i]][h_dst[i]] = h_wgt[i]; + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + M[h_srcs[i]][h_dsts[i]] > 0.0, + "uniform_neighbor_sample got edge that doesn't exist"); + + bool_t found = FALSE; + for (int j = 0; j < num_starts; ++j) + found = found || (h_labels[i] == h_start_label[j]); + + TEST_ASSERT(test_ret_value, found, "invalid label"); + } + + cugraph_type_erased_host_array_view_free(h_fan_out_view); + + return test_ret_value; +} + +int test_uniform_neighbor_sample(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 8; + size_t num_vertices = 6; + size_t fan_out_size = 2; + size_t num_starts = 2; + + vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + 
weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t start[] = {2, 2}; + vertex_t start_labels[] = {0, 1}; + int fan_out[] = {1, 2}; + + return generic_uniform_neighbor_sample_test(handle, + src, + dst, + wgt, + num_vertices, + num_edges, + start, + start_labels, + num_starts, + fan_out, + fan_out_size, + TRUE, + FALSE); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + // Set up MPI: + int comm_rank; + int comm_size; + int num_gpus_per_node; + cudaError_t status; + int mpi_status; + int result = 0; + cugraph_resource_handle_t* handle = NULL; + cugraph_error_t* ret_error; + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + int prows = 1; + + C_MPI_TRY(MPI_Init(&argc, &argv)); + C_MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank)); + C_MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); + C_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + C_CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); + + void* raft_handle = create_raft_handle(prows); + handle = cugraph_create_resource_handle(raft_handle); + + if (result == 0) { + result |= RUN_MG_TEST(test_uniform_neighbor_sample, handle); + + cugraph_free_resource_handle(handle); + } + + free_raft_handle(raft_handle); + + C_MPI_TRY(MPI_Finalize()); + + return result; +} diff --git a/cpp/tests/c_api/node2vec_test.c b/cpp/tests/c_api/node2vec_test.c index 8f13c8dafb0..979e5a7a82b 100644 --- a/cpp/tests/c_api/node2vec_test.c +++ b/cpp/tests/c_api/node2vec_test.c @@ -203,11 +203,36 @@ int test_node2vec_short_sparse() src, dst, wgt, seeds, num_vertices, num_edges, 2, max_depth, TRUE, 0.8, 0.5, FALSE); } +int test_node2vec_karate() +{ + size_t num_edges = 156; + size_t num_vertices = 34; + + vertex_t src[] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2, + 3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12, + 13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32, + 33, 32, 33, 33, 32, 33, 32, 33, 
25, 27, 29, 32, 33, 25, 27, + 31, 31, 29, 33, 33, 31, 33, 32, 33, 32, 33, 32, 33, 33, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, + 8, 8, 8, 9, 13, 14, 14, 15, 15, 18, 18, 19, 20, 20, 22, 22, + 23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26, 27, 28, 28, 29, + 29, 30, 30, 31, 31, 32}; + vertex_t dst[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,4,4,5,5,5,6,8,8,8,9,13,14,14,15,15,18,18,19,20,20,22,22,23,23,23,23,23,24,24,24,25,26,26,27,28,28,29,29,30,30,31,31,32,1,2,3,4,5,6,7,8,10,11,12,13,17,19,21,31,2,3,7,13,17,19,21,30,3,7,8,9,13,27,28,32,7,12,13,6,10,6,10,16,16,30,32,33,33,33,32,33,32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32,33,32,33,32,33,33}; + weight_t wgt[] = {1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f}; + vertex_t seeds[] = {12, 28, 20, 23, 15, 26}; + size_t max_depth = 5; + + return generic_node2vec_test( + src, dst, wgt, seeds, num_vertices, num_edges, 6, max_depth, TRUE, 0.8, 0.5, FALSE); +} + int main(int argc, char** argv) { int result = 0; result |= RUN_TEST(test_node2vec); result |= RUN_TEST(test_node2vec_short_dense); result |= RUN_TEST(test_node2vec_short_sparse); + result |= 
RUN_TEST(test_node2vec_karate); return result; } diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c new file mode 100644 index 00000000000..b844220677b --- /dev/null +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +int generic_uniform_neighbor_sample_test(vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_vertices, + size_t num_edges, + vertex_t* h_start, + int* h_start_label, + size_t num_starts, + int* fan_out, + size_t max_depth, + bool_t with_replacement, + bool_t renumber, + bool_t store_transposed) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error = NULL; + + cugraph_resource_handle_t* handle = NULL; + cugraph_graph_t* graph = NULL; + cugraph_sample_result_t* result = NULL; + + cugraph_type_erased_device_array_t* d_start = NULL; + cugraph_type_erased_device_array_view_t* d_start_view = NULL; + cugraph_type_erased_device_array_t* d_start_label = NULL; + cugraph_type_erased_device_array_view_t* d_start_label_view = NULL; + cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; + + handle = cugraph_create_resource_handle(NULL); + 
TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); + + ret_code = create_test_graph( + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, renumber, &graph, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed."); + + d_start_view = cugraph_type_erased_device_array_view(d_start); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_label, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_label create failed."); + + d_start_label_view = cugraph_type_erased_device_array_view(d_start_label); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, d_start_label_view, (byte_t*)h_start_label, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + + h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, max_depth, INT32); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_start_label_view, + h_fan_out_view, + with_replacement, + FALSE, + &result, + &ret_error); + + TEST_ASSERT(test_ret_value, + ret_code != CUGRAPH_SUCCESS, + "cugraph_uniform_neighbor_sample expected to fail in SG test"); + +#if 0 + // FIXME: cugraph_uniform_neighbor_sample does not support SG + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); + + cugraph_type_erased_device_array_view_t* srcs; + cugraph_type_erased_device_array_view_t* dsts; + cugraph_type_erased_device_array_view_t* labels; + cugraph_type_erased_device_array_view_t* index; + cugraph_type_erased_host_array_view_t* counts; + + srcs = 
cugraph_sample_result_get_sources(result); + dsts = cugraph_sample_result_get_destinations(result); + labels = cugraph_sample_result_get_start_labels(result); + index = cugraph_sample_result_get_index(result); + counts = cugraph_sample_result_get_counts(result); + + size_t result_size = cugraph_type_erased_device_array_view_size(srcs); + + vertex_t h_srcs[result_size]; + vertex_t h_dsts[result_size]; + int h_labels[result_size]; + edge_t h_index[result_size]; + size_t* h_counts; + + ret_code = + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_srcs, srcs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_dsts, dsts, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_labels, labels, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_index, index, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + h_counts = (size_t*)cugraph_type_erased_host_array_pointer(counts); + + // NOTE: The C++ tester does a more thorough validation. 
For our purposes + // here we will do a simpler validation, merely checking that all edges + // are actually part of the graph + weight_t M[num_vertices][num_vertices]; + + for (int i = 0; i < num_vertices; ++i) + for (int j = 0; j < num_vertices; ++j) + M[i][j] = 0.0; + + for (int i = 0; i < num_edges; ++i) + M[h_src[i]][h_dst[i]] = h_wgt[i]; + + for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + M[h_srcs[i]][h_dsts[i]] > 0.0, + "uniform_neighbor_sample got edge that doesn't exist"); + + bool_t found = FALSE; + for (int j = 0; j < num_starts; ++j) + found = found || (h_labels[i] == h_start_label[j]); + + TEST_ASSERT(test_ret_value, found, "invalid label"); + } + + cugraph_type_erased_host_array_view_free(h_fan_out_view); +#endif + + return test_ret_value; +} + +int test_uniform_neighbor_sample() +{ + size_t num_edges = 8; + size_t num_vertices = 6; + size_t fan_out_size = 2; + size_t num_starts = 2; + + vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t start[] = {2, 2}; + vertex_t start_labels[] = {0, 1}; + int fan_out[] = {1, 2}; + + return generic_uniform_neighbor_sample_test(src, + dst, + wgt, + num_vertices, + num_edges, + start, + start_labels, + num_starts, + fan_out, + fan_out_size, + TRUE, + FALSE, + FALSE); +} + +int main(int argc, char** argv) +{ + int result = 0; + result |= RUN_TEST(test_uniform_neighbor_sample); + return result; +} diff --git a/datasets/small_line.csv b/datasets/small_line.csv new file mode 100644 index 00000000000..68751f432a2 --- /dev/null +++ b/datasets/small_line.csv @@ -0,0 +1,9 @@ +0 1 1.0 +1 2 1.0 +2 3 1.0 +3 4 1.0 +4 5 1.0 +5 6 1.0 +6 7 1.0 +7 8 1.0 +8 9 1.0 \ No newline at end of file diff --git a/datasets/small_tree.csv b/datasets/small_tree.csv new file mode 100644 index 00000000000..e8216bbb6ad --- /dev/null +++ b/datasets/small_tree.csv @@ -0,0 +1,11 @@ +0 1 
1.0 +0 2 1.0 +0 3 1.0 +0 4 1.0 +1 5 1.0 +2 5 1.0 +3 5 1.0 +4 5 1.0 +5 6 1.0 +5 7 1.0 +5 8 1.0 diff --git a/docs/cugraph/source/basics/cugraph_blogs.rst b/docs/cugraph/source/basics/cugraph_blogs.rst index 35db20ff454..4d544787977 100644 --- a/docs/cugraph/source/basics/cugraph_blogs.rst +++ b/docs/cugraph/source/basics/cugraph_blogs.rst @@ -7,8 +7,12 @@ these blog posts provide deeper dives into features from cuGraph. Here, we've selected just a few that are of particular interest to cuGraph users: -BLOGS -============== +BLOGS & Conferences +==================== +2018 +------- + * `GTC18 Fall - RAPIDS: Benchmarking Graph Analytics on the DGX-2 `_ + 2019 ------- @@ -16,7 +20,8 @@ BLOGS * `RAPIDS cuGraph — The vision and journey to version 1.0 and beyond `_ * `RAPIDS cuGraph : multi-GPU PageRank `_ * `Similarity in graphs: Jaccard versus the Overlap Coefficient `_ - + * `GTC19 Spring - Accelerating Graph Algorithms with RAPIDS `_ + * `GTC19 Fall - Multi-Node Multi-GPU Machine Learning and Graph Analytics with RAPIDS `_ 2020 ------ @@ -24,10 +29,19 @@ BLOGS * `Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs `_ * `RAPIDS cuGraph adds NetworkX and DiGraph Compatibility `_ * `Large Graph Visualization with RAPIDS cuGraph `_ + * `GTC 20 Fall - cuGraph Goes Big `_ 2021 ------ - * + * `GTC 21 - State of RAPIDS cuGraph and what's comming next `_ + + +2022 +------ + * `GTC: State of cuGraph (video & slides) `_ + * `GTC: Scaling and Validating Louvain in cuGraph against Massive Graphs (video & slides) `_ + + @@ -38,7 +52,13 @@ Media Academic Papers =============== -* S. Kang, A. Fender, J. Eaton, B. Rees: Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters. In IEEE HPEC, Sep. 2020 + + * S Kang, A. Fender, J. Eaton, B. Rees:`Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters`. In IEEE HPEC, Sep. 2020 + + * Hricik, T., Bader, D., & Green, O. (2020, September). 
`Using RAPIDS AI to accelerate graph data science workflows`. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE. + + * Richardson, B., Rees, B., Drabas, T., Oldridge, E., Bader, D. A., & Allen, R. (2020, August). Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (pp. 3503-3504). + Other BLOGS diff --git a/docs/cugraph/source/basics/cugraph_intro.md b/docs/cugraph/source/basics/cugraph_intro.md index 142395fb719..0684129503f 100644 --- a/docs/cugraph/source/basics/cugraph_intro.md +++ b/docs/cugraph/source/basics/cugraph_intro.md @@ -67,3 +67,8 @@ documentation we will mostly use the terms __Node__ and __Edge__ to better match NetworkX preferred term use, as well as other Python-based tools. At the CUDA/C layer, we favor the mathematical terms of __Vertex__ and __Edge__. +# Roadmap +GitHub does not provide a robust project management interface, and so a roadmap turns into simply a projection of when work will be completed and not a complete picture of everything that needs to be done. To capture the work that requires multiple steps, issues are labels as “EPIC” and include multiple subtasks that could span multiple releases. The EPIC will be in the release where work in expected to be completed. A better roadmap is being worked an image of the roadmap will be posted when ready. 
+ + * GitHub Project Board: https://github.com/rapidsai/cugraph/projects/28 + \ No newline at end of file diff --git a/img/Scaling.png b/img/Scaling.png new file mode 100644 index 00000000000..484c26ce293 Binary files /dev/null and b/img/Scaling.png differ diff --git a/img/cugraph-stack.png b/img/cugraph-stack.png new file mode 100644 index 00000000000..4224fa90d83 Binary files /dev/null and b/img/cugraph-stack.png differ diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index 60aebaf19b0..7e60315ffb5 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,7 @@ # limitations under the License. from .link_analysis.pagerank import pagerank +from .link_analysis.hits import hits from .traversal.bfs import bfs from .traversal.sssp import sssp from .common.read_utils import get_chunksize diff --git a/python/cugraph/cugraph/dask/link_analysis/hits.py b/python/cugraph/cugraph/dask/link_analysis/hits.py new file mode 100644 index 00000000000..6e54e314246 --- /dev/null +++ b/python/cugraph/cugraph/dask/link_analysis/hits.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from dask.distributed import wait, default_client +from cugraph.dask.common.input_utils import get_distributed_data + +import cugraph.comms.comms as Comms +import dask_cudf +import cudf + +from pylibcugraph.experimental import (ResourceHandle, + GraphProperties, + MGGraph, + hits as pylibcugraph_hits + ) + + +def call_hits(sID, + data, + src_col_name, + dst_col_name, + graph_properties, + store_transposed, + num_edges, + do_expensive_check, + tolerance, + max_iter, + initial_hubs_guess_vertices, + initial_hubs_guess_value, + normalized): + + handle = Comms.get_handle(sID) + h = ResourceHandle(handle.getHandle()) + srcs = data[0][src_col_name] + dsts = data[0][dst_col_name] + weights = None + if "value" in data[0].columns: + weights = data[0]['value'] + + mg = MGGraph(h, + graph_properties, + srcs, + dsts, + weights, + store_transposed, + num_edges, + do_expensive_check) + + result = pylibcugraph_hits(h, + mg, + tolerance, + max_iter, + initial_hubs_guess_vertices, + initial_hubs_guess_value, + normalized, + do_expensive_check) + + return result + + +def convert_to_cudf(cp_arrays): + """ + create a cudf DataFrame from cupy arrays + """ + cupy_vertices, cupy_hubs, cupy_authorities = cp_arrays + df = cudf.DataFrame() + df["vertex"] = cupy_vertices + df["hubs"] = cupy_hubs + df["authorities"] = cupy_authorities + return df + + +def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True): + """ + Compute HITS hubs and authorities values for each vertex + + The HITS algorithm computes two numbers for a node. Authorities + estimates the node value based on the incoming links. Hubs estimates + the node value based on outgoing links. + + Both cuGraph and networkx implementation use a 1-norm. 
+ + Parameters + ---------- + + input_graph : cugraph.Graph + cuGraph graph descriptor, should contain the connectivity information + as an edge list (edge weights are not used for this algorithm). + The adjacency list will be computed if not already present. + + tol : float, optional (default=1.0e-5) + Set the tolerance of the approximation, this parameter should be a + small magnitude value. + + max_iter : int, optional (default=100) + The maximum number of iterations before an answer is returned. + + nstart : cudf.Dataframe, optional (default=None) + The initial hubs guess vertices along with their initial hubs guess + value + + nstart['vertex'] : cudf.Series + Initial hubs guess vertices + nstart['values'] : cudf.Series + Initial hubs guess values + + normalized : bool, optional (default=True) + A flag to normalize the results + + Returns + ------- + HubsAndAuthorities : dask_cudf.DataFrame + GPU data frame containing three cudf.Series of size V: the vertex + identifiers and the corresponding hubs values and the corresponding + authorities values. + + df['vertex'] : dask_cudf.Series + Contains the vertex identifiers + df['hubs'] : dask_cudf.Series + Contains the hubs score + df['authorities'] : dask_cudf.Series + Contains the authorities score + + Examples + -------- + >>> # import cugraph.dask as dcg + >>> # ... Init a DASK Cluster + >>> # see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html + >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/.. 
+ >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv") + >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize) + >>> # dg = cugraph.Graph(directed=True) + >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', + >>> # edge_attr='value') + >>> # hits = dcg.hits(dg, max_iter = 50) + + """ + + client = default_client() + + # FIXME Still compute renumbering at this layer in case str + # vertex ID are passed + input_graph.compute_renumber_edge_list(transposed=False) + ddf = input_graph.edgelist.edgelist_df + + graph_properties = GraphProperties( + is_multigraph=False) + + store_transposed = False + do_expensive_check = False + initial_hubs_guess_vertices = None + initial_hubs_guess_values = None + + src_col_name = input_graph.renumber_map.renumbered_src_col_name + dst_col_name = input_graph.renumber_map.renumbered_dst_col_name + + # FIXME Move this call to the function creating a directed + # graph from a dask dataframe because duplicated edges need + # to be dropped + ddf = ddf.map_partitions( + lambda df: df.drop_duplicates(subset=[src_col_name, dst_col_name])) + + num_edges = len(ddf) + data = get_distributed_data(ddf) + + if nstart is not None: + initial_hubs_guess_vertices = nstart['vertex'] + initial_hubs_guess_values = nstart['values'] + + cupy_result = [client.submit(call_hits, + Comms.get_session_id(), + wf[1], + src_col_name, + dst_col_name, + graph_properties, + store_transposed, + num_edges, + do_expensive_check, + tol, + max_iter, + initial_hubs_guess_vertices, + initial_hubs_guess_values, + normalized, + workers=[wf[0]]) + for idx, wf in enumerate(data.worker_to_parts.items())] + + wait(cupy_result) + + cudf_result = [client.submit(convert_to_cudf, + cp_arrays, + workers=client.who_has( + cp_arrays)[cp_arrays.key]) + for cp_arrays in cupy_result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result) + if input_graph.renumbered: + return input_graph.unrenumber(ddf, 'vertex') + + return ddf diff --git 
a/python/cugraph/cugraph/dask/sampling/__init__.py b/python/cugraph/cugraph/dask/sampling/__init__.py new file mode 100644 index 00000000000..c7a036fda49 --- /dev/null +++ b/python/cugraph/cugraph/dask/sampling/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py new file mode 100644 index 00000000000..b7e842c6f31 --- /dev/null +++ b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py @@ -0,0 +1,187 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy +from dask.distributed import wait, default_client + +import dask_cudf +import cudf + +from pylibcugraph.experimental import (MGGraph, + ResourceHandle, + GraphProperties, + uniform_neighborhood_sampling, + ) +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.comms import comms as Comms + + +def call_nbr_sampling(sID, + data, + src_col_name, + dst_col_name, + num_edges, + do_expensive_check, + start_list, + info_list, + h_fan_out, + with_replacement): + + # Preparation for graph creation + handle = Comms.get_handle(sID) + handle = ResourceHandle(handle.getHandle()) + graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False) + srcs = data[0][src_col_name] + dsts = data[0][dst_col_name] + weights = None + if "value" in data[0].columns: + weights = data[0]['value'] + + store_transposed = False + + mg = MGGraph(handle, + graph_properties, + srcs, + dsts, + weights, + store_transposed, + num_edges, + do_expensive_check) + + ret_val = uniform_neighborhood_sampling(handle, + mg, + start_list, + info_list, + h_fan_out, + with_replacement, + do_expensive_check) + return ret_val + + +def convert_to_cudf(cp_arrays): + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + cupy_sources, cupy_destinations, cupy_labels, cupy_indices = cp_arrays + # cupy_sources, cupy_destinations, cupy_labels, cupy_indices, + # cupy_counts = cp_arrays + df = cudf.DataFrame() + df["sources"] = cupy_sources + df["destinations"] = cupy_destinations + df["labels"] = cupy_labels + df["indices"] = cupy_indices + # df["counts"] = cupy_counts + return df + + +def EXPERIMENTAL__uniform_neighborhood(input_graph, + start_info_list, + fanout_vals, + with_replacement=True): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. 
+ + Parameters + ---------- + input_graph : cugraph.DiGraph + cuGraph graph, which contains connectivity information as dask cudf + edge list dataframe + + start_info_list : tuple of list or cudf.Series + Tuple of a list of starting vertices for sampling, along with a + corresponding list of label for reorganizing results after sending + the input to different callers. + + fanout_vals : list + List of branching out (fan-out) degrees per starting vertex for each + hop level. + + with_replacement: bool, optional (default=True) + Flag to specify if the random sampling is done with replacement + + Returns + ------- + result : dask_cudf.DataFrame + GPU data frame containing two dask_cudf.Series + + ddf['sources']: dask_cudf.Series + Contains the source vertices from the sampling result + ddf['destinations']: dask_cudf.Series + Contains the destination vertices from the sampling result + ddf['labels']: dask_cudf.Series + Contains the start labels from the sampling result + ddf['indices']: dask_cudf.Series + Contains the indices from the sampling result for path + reconstruction + """ + # Initialize dask client + client = default_client() + # Important for handling renumbering + input_graph.compute_renumber_edge_list(transposed=False) + + start_list, info_list = start_info_list + + if isinstance(start_list, list): + start_list = cudf.Series(start_list) + if isinstance(info_list, list): + info_list = cudf.Series(info_list) + # fanout_vals must be a host array! + # FIXME: ensure other sequence types (eg. cudf Series) can be handled. 
+ if isinstance(fanout_vals, list): + fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + else: + raise TypeError("fanout_vals must be a list, " + f"got: {type(fanout_vals)}") + + ddf = input_graph.edgelist.edgelist_df + num_edges = len(ddf) + data = get_distributed_data(ddf) + + src_col_name = input_graph.renumber_map.renumbered_src_col_name + dst_col_name = input_graph.renumber_map.renumbered_dst_col_name + + # start_list uses "external" vertex IDs, but since the graph has been + # renumbered, the start vertex IDs must also be renumbered. + start_list = input_graph.lookup_internal_vertex_id(start_list).compute() + do_expensive_check = True + + result = [client.submit(call_nbr_sampling, + Comms.get_session_id(), + wf[1], + src_col_name, + dst_col_name, + num_edges, + do_expensive_check, + start_list, + info_list, + fanout_vals, + with_replacement, + workers=[wf[0]]) + for idx, wf in enumerate(data.worker_to_parts.items())] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, + cp_arrays) + for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result) + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "sources") + ddf = input_graph.unrenumber(ddf, "destinations") + + return ddf diff --git a/python/cugraph/cugraph/experimental/compat/__init__.py b/python/cugraph/cugraph/experimental/compat/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cugraph/cugraph/experimental/compat/nx/DiGraph.py b/python/cugraph/cugraph/experimental/compat/nx/DiGraph.py new file mode 100644 index 00000000000..64eabb4b318 --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/DiGraph.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import networkx as nx + + +class DiGraph(nx.DiGraph): + """ + Class which extends NetworkX DiGraph class. It provides original + NetworkX functionality and will be overridden as this compatibility + layer moves functionality to gpus in future releases. + """ + pass diff --git a/python/cugraph/cugraph/experimental/compat/nx/Graph.py b/python/cugraph/cugraph/experimental/compat/nx/Graph.py new file mode 100644 index 00000000000..7e14de21581 --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/Graph.py @@ -0,0 +1,23 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import networkx as nx + + +class Graph(nx.Graph): + """ + Class which extends NetworkX Graph class. It provides original + NetworkX functionality and will be overridden as this compatibility + layer moves functionality to gpus in future releases. 
+ """ + pass diff --git a/python/cugraph/cugraph/experimental/compat/nx/__init__.py b/python/cugraph/cugraph/experimental/compat/nx/__init__.py new file mode 100644 index 00000000000..3ec620f6d69 --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/__init__.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +from types import ModuleType +import sys + +# FIXME: only perform the NetworkX imports below if NetworkX is installed. If +# it's determined that NetworkX is required to use nx compat, then the contents +# of this entire namespace may have to be optional, or packaged separately with +# a hard dependency on NetworkX. + +# Start by populating this namespace with the same contents as +# networkx/__init__.py +from networkx import * + +# Override the individual NetworkX objects loaded above with the cugraph.nx +# compat equivalents. This means if an equivalent compat obj is not available, +# the standard NetworkX obj will be used. +# +# Each cugraph obj should have the same module path as the +# NetworkX obj it isoverriding, and the submodules along the hierarchy should +# each import the same sub objects/modules as NetworkX does. For example, +# in NetworkX, "pagerank" is a function in +# "networkx/algorithms/link_analysis/pagerank_alg.py", and is +# directly imported in the namespaces "networkx.algorithms.link_analysis", +# "networkx.algorithms", and "networkx". 
Therefore, the cugraph +# compat pagerank should be defined in a module of the same name and +# also be present in the same namespaces. +# Refer to the networkx __init__.py files when adding new overriding +# modules to ensure the same paths and used and namespaces are populated. +from cugraph.experimental.compat.nx import algorithms +from cugraph.experimental.compat.nx.algorithms import * + +from cugraph.experimental.compat.nx.algorithms import link_analysis +from cugraph.experimental.compat.nx.algorithms.link_analysis import * + +# Recursively import all of the NetworkX modules into equivalent submodules +# under this package. The above "from networkx import *" handles names in this +# namespace, but it will not create the equivalent networkx submodule +# hierarchy. For example, a user could expect to "import cugraph.nx.drawing", +# which should simply redirect to "networkx.drawing". +# +# This can be accomplished by updating sys.modules with the import path and +# module object of each NetworkX submodule in the NetworkX package hierarchy, +# but only for module paths that have not been added yet (otherwise this would +# overwrite the overides above). +_visited = set() + + +def _import_submodules_recursively(obj, mod_path): + # Since modules can freely import any other modules, immediately mark this + # obj as visited so submodules that import it are not re-examined + # infinitely. + _visited.add(obj) + for name in dir(obj): + sub_obj = getattr(obj, name) + + if type(sub_obj) is ModuleType: + sub_mod_path = f"{mod_path}.{name}" + # Do not overwrite modules that are already present, such as those + # intended to override which were imported separately above. 
+ if sub_mod_path not in sys.modules: + sys.modules[sub_mod_path] = sub_obj + if sub_obj not in _visited: + _import_submodules_recursively(sub_obj, sub_mod_path) + + +_import_submodules_recursively( + + + importlib.import_module("networkx"), __name__) + +del _visited +del _import_submodules_recursively + +# At this point, individual types that cugraph.nx are overriding +# could be used to override the corresponding types *inside* the +# networkx modules imported above. For example, the networkx graph generators +# will still return networkx.Graph objects instead of cugraph.nx.Graph +# objects (unless the user knows to pass a "create_using" arg, if available). +# For specific overrides, assignments could be made in the imported +# a networkx modules so cugraph.nx types are used by default. +# NOTE: this has the side-effect of causing all networkx +# imports in this python process/interpreter to use the override (ie. the user +# won't be able to use the original networkx types, +# even from a networkx import) diff --git a/python/cugraph/cugraph/experimental/compat/nx/algorithms/__init__.py b/python/cugraph/cugraph/experimental/compat/nx/algorithms/__init__.py new file mode 100644 index 00000000000..caebb9cd546 --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/algorithms/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from networkx.algorithms import * +from cugraph.experimental.compat.nx.algorithms.link_analysis import * +from cugraph.experimental.compat.nx.algorithms import link_analysis \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/__init__.py b/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/__init__.py new file mode 100644 index 00000000000..bc5bc533ee1 --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from networkx.algorithms.link_analysis import * +from cugraph.experimental.compat.nx.algorithms.link_analysis.pagerank_alg import * \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/pagerank_alg.py b/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/pagerank_alg.py new file mode 100644 index 00000000000..4ffe01aadce --- /dev/null +++ b/python/cugraph/cugraph/experimental/compat/nx/algorithms/link_analysis/pagerank_alg.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cugraph +import cugraph.utilities +import cudf +import numpy as np + + +def create_cudf_from_dict(dict_in): + """ + converts python dictionary to a cudf.Dataframe as needed by this + cugraph pagerank call. + + Parameters + ---------- + dictionary with node ids(key) and values + + Returns + ------- + a cudf DataFrame of (vertex)ids and values. + """ + if not(isinstance(dict_in, dict)): + raise TypeError("type_name must be a dict, got: " + f"{type(dict_in)}") + # FIXME: Looking to replacing fromiter with rename and + # compare performance + k = np.fromiter(dict_in.keys(), dtype="int32") + v = np.fromiter(dict_in.values(), dtype="float32") + df = cudf.DataFrame({"vertex": k, "values": v}) + return df + + +def pagerank( + G, + alpha=0.85, + personalization=None, + max_iter=100, + tol=1.0e-6, + nstart=None, + weight="weight", + dangling=None): + + """ + Calls the cugraph pagerank algorithm taking in a networkX object. + In future releases it will maintain compatibility but will migrate more + of the workflow to the GPU. + + Parameters + ---------- + G : networkx.Graph + + alpha : float, optional (default=0.85) + The damping factor alpha represents the probability to follow an + outgoing edge, standard value is 0.85. + Thus, 1.0-alpha is the probability to “teleport” to a random vertex. + Alpha should be greater than 0.0 and strictly lower than 1.0. + + personalization : dictionary, optional (default=None) + dictionary comes from networkx is converted to a dataframe + containing the personalization information. 
+ + max_iter : int, optional (default=100) + The maximum number of iterations before an answer is returned. This can + be used to limit the execution time and do an early exit before the + solver reaches the convergence tolerance. + If this value is lower or equal to 0 cuGraph will use the default + value, which is 100. + + tol : float, optional (default=1e-05) + Set the tolerance the approximation, this parameter should be a small + magnitude value. + The lower the tolerance the better the approximation. If this value is + 0.0f, cuGraph will use the default value which is 1.0E-5. + Setting too small a tolerance can lead to non-convergence due to + numerical roundoff. Usually values between 0.01 and 0.00001 are + acceptable. + + nstart : dictionary, optional (default=None) + dictionary containing the initial guess vertex and value for pagerank. + Will be converted to a Dataframe before calling the cugraph algorithm + nstart['vertex'] : cudf.Series + Subset of vertices of graph for initial guess for pagerank values + nstart['values'] : cudf.Series + Pagerank values for vertices + + weight: str, optional (default=None) + This parameter is here for NetworkX compatibility and not + yet supported in this algorithm + + dangling : dict, optional (default=None) + This parameter is here for NetworkX compatibility and ignored + + Returns + ------- + PageRank : dictionary + A dictionary of nodes with the PageRank as value + + """ + local_pers = None + local_nstart = None + if (personalization is not None): + local_pers = create_cudf_from_dict(personalization) + if (nstart is not None): + local_nstart = create_cudf_from_dict(nstart) + return cugraph.pagerank( + G, + alpha, + local_pers, + max_iter, + tol, + local_nstart, + weight, + dangling) diff --git a/python/cugraph/cugraph/experimental/dask/__init__.py b/python/cugraph/cugraph/experimental/dask/__init__.py new file mode 100644 index 00000000000..059df21d487 --- /dev/null +++ 
b/python/cugraph/cugraph/experimental/dask/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.utilities.api_tools import experimental_warning_wrapper + +from cugraph.dask.sampling.neighborhood_sampling import \ + EXPERIMENTAL__uniform_neighborhood +uniform_neighborhood_sampling = \ + experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 86ad21271fa..1b4d7aa707d 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -11,15 +11,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pylibcugraph -import cudf +from pylibcugraph.experimental import (ResourceHandle, + GraphProperties, + SGGraph, + node2vec as pylibcugraph_node2vec, + ) from cugraph.utilities import ensure_cugraph_obj_for_nx +import cudf + def node2vec(G, start_vertices, max_depth=None, - use_padding=False, + compress_result=True, p=1.0, q=1.0): """ @@ -42,13 +47,14 @@ def node2vec(G, start_vertices: int or list or cudf.Series or cudf.DataFrame A single node or a list or a cudf.Series of nodes from which to run the random walks. In case of multi-column vertices it should be - a cudf.DataFrame + a cudf.DataFrame. Only supports int32 currently. 
max_depth: int The maximum depth of the random walks - use_padding: bool, optional (default=False) - If True, padded paths are returned else coalesced paths are returned + compress_result: bool, optional (default=True) + If True, coalesced paths are returned with a sizes array with offsets. + Otherwise padded paths are returned with an empty sizes array. p: float, optional (default=1.0, [0 < p]) Return factor, which represents the likelihood of backtracking to @@ -81,7 +87,7 @@ def node2vec(G, ... dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2') - >>> start_vertices = cudf.Series([0, 2]) + >>> start_vertices = cudf.Series([0, 2], dtype=np.int32) >>> paths, weights, path_sizes = cugraph.node2vec(G, start_vertices, 3, ... True, 0.8, 0.5) @@ -89,8 +95,9 @@ def node2vec(G, if (not isinstance(max_depth, int)) or (max_depth < 1): raise ValueError(f"'max_depth' must be a positive integer, \ got: {max_depth}") - if (not isinstance(use_padding, bool)): - raise ValueError(f"'use_padding' must be a bool, got: {use_padding}") + if (not isinstance(compress_result, bool)): + raise ValueError(f"'compress_result' must be a bool, \ + got: {compress_result}") if (not isinstance(p, float)) or (p <= 0.0): raise ValueError(f"'p' must be a positive float, got: {p}") if (not isinstance(q, float)) or (q <= 0.0): @@ -103,6 +110,9 @@ def node2vec(G, if isinstance(start_vertices, list): start_vertices = cudf.Series(start_vertices) + if start_vertices.dtype != 'int32': + raise ValueError(f"'start_vertices' must have int32 values, \ + got: {start_vertices.dtype}") if G.renumbered is True: if isinstance(start_vertices, cudf.DataFrame): @@ -115,24 +125,23 @@ def node2vec(G, dsts = G.edgelist.edgelist_df['dst'] weights = G.edgelist.edgelist_df['weights'] - resource_handle = pylibcugraph.experimental.ResourceHandle() - graph_props = pylibcugraph.experimental.GraphProperties( - 
is_multigraph=G.is_multigraph()) + if srcs.dtype != 'int32': + raise ValueError(f"Graph vertices must have int32 values, \ + got: {srcs.dtype}") + + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_multigraph=G.is_multigraph()) store_transposed = False renumber = False do_expensive_check = False - # FIXME: If input graph is not renumbered, then SGGraph creation - # causes incorrect vertices to be returned when computing pylib - # version of node2vec - sg = pylibcugraph.experimental.SGGraph(resource_handle, graph_props, - srcs, dsts, weights, - store_transposed, renumber, - do_expensive_check) - - vertex_set, edge_set, sizes = pylibcugraph.experimental.node2vec( - resource_handle, sg, start_vertices, - max_depth, use_padding, p, q) + sg = SGGraph(resource_handle, graph_props, srcs, dsts, weights, + store_transposed, renumber, do_expensive_check) + + vertex_set, edge_set, sizes = \ + pylibcugraph_node2vec(resource_handle, sg, start_vertices, + max_depth, compress_result, p, q) + vertex_set = cudf.Series(vertex_set) edge_set = cudf.Series(edge_set) sizes = cudf.Series(sizes) @@ -142,11 +151,4 @@ def node2vec(G, df_['vertex_set'] = vertex_set df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True) vertex_set = cudf.Series(df_['vertex_set']) - - if use_padding: - edge_set_sz = (max_depth - 1) * len(start_vertices) - return vertex_set, edge_set[:edge_set_sz], sizes - - vertex_set_sz = vertex_set.sum() - edge_set_sz = vertex_set_sz - len(start_vertices) - return vertex_set[:vertex_set_sz], edge_set[:edge_set_sz], sizes + return vertex_set, edge_set, sizes diff --git a/python/cugraph/cugraph/tests/dask/test_mg_hits.py b/python/cugraph/cugraph/tests/dask/test_mg_hits.py new file mode 100644 index 00000000000..3cff17e62c8 --- /dev/null +++ b/python/cugraph/cugraph/tests/dask/test_mg_hits.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cugraph.dask as dcg +import gc +import pytest +import cugraph +import dask_cudf +from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.tests import utils + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = utils.DATASETS_UNDIRECTED + \ + [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"] + +fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"), + ([50], "max_iter"), + ([1.0e-6], "tol"), + ) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. + """ + parameters = dict(zip(("graph_file", "max_iter", "tol"), request.param)) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the inputs and expected results from the HITS algo. + (based on cuGraph HITS) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + + G = utils.generate_cugraph_graph_from_file( + input_data_path) + sg_cugraph_hits = cugraph.hits( + G, + input_combo["max_iter"], + input_combo["tol"]) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. + sg_cugraph_hits = sg_cugraph_hits.sort_values( + "vertex").reset_index(drop=True) + + input_combo["sg_cugraph_results"] = sg_cugraph_hits + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=True) + dg.from_dask_cudf_edgelist( + ddf, source='src', destination='dst', edge_attr='value', renumber=True) + + input_combo["MGGraph"] = dg + + return input_combo + + +# ============================================================================= +# Tests +# ============================================================================= + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_dask_hits(dask_client, benchmark, input_expected_output): + + dg = input_expected_output["MGGraph"] + + result_hits = benchmark(dcg.hits, + dg, + input_expected_output["tol"], + input_expected_output["max_iter"]) + + result_hits = result_hits.compute().sort_values( + "vertex").reset_index(drop=True).rename(columns={ + "hubs": "mg_cugraph_hubs", "authorities": "mg_cugraph_authorities"} + ) + + expected_output = input_expected_output["sg_cugraph_results"].sort_values( + "vertex").reset_index(drop=True) + + # Update the dask cugraph HITS results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_hits["sg_cugraph_hubs"] = expected_output['hubs'] + result_hits["sg_cugraph_authorities"] = expected_output["authorities"] + + hubs_diffs1 = result_hits.query( + 'mg_cugraph_hubs - sg_cugraph_hubs > 0.00001') + hubs_diffs2 = result_hits.query( + 'mg_cugraph_hubs - sg_cugraph_hubs < -0.00001') + authorities_diffs1 = result_hits.query( + 'mg_cugraph_authorities - sg_cugraph_authorities > 0.0001') + authorities_diffs2 = result_hits.query( + 'mg_cugraph_authorities - sg_cugraph_authorities < -0.0001') + + assert len(hubs_diffs1) == 0 + assert len(hubs_diffs2) == 0 + assert len(authorities_diffs1) == 0 + assert len(authorities_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/dask/test_mg_neighborhood_sampling.py b/python/cugraph/cugraph/tests/dask/test_mg_neighborhood_sampling.py new file mode 100644 index 00000000000..25838445e11 --- /dev/null +++ b/python/cugraph/cugraph/tests/dask/test_mg_neighborhood_sampling.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import pytest +import cugraph.dask as dcg +import cugraph +import dask_cudf +import cudf +from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.tests import utils + + +# ============================================================================= +# Test helpers +# ============================================================================= +def setup_function(): + gc.collect() + + +# datasets = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"karate.csv" +datasets = utils.DATASETS_SMALL +fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file")) + + +def _get_param_args(param_name, param_values): + """ + Returns a tuple of (, ) which can be applied + as the args to pytest.mark.parametrize(). The pytest.param list also + contains param id string formed from the param name and values. + """ + return (param_name, + [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_mg_neighborhood_sampling_simple(dask_client): + + from cugraph.experimental.dask import uniform_neighborhood_sampling + + df = cudf.DataFrame({"src": cudf.Series([0, 1, 1, 2, 2, 2, 3, 4], + dtype="int32"), + "dst": cudf.Series([1, 3, 4, 0, 1, 3, 5, 5], + dtype="int32"), + "value": cudf.Series([0.1, 2.1, 1.1, 5.1, 3.1, + 4.1, 7.2, 3.2], + dtype="float32"), + }) + ddf = dask_cudf.from_cudf(df, npartitions=2) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(ddf, "src", "dst", "value") + + # TODO: Incomplete, include more testing for tree graph as well as + # for larger graphs + start_list = cudf.Series([0, 1], dtype="int32") + info_list = cudf.Series([0, 0], dtype="int32") + fanout_vals = [1, 1] + with_replacement = True + result_nbr = uniform_neighborhood_sampling(G, + (start_list, info_list), + fanout_vals, + with_replacement) + result_nbr = result_nbr.compute() + + # Since the validity of results have (probably) been tested at botht he C++ 
+ # and C layers, simply test that the python interface and conversions were + # done correctly. + assert result_nbr['sources'].dtype == "int32" + assert result_nbr['destinations'].dtype == "int32" + assert result_nbr['labels'].dtype == "int32" + assert result_nbr['indices'].dtype == "int32" + + # ALl labels should be 0 or 1 + assert result_nbr['labels'].isin([0, 1]).all() + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_mg_neighborhood_sampling_tree(dask_client): + + from cugraph.experimental.dask import uniform_neighborhood_sampling + + input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / + "small_tree.csv").as_posix() + chunksize = dcg.get_chunksize(input_data_path) + + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(ddf, "src", "dst", "value") + + # TODO: Incomplete, include more testing for tree graph as well as + # for larger graphs + start_list = cudf.Series([0, 0], dtype="int32") + info_list = cudf.Series([0, 0], dtype="int32") + fanout_vals = [4, 1, 3] + with_replacement = True + result_nbr = uniform_neighborhood_sampling(G, + (start_list, info_list), + fanout_vals, + with_replacement) + result_nbr = result_nbr.compute() + + # Since the validity of results have (probably) been tested at botht he C++ + # and C layers, simply test that the python interface and conversions were + # done correctly. 
+ assert result_nbr['sources'].dtype == "int32" + assert result_nbr['destinations'].dtype == "int32" + assert result_nbr['labels'].dtype == "int32" + assert result_nbr['indices'].dtype == "int32" + + # All labels should be 0 + assert (result_nbr['labels'] == 0).all() diff --git a/python/cugraph/cugraph/tests/dask/test_mg_replication.py b/python/cugraph/cugraph/tests/dask/test_mg_replication.py index 462b0bda184..b5800c854ef 100644 --- a/python/cugraph/cugraph/tests/dask/test_mg_replication.py +++ b/python/cugraph/cugraph/tests/dask/test_mg_replication.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -24,16 +24,13 @@ DATASETS_OPTIONS = utils.DATASETS_SMALL DIRECTED_GRAPH_OPTIONS = [False, True] -# FIXME: The "preset_gpu_count" from 21.08 and below are not supported and have -# been removed @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) def test_replicate_cudf_dataframe_with_weights( input_data_path, dask_client ): @@ -54,8 +51,7 @@ def test_replicate_cudf_dataframe_with_weights( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client): gc.collect() df = cudf.read_csv( @@ -74,8 +70,7 @@ def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client): is_single_gpu(), reason="skipping MG testing on Single GPU system" ) 
@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) def test_replicate_cudf_series(input_data_path, dask_client): gc.collect() df = cudf.read_csv( @@ -98,8 +93,7 @@ def test_replicate_cudf_series(input_data_path, dask_client): @pytest.mark.skip(reason="no way of currently testing this") @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_no_context(graph_file, directed): gc.collect() @@ -113,8 +107,7 @@ def test_enable_batch_no_context(graph_file, directed): is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_no_context_view_adj( graph_file, directed, dask_client @@ -129,8 +122,7 @@ def test_enable_batch_no_context_view_adj( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_context_then_views( graph_file, directed, dask_client @@ -156,8 +148,7 @@ def test_enable_batch_context_then_views( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def 
test_enable_batch_view_then_context(graph_file, directed, dask_client): gc.collect() @@ -185,8 +176,7 @@ def test_enable_batch_view_then_context(graph_file, directed, dask_client): is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_context_no_context_views( graph_file, directed, dask_client @@ -208,8 +198,7 @@ def test_enable_batch_context_no_context_views( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_edgelist_replication( graph_file, directed, dask_client @@ -227,8 +216,7 @@ def test_enable_batch_edgelist_replication( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_adjlist_replication_weights( graph_file, directed, dask_client @@ -240,7 +228,7 @@ def test_enable_batch_adjlist_replication_weights( names=["src", "dst", "value"], dtype=["int32", "int32", "float32"], ) - G = cugraph.DiGraph() if directed else cugraph.Graph() + G = cugraph.Graph(directed=directed) G.from_cudf_edgelist( df, source="src", destination="dst", edge_attr="value" ) @@ -261,8 +249,7 @@ def test_enable_batch_adjlist_replication_weights( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, - 
ids=[f"dataset={d.as_posix()}" - for d in DATASETS_OPTIONS]) + ids=[f"dataset={d}" for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_enable_batch_adjlist_replication_no_weights( graph_file, directed, dask_client @@ -274,7 +261,7 @@ def test_enable_batch_adjlist_replication_no_weights( names=["src", "dst"], dtype=["int32", "int32"], ) - G = cugraph.DiGraph() if directed else cugraph.Graph() + G = cugraph.Graph(directed=directed) G.from_cudf_edgelist(df, source="src", destination="dst") G.enable_batch() G.view_adj_list() diff --git a/python/cugraph/cugraph/tests/test_compat_algo.py b/python/cugraph/cugraph/tests/test_compat_algo.py new file mode 100644 index 00000000000..2c2ae9f0ef4 --- /dev/null +++ b/python/cugraph/cugraph/tests/test_compat_algo.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cugraph.experimental.compat.nx as nx + + +def test_connectivity(): + # Tests a run of a native nx algorithm that hasn't been overridden.
+ expected = [{1, 2, 3, 4, 5}, {8, 9, 7}] + G = nx.Graph() + G.add_edges_from([(1, 2), (2, 3), (3, 4), (4, 5)]) + G.add_edges_from([(7, 8), (8, 9), (7, 9)]) + assert list(nx.connected_components(G)) == expected + + +def test_pagerank_result_type(): + G = nx.DiGraph() + [G.add_node(k) for k in ["A", "B", "C", "D", "E", "F", "G"]] + G.add_edges_from([('G', 'A'), ('A', 'G'), ('B', 'A'), + ('C', 'A'), ('A', 'C'), ('A', 'D'), + ('E', 'A'), ('F', 'A'), ('D', 'B'), + ('D', 'F')]) + ppr1 = nx.pagerank(G) + # This just tests that the right type is returned. + assert isinstance(ppr1, dict) diff --git a/python/cugraph/cugraph/tests/test_compat_pr.py b/python/cugraph/cugraph/tests/test_compat_pr.py new file mode 100644 index 00000000000..d7bafe518d5 --- /dev/null +++ b/python/cugraph/cugraph/tests/test_compat_pr.py @@ -0,0 +1,254 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Temporarily suppress warnings till networkX fixes deprecation warnings +# (Using or importing the ABCs from 'collections' instead of from +# 'collections.abc' is deprecated, and in 3.8 it will stop working) for +# python 3.7. Also, this import networkx needs to be relocated in the +# third-party group once this gets fixed. 
+import pytest +from cugraph.tests import utils +import numpy as np +import gc +import importlib + + +MAX_ITERATIONS = [100, 200] +TOLERANCE = [1.0e-06] +ALPHA = [0.85, 0.70] +PERS_PERCENT = [0, 15] +HAS_GUESS = [0, 1] + +FILES_UNDIRECTED = [ + utils.RAPIDS_DATASET_ROOT_DIR_PATH/"karate.csv" +] + +# these are only used in the missing parameter tests. +KARATE_RANKING = [11, 9, 14, 15, 18, 20, 22, + 17, 21, 12, 26, 16, 28, 19] + +KARATE_PERS_RANKING = [11, 16, 17, 21, 4, 10, 5, + 6, 12, 7, 9, 24, 19, 25] + +KARATE_ITER_RANKINGS = [11, 9, 14, 15, 18, 20, + 22, 17, 21, 12, 26, 16, + 28, 19] + +KARATE_NSTART_RANKINGS = [11, 9, 14, 15, 18, 20, + 22, 17, 21, 12, 26, 16, + 28, 19] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= +def setup_function(): + gc.collect() + + +datasets = FILES_UNDIRECTED +fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"), + (MAX_ITERATIONS, "max_iter"), + (TOLERANCE, "tol"), + (PERS_PERCENT, "pers_percent"), + (HAS_GUESS, "has_guess"), + ) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. + """ + parameters = dict(zip(("graph_file", + "max_iter", + "tol", + "pers_percent", + "has_guess"), + request.param)) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the expected results from the pagerank algorithm. 
+ """ + import networkx + + M = utils.read_csv_for_nx(input_combo["graph_file"]) + + Gnx = networkx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=networkx.DiGraph() + ) + nnz_vtx = np.unique(M[['0', '1']]) + personalization = get_personalization(input_combo["pers_percent"], + nnz_vtx) + input_combo["nstart"] = None + nstart = None + if (input_combo["has_guess"] == 1): + z = {k: 1.0 / Gnx.number_of_nodes() for k in Gnx.nodes()} + input_combo["nstart"] = z + nstart = z + + pr = networkx.pagerank(Gnx, + max_iter=input_combo["max_iter"], + tol=input_combo["tol"], + personalization=personalization, + nstart=nstart) + input_combo["personalization"] = personalization + input_combo["nx_pr_rankings"] = pr + return input_combo + + +@pytest.fixture(scope="module", params=['networkx', 'nxcompat']) +def which_import(request): + if (request.param == 'networkx'): + return importlib.import_module("networkx") + if (request.param == 'nxcompat'): + return importlib.import_module("cugraph.experimental.compat.nx") + + +# The function selects personalization_perc% of accessible vertices in graph M +# and randomly assigns them personalization values +def get_personalization(personalization_perc, nnz_vtx): + personalization = None + if personalization_perc != 0: + personalization = {} + personalization_count = int( + (nnz_vtx.size * personalization_perc) / 100.0) + nnz_vtx = np.random.choice(nnz_vtx, + min(nnz_vtx.size, + personalization_count), + replace=False) + + nnz_val = np.random.random(nnz_vtx.size) + nnz_val = nnz_val / sum(nnz_val) + for vtx, val in zip(nnz_vtx, nnz_val): + personalization[vtx] = val + return personalization + + +@pytest.mark.parametrize("graph_file", FILES_UNDIRECTED) +def test_with_noparams(graph_file, which_import): + nx = which_import + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + pr = nx.pagerank(Gnx) + + # 
Rounding issues show up in runs but this tests that the + # cugraph and networkx algorithms are being correctly called. + assert(sorted(pr, key=pr.get)[:14]) == KARATE_RANKING + + +@pytest.mark.parametrize("graph_file", FILES_UNDIRECTED) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +def test_with_max_iter(graph_file, max_iter, which_import): + nx = which_import + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + pr = nx.pagerank(Gnx, max_iter=max_iter) + # Rounding issues show up in runs but this tests that the + # cugraph and networkx algorithms are being correctly called. + assert(sorted(pr, key=pr.get)[:14]) == KARATE_ITER_RANKINGS + + +@pytest.mark.parametrize("graph_file", FILES_UNDIRECTED) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +def test_perc_spec(graph_file, max_iter, which_import): + nx = which_import + + # simple personalization to validate running + personalization = { + 20: 0.7237260913723357, + 12: 0.03952608674390543, + 22: 0.2367478218837589 + } + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + + # NetworkX PageRank + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist(M, + source="0", + target="1", + edge_attr="weight", + create_using=nx.DiGraph()) + # uses the same personalization for each imported package + + pr = nx.pagerank( + Gnx, max_iter=max_iter, + personalization=personalization + ) + + # Rounding issues show up in runs but this tests that the + # cugraph and networkx algorithms are being correctly called.
+ assert(sorted(pr, key=pr.get)[:14]) == KARATE_PERS_RANKING + + +@pytest.mark.parametrize("graph_file", FILES_UNDIRECTED) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +def test_with_nstart(graph_file, max_iter, which_import): + nx = which_import + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + + z = {k: 1.0 / Gnx.number_of_nodes() for k in Gnx.nodes()} + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + pr = nx.pagerank(Gnx, max_iter=max_iter, nstart=z) + + # Rounding issues show up in runs but this tests that the + # cugraph and networkx algorithms are being correctly called. + assert(sorted(pr, key=pr.get)[:14]) == KARATE_NSTART_RANKINGS + + +def test_fixture_data(input_expected_output, which_import): + nx = which_import + M = utils.read_csv_for_nx(input_expected_output["graph_file"]) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + pr = nx.pagerank(Gnx, + max_iter=input_expected_output["max_iter"], + tol=input_expected_output["tol"], + personalization=input_expected_output["personalization"], + nstart=input_expected_output["nstart"]) + actual = sorted(pr.items()) + expected = sorted(input_expected_output["nx_pr_rankings"].items()) + assert all([a == pytest.approx(b, abs=1.0e-04) + for a, b in zip(actual, expected)]) diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 114ced7666f..e897c5fd0a3 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -18,14 +18,16 @@ from cugraph.tests import utils import cugraph +import cudf # ============================================================================= # Parameters # 
============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] -DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] -KARATE = DATASETS_SMALL[0][0][0] +COMPRESSED = [False, True] +LINE = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"small_line.csv" +KARATE = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"karate.csv" # ============================================================================= @@ -35,10 +37,20 @@ def setup_function(): gc.collect() +def _get_param_args(param_name, param_values): + """ + Returns a tuple of (, ) which can be applied + as the args to pytest.mark.parametrize(). The pytest.param list also + contains param id string formed from the param name and values. + """ + return (param_name, + [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) + + def calc_node2vec(G, start_vertices, - max_depth=None, - use_padding=False, + max_depth, + compress_result, p=1.0, q=1.0): """ @@ -52,7 +64,7 @@ def calc_node2vec(G, max_depth : int - use_padding : bool + compress_result : bool p : float @@ -61,125 +73,215 @@ def calc_node2vec(G, assert G is not None vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec( - G, start_vertices, max_depth, use_padding, p, q) + G, start_vertices, max_depth, compress_result, p, q) return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_coalesced( - graph_file, - directed -): - G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, - edgevals=True) - k = random.randint(1, 10) - max_depth = 3 - start_vertices = random.sample(range(G.number_of_vertices()), k) - df, seeds = calc_node2vec( - G, - start_vertices, - max_depth, - use_padding=False, - p=0.8, - q=0.5 - ) - vertex_paths, edge_weights, vertex_path_sizes = df - # Check that output sizes are as expected - assert vertex_paths.size 
== max_depth * k - assert edge_weights.size == (max_depth - 1) * k - # Check that weights match up with paths - err = 0 - for i in range(k): - for j in range(max_depth - 1): - # weight = edge_weights[i * (max_depth - 1) + j] - u = vertex_paths[i * max_depth + j] - v = vertex_paths[i * max_depth + j + 1] - # Walk not found in edgelist - if (not G.has_edge(u, v)): - err += 1 - # FIXME: Checking weights is buggy - # Corresponding weight to edge is not correct - # expr = "(src == {} and dst == {})".format(u, v) - # if not (G.edgelist.edgelist_df.query(expr)["weights"] == weight): - # err += 1 - assert err == 0 - - -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_padded( - graph_file, - directed -): - G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, - edgevals=True) - k = random.randint(1, 10) - max_depth = 3 - start_vertices = random.sample(range(G.number_of_vertices()), k) - df, seeds = calc_node2vec( - G, - start_vertices, - max_depth, - use_padding=True, - p=0.8, - q=0.5 - ) - vertex_paths, edge_weights, vertex_path_sizes = df - # Check that output sizes are as expected - assert vertex_paths.size == max_depth * k - assert edge_weights.size == (max_depth - 1) * k - assert vertex_path_sizes.sum() == vertex_paths.size - # Check that weights match up with paths - err = 0 - path_start = 0 - for i in range(k): - for j in range(max_depth - 1): - # weight = edge_weights[i * (max_depth - 1) + j] - u = vertex_paths[i * max_depth + j] - v = vertex_paths[i * max_depth + j + 1] - # Walk not found in edgelist - if (not G.has_edge(u, v)): - err += 1 - # FIXME: Checking weights is buggy - # Corresponding weight to edge is not correct - # expr = "(src == {} and dst == {})".format(u, v) - # if not (G.edgelist.edgelist_df.query(expr)["weights"] == weight): - # err += 1 - # Check that path sizes matches up correctly with paths - if vertex_paths[i * max_depth] != 
seeds[i]: - err += 1 - path_start += vertex_path_sizes[i] - assert err == 0 - - -@pytest.mark.parametrize("graph_file", [KARATE]) +@pytest.mark.parametrize(*_get_param_args("graph_file", [KARATE])) def test_node2vec_invalid( graph_file ): G = utils.generate_cugraph_graph_from_file(graph_file, directed=True, edgevals=True) k = random.randint(1, 10) - start_vertices = random.sample(range(G.number_of_vertices()), k) - use_padding = True + start_vertices = cudf.Series(random.sample(range(G.number_of_vertices()), + k), dtype="int32") + compress = True max_depth = 1 p = 1 q = 1 invalid_max_depths = [None, -1, "1", 4.5] invalid_pqs = [None, -1, "1"] + invalid_start_vertices = [1.0, "1", 2147483648] # Tests for invalid max_depth for bad_depth in invalid_max_depths: with pytest.raises(ValueError): df, seeds = calc_node2vec(G, start_vertices, max_depth=bad_depth, - use_padding=use_padding, p=p, q=q) + compress_result=compress, p=p, q=q) # Tests for invalid p for bad_p in invalid_pqs: with pytest.raises(ValueError): df, seeds = calc_node2vec(G, start_vertices, max_depth=max_depth, - use_padding=use_padding, p=bad_p, q=q) + compress_result=compress, p=bad_p, q=q) # Tests for invalid q for bad_q in invalid_pqs: with pytest.raises(ValueError): df, seeds = calc_node2vec(G, start_vertices, max_depth=max_depth, - use_padding=use_padding, p=p, q=bad_q) + compress_result=compress, p=p, q=bad_q) + + # Tests for invalid start_vertices dtypes, modify when more types are + # supported + for bad_start in invalid_start_vertices: + with pytest.raises(ValueError): + df, seeds = calc_node2vec(G, bad_start, max_depth=max_depth, + compress_result=compress, p=p, q=q) + + +@pytest.mark.parametrize(*_get_param_args("graph_file", [LINE])) +@pytest.mark.parametrize(*_get_param_args("directed", DIRECTED_GRAPH_OPTIONS)) +def test_node2vec_line(graph_file, directed): + G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, + edgevals=True) + max_depth = 3 + start_vertices = 
cudf.Series([0, 3, 6], dtype="int32") + df, seeds = calc_node2vec( + G, + start_vertices, + max_depth, + compress_result=True, + p=0.8, + q=0.5 + ) + + +@pytest.mark.parametrize(*_get_param_args("graph_file", utils.DATASETS_SMALL)) +@pytest.mark.parametrize(*_get_param_args("directed", DIRECTED_GRAPH_OPTIONS)) +@pytest.mark.parametrize(*_get_param_args("compress", COMPRESSED)) +def test_node2vec_new( + graph_file, + directed, + compress, +): + cu_M = utils.read_csv_file(graph_file) + + G = cugraph.Graph(directed=directed) + + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2", + renumber=False) + num_verts = G.number_of_vertices() + k = random.randint(6, 12) + start_vertices = cudf.Series(random.sample(range(num_verts), k), + dtype="int32") + max_depth = 5 + result, seeds = calc_node2vec( + G, + start_vertices, + max_depth, + compress_result=compress, + p=0.8, + q=0.5 + ) + vertex_paths, edge_weights, vertex_path_sizes = result + + if compress: + # Paths are coalesced, meaning vertex_path_sizes is nonempty. It's + # necessary to use in order to track starts of paths + assert vertex_paths.size == vertex_path_sizes.sum() + if directed: + # directed graphs may be coalesced at any point + assert vertex_paths.size - k == edge_weights.size + # This part is for checking to make sure each of the edges + # in all of the paths are valid and are accurate + idx = 0 + for path_idx in range(vertex_path_sizes.size): + for _ in range(vertex_path_sizes[path_idx] - 1): + weight = edge_weights[idx] + u = vertex_paths[idx + path_idx] + v = vertex_paths[idx + path_idx + 1] + # Corresponding weight to edge is not correct + expr = "(src == {} and dst == {})".format(u, v) + edge_query = G.edgelist.edgelist_df.query(expr) + if edge_query.empty: + raise ValueError("edge_query didn't find:({},{})". 
+ format(u, v)) + else: + if edge_query["weights"].values[0] != weight: + raise ValueError("edge_query weight incorrect") + idx += 1 + + else: + # undirected graphs should never be coalesced + assert vertex_paths.size == max_depth * k + assert edge_weights.size == (max_depth - 1) * k + # This part is for checking to make sure each of the edges + # in all of the paths are valid and are accurate + for path_idx in range(k): + for idx in range(max_depth - 1): + weight = edge_weights[path_idx * (max_depth - 1) + idx] + u = vertex_paths[path_idx * max_depth + idx] + v = vertex_paths[path_idx * max_depth + idx + 1] + # Corresponding weight to edge is not correct + expr = "(src == {} and dst == {})".format(u, v) + edge_query = G.edgelist.edgelist_df.query(expr) + if edge_query.empty: + raise ValueError("edge_query didn't find:({},{})". + format(u, v)) + else: + if edge_query["weights"].values[0] != weight: + raise ValueError("edge_query weight incorrect") + else: + # Paths are padded, meaning a formula can be used to track starts of + # paths. 
Check that output sizes are as expected + assert vertex_paths.size == max_depth * k + assert edge_weights.size == (max_depth - 1) * k + assert vertex_path_sizes.size == 0 + if directed: + blanks = vertex_paths.isna() + # This part is for checking to make sure each of the edges + # in all of the paths are valid and are accurate + for i in range(k): + path_at_end, j = False, 0 + weight_idx = 0 + while not path_at_end: + src_idx = i * max_depth + j + dst_idx = i * max_depth + j + 1 + if directed: + invalid_src = blanks[src_idx] or (src_idx >= num_verts) + invalid_dst = blanks[dst_idx] or (dst_idx >= num_verts) + if invalid_src or invalid_dst: + break + weight = edge_weights[weight_idx] + u = vertex_paths[src_idx] + v = vertex_paths[dst_idx] + # Corresponding weight to edge is not correct + expr = "(src == {} and dst == {})".format(u, v) + edge_query = G.edgelist.edgelist_df.query(expr) + if edge_query.empty: + raise ValueError("edge_query didn't find:({},{})". + format(u, v)) + else: + if edge_query["weights"].values[0] != weight: + raise ValueError("edge_query weight incorrect") + + # Only increment if the current indices are valid + j += 1 + weight_idx += 1 + if j >= max_depth - 1: + path_at_end = True + # Check that path sizes matches up correctly with paths + if vertex_paths[i * max_depth] != seeds[i]: + raise ValueError("vertex_path start did not match seed \ + vertex:{}".format(vertex_paths.values)) + + +@pytest.mark.parametrize(*_get_param_args("graph_file", [LINE])) +@pytest.mark.parametrize(*_get_param_args("renumber", [True, False])) +def test_node2vec_renumber_cudf( + graph_file, + renumber +): + cu_M = cudf.read_csv(graph_file, delimiter=' ', + dtype=['int32', 'int32', 'float32'], header=None) + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2", + renumber=renumber) + + start_vertices = cudf.Series([8, 0, 7, 1, 6, 2], dtype="int32") + num_seeds = 6 + max_depth = 4 + + df, seeds = calc_node2vec( + G, + 
start_vertices, + max_depth, + compress_result=False, + p=0.8, + q=0.5 + ) + vertex_paths, edge_weights, vertex_path_sizes = df + + for i in range(num_seeds): + if vertex_paths[i * max_depth] != seeds[i]: + raise ValueError("vertex_path {} start did not match seed \ + vertex".format(vertex_paths.values)) diff --git a/python/cugraph/cugraph/tests/test_nx_compatibility.py b/python/cugraph/cugraph/tests/test_nx_compatibility.py deleted file mode 100644 index 90a5cbb46d1..00000000000 --- a/python/cugraph/cugraph/tests/test_nx_compatibility.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Temporarily suppress warnings till networkX fixes deprecation warnings -# (Using or importing the ABCs from 'collections' instead of from -# 'collections.abc' is deprecated, and in 3.8 it will stop working) for -# python 3.7. Also, this import networkx needs to be relocated in the -# third-party group once this gets fixed. 
-import pytest -from cugraph.tests import utils - -import networkx as nx - - -def test_nx_gbuilder(): - - # Create an empty graph - G = nx.Graph() - assert G.number_of_edges() == 0 - assert G.number_of_nodes() == 0 - - # Add a node - G.add_node(1) - assert G.number_of_edges() == 0 - assert G.number_of_nodes() == 1 - - # Add some edges - G.add_edges_from([(1, 2), (1, 3)]) - assert G.number_of_edges() == 2 - assert G.number_of_nodes() == 3 - - # Add some duplicates - G.add_edges_from([(1, 2), (1, 3)]) - G.add_node(1) - G.add_edge(1, 2) - assert G.number_of_edges() == 2 - assert G.number_of_nodes() == 3 - - # Add nodes with a property from a list - G.add_nodes_from([(4, {"color": "red"}), (5, {"color": "green"}), ]) - assert G.nodes[4]["color"] == "red" - - G.add_node("spam") # adds node "spam" - G.add_nodes_from("spam") # adds 4 nodes: 's', 'p', 'a', 'm' - G.add_edge(3, 'm') - assert G.number_of_edges() == 3 - assert G.number_of_nodes() == 10 - assert list(G.nodes) == [1, 2, 3, 4, 5, 'spam', 's', 'p', 'a', 'm'] - # remove nodes - G.remove_node(2) - G.remove_nodes_from("spam") - assert list(G.nodes) == [1, 3, 4, 5, 'spam'] - G.remove_edge(1, 3) - - # Access edge attributes - G = nx.Graph([(1, 2, {"color": "yellow"})]) - assert G[1][2] == {'color': 'yellow'} - assert G.edges[1, 2] == {'color': 'yellow'} - - -def test_nx_graph_functions(): - # test adjacency - FG = nx.Graph() - FG.add_weighted_edges_from([(1, 2, 0.125), (1, 3, 0.75), - (2, 4, 1.2), (3, 4, 0.375)]) - for n, nbrs in FG.adj.items(): - for nbr, eattr in nbrs.items(): - wt = eattr['weight'] - if wt < 0.5: - assert FG[n][nbr]['weight'] < 0.5 - # accessing graph edges - for (u, v, wt) in FG.edges.data('weight'): - if wt < 0.5: - assert FG[u][v]['weight'] <= 0.5 - else: - assert FG[u][v]['weight'] > 0.5 - - -def test_nx_analysis(): - G = nx.Graph() - G.add_edges_from([(1, 2), (1, 3)]) - G.add_node("spam") # adds node "spam" - assert list(nx.connected_components(G)) == [{1, 2, 3}, {'spam'}] - assert sorted(d for 
n, d in G.degree()) == [0, 1, 1, 2] - assert nx.clustering(G) == {1: 0, 2: 0, 3: 0, 'spam': 0} - assert list(nx.bfs_edges(G, 1)) == [(1, 2), (1, 3)] - - -@pytest.mark.parametrize( - "graph_file", - [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"dolphins.csv"]) -def test_with_dolphins(graph_file): - - df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) - G = nx.from_pandas_edgelist(df, create_using=nx.Graph(), - source="0", target="1", edge_attr="weight") - - assert G.degree(0) == 6 - assert G.degree(14) == 12 - assert G.degree(15) == 7 - assert G.degree(40) == 8 - assert G.degree(42) == 6 - assert G.degree(47) == 6 - assert G.degree(17) == 9 diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 72474a4a5fa..d7a0755df85 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -24,6 +24,7 @@ from pylibcugraph._cugraph_c.error cimport ( ) from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, + cugraph_type_erased_host_array_view_t, ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -203,3 +204,90 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_random_walk_result_t** result, cugraph_error_t** error ) + ########################################################################### + # hits + ctypedef struct cugraph_hits_result_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_hits_result_get_vertices( + cugraph_hits_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_hits_result_get_hubs( + cugraph_hits_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_hits_result_get_authorities( + cugraph_hits_result_t* result + ) + + cdef void \ + cugraph_hits_result_free( + cugraph_hits_result_t* result + ) + + cdef cugraph_error_code_t \ + cugraph_hits( + const 
cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + double tol, + size_t max_iter, + const cugraph_type_erased_device_array_view_t* initial_hubs_guess_vertices, + const cugraph_type_erased_device_array_view_t* initial_hubs_guess_values, + bool_t normalized, + bool_t do_expensive_check, + cugraph_hits_result_t** result, + cugraph_error_t** error + ) + + ########################################################################### + # sampling + ctypedef struct cugraph_sample_result_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_sources( + cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_destinations( + cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_start_labels( + cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_index( + cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_host_array_view_t* \ + cugraph_sample_result_get_counts( + cugraph_sample_result_t* result + ) + + cdef void \ + cugraph_sample_result_free( + cugraph_sample_result_t* result + ) + + # uniform neighborhood sampling + cdef cugraph_error_code_t \ + cugraph_uniform_neighbor_sample( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* start, + const cugraph_type_erased_device_array_view_t* start_labels, + const cugraph_type_erased_host_array_view_t* fan_out, + bool_t without_replacement, + bool_t do_expensive_check, + cugraph_sample_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/array.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/array.pxd index c399b67d3ca..621a91a11f3 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/array.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/array.pxd @@ -140,6 +140,14 
@@ cdef extern from "cugraph_c/array.h": cugraph_type_erased_host_array_pointer( const cugraph_type_erased_host_array_view_t* p ) + + # cdef void* \ + # cugraph_type_erased_host_array_view_copy( + # const cugraph_resource_handle_t* handle, + # cugraph_type_erased_host_array_view_t* dst, + # const cugraph_type_erased_host_array_view_t* src, + # cugraph_error_t** error + # ) cdef cugraph_error_code_t \ cugraph_type_erased_device_array_view_copy_from_host( diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph.pxd index e5313f710c1..cf445dfcea2 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph.pxd @@ -65,10 +65,7 @@ cdef extern from "cugraph_c/graph.h": const cugraph_type_erased_device_array_view_t* src, const cugraph_type_erased_device_array_view_t* dst, const cugraph_type_erased_device_array_view_t* weights, - const cugraph_type_erased_host_array_view_t* vertex_partition_offsets, - const cugraph_type_erased_host_array_view_t* segment_offsets, bool_t store_transposed, - size_t num_vertices, size_t num_edges, bool_t check, cugraph_graph_t** graph, diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 14b8947f9cb..7445cf9ab71 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -41,6 +41,9 @@ from pylibcugraph.graphs import EXPERIMENTAL__SGGraph SGGraph = experimental_warning_wrapper(EXPERIMENTAL__SGGraph) +from pylibcugraph.graphs import EXPERIMENTAL__MGGraph +MGGraph = experimental_warning_wrapper(EXPERIMENTAL__MGGraph) + from pylibcugraph.resource_handle import EXPERIMENTAL__ResourceHandle ResourceHandle = experimental_warning_wrapper(EXPERIMENTAL__ResourceHandle) @@ -53,5 +56,11 @@ from pylibcugraph.sssp import EXPERIMENTAL__sssp sssp = 
experimental_warning_wrapper(EXPERIMENTAL__sssp) +from pylibcugraph.hits import EXPERIMENTAL__hits +hits = experimental_warning_wrapper(EXPERIMENTAL__hits) + from pylibcugraph.node2vec import EXPERIMENTAL__node2vec node2vec = experimental_warning_wrapper(EXPERIMENTAL__node2vec) + +from pylibcugraph.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling +uniform_neighborhood_sampling = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood_sampling) diff --git a/python/pylibcugraph/pylibcugraph/graph_properties.pyx b/python/pylibcugraph/pylibcugraph/graph_properties.pyx index dc8b2a51225..84737e935df 100644 --- a/python/pylibcugraph/pylibcugraph/graph_properties.pyx +++ b/python/pylibcugraph/pylibcugraph/graph_properties.pyx @@ -22,6 +22,18 @@ cdef class EXPERIMENTAL__GraphProperties: self.c_graph_properties.is_symmetric = is_symmetric self.c_graph_properties.is_multigraph = is_multigraph + # Pickle support methods: get args for __new__ (__cinit__), get/set state + def __getnewargs_ex__(self): + is_symmetric = self.c_graph_properties.is_symmetric + is_multigraph = self.c_graph_properties.is_multigraph + return ((),{"is_symmetric":is_symmetric, "is_multigraph":is_multigraph}) + + def __getstate__(self): + return () + + def __setstate__(self, state): + pass + @property def is_symmetric(self): return bool(self.c_graph_properties.is_symmetric) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pxd b/python/pylibcugraph/pylibcugraph/graphs.pxd index 63cbb01f547..4d54d1f320d 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pxd +++ b/python/pylibcugraph/pylibcugraph/graphs.pxd @@ -27,6 +27,5 @@ cdef class _GPUGraph: cdef class EXPERIMENTAL__SGGraph(_GPUGraph): pass -# Not yet supported -# cdef class EXPERIMENTAL__MGGraph(_GPUGraph): -# pass +cdef class EXPERIMENTAL__MGGraph(_GPUGraph): + pass diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index c6038650869..579e70ea753 100644 
--- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -33,8 +33,16 @@ from pylibcugraph._cugraph_c.array cimport ( from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, cugraph_sg_graph_create, + cugraph_mg_graph_create, cugraph_graph_properties_t, cugraph_sg_graph_free, + cugraph_mg_graph_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, + cugraph_mg_graph_create, + cugraph_graph_properties_t, + cugraph_mg_graph_free, ) from pylibcugraph.resource_handle cimport ( EXPERIMENTAL__ResourceHandle, @@ -169,3 +177,123 @@ cdef class EXPERIMENTAL__SGGraph(_GPUGraph): def __dealloc__(self): if self.c_graph_ptr is not NULL: cugraph_sg_graph_free(self.c_graph_ptr) + + +cdef class EXPERIMENTAL__MGGraph(_GPUGraph): + """ + RAII-stye Graph class for use with multi-GPU APIs that manages the + individual create/free calls and the corresponding cugraph_graph_t pointer. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph_properties : GraphProperties + Object defining intended properties for the graph. + + src_array : device array type + Device array containing the vertex identifiers of the source of each + directed edge. The order of the array corresponds to the ordering of the + dst_array, where the ith item in src_array and the ith item in dst_array + define the ith edge of the graph. + + dst_array : device array type + Device array containing the vertex identifiers of the destination of + each directed edge. The order of the array corresponds to the ordering + of the src_array, where the ith item in src_array and the ith item in + dst_array define the ith edge of the graph. + + weight_array : device array type + Device array containing the weight values of each directed edge. 
The + order of the array corresponds to the ordering of the src_array and + dst_array arrays, where the ith item in weight_array is the weight value + of the ith edge of the graph. + + store_transposed : bool + Set to True if the graph should be transposed. This is required for some + algorithms, such as pagerank. + + num_edges : int + Number of edges + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + """ + def __cinit__(self, + EXPERIMENTAL__ResourceHandle resource_handle, + EXPERIMENTAL__GraphProperties graph_properties, + src_array, + dst_array, + weight_array, + store_transposed, + num_edges, + do_expensive_check): + + # FIXME: add tests for these + if not(isinstance(store_transposed, (int, bool))): + raise TypeError("expected int or bool for store_transposed, got " + f"{type(store_transposed)}") + if not(isinstance(num_edges, (int))): + raise TypeError("expected int for num_edges, got " + f"{type(num_edges)}") + if not(isinstance(do_expensive_check, (int, bool))): + raise TypeError("expected int or bool for do_expensive_check, got " + f"{type(do_expensive_check)}") + assert_CAI_type(src_array, "src_array") + assert_CAI_type(dst_array, "dst_array") + assert_CAI_type(weight_array, "weight_array") + + # FIXME: assert that src_array and dst_array have the same type + + cdef cugraph_error_t* error_ptr + cdef cugraph_error_code_t error_code + + cdef uintptr_t cai_srcs_ptr = \ + src_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* srcs_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_srcs_ptr, + len(src_array), + get_c_type_from_numpy_type(src_array.dtype)) + + cdef uintptr_t cai_dsts_ptr = \ + dst_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* dsts_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_dsts_ptr, + len(dst_array), + 
get_c_type_from_numpy_type(dst_array.dtype)) + + cdef uintptr_t cai_weights_ptr = \ + weight_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_type_from_numpy_type(weight_array.dtype)) + + error_code = cugraph_mg_graph_create( + resource_handle.c_resource_handle_ptr, + &(graph_properties.c_graph_properties), + srcs_view_ptr, + dsts_view_ptr, + weights_view_ptr, + store_transposed, + num_edges, + do_expensive_check, + &(self.c_graph_ptr), + &error_ptr) + + assert_success(error_code, error_ptr, + "cugraph_mg_graph_create()") + + cugraph_type_erased_device_array_view_free(srcs_view_ptr) + cugraph_type_erased_device_array_view_free(dsts_view_ptr) + cugraph_type_erased_device_array_view_free(weights_view_ptr) + + def __dealloc__(self): + if self.c_graph_ptr is not NULL: + cugraph_mg_graph_free(self.c_graph_ptr) diff --git a/python/pylibcugraph/pylibcugraph/hits.pyx b/python/pylibcugraph/pylibcugraph/hits.pyx new file mode 100644 index 00000000000..4eede47e488 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/hits.pyx @@ -0,0 +1,194 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + data_type_id_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_view_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_hits, + cugraph_hits_result_t, + cugraph_hits_result_get_vertices, + cugraph_hits_result_get_hubs, + cugraph_hits_result_get_authorities, + cugraph_hits_result_free, +) +from pylibcugraph.resource_handle cimport ( + EXPERIMENTAL__ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + assert_CAI_type, + copy_to_cupy_array, + get_c_type_from_numpy_type +) + + +def EXPERIMENTAL__hits(EXPERIMENTAL__ResourceHandle resource_handle, + _GPUGraph graph, + double tol, + size_t max_iter, + initial_hubs_guess_vertices, + initial_hubs_guess_values, + bool_t normalized, + bool_t do_expensive_check): + """ + Compute HITS hubs and authorities values for each vertex + + The HITS algorithm computes two numbers for a node. Authorities + estimates the node value based on the incoming links. Hubs estimates + the node value based on outgoing links. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + tol : float, optional (default=1.0e-5) + Set the tolerance the approximation, this parameter should be a small + magnitude value. This parameter is not currently supported. 
+ + max_iter : int, optional (default=100) + The maximum number of iterations before an answer is returned. + + initial_hubs_guess_vertices : device array type, optional (default=None) + Device array containing the pointer to the array of initial hub guess vertices + + initial_hubs_guess_values : device array type, optional (default=None) + Device array containing the pointer to the array of initial hub guess values + + normalized : bool, optional (default=True) + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays, where the third item in the tuple is a device + array containing the vertex identifiers, the first and second items are device + arrays containing respectively the hubs and authorities values for the corresponding + vertices + + Examples + -------- + # FIXME: No example yet + + """ + + cdef uintptr_t cai_initial_hubs_guess_vertices_ptr = NULL + cdef uintptr_t cai_initial_hubs_guess_values_ptr = NULL + + cdef cugraph_type_erased_device_array_view_t* initial_hubs_guess_vertices_view_ptr = NULL + cdef cugraph_type_erased_device_array_view_t* initial_hubs_guess_values_view_ptr = NULL + + # FIXME: Add check ensuring that both initial_hubs_guess_vertices + # and initial_hubs_guess_values are passed when calling only pylibcugraph HITS. 
+ # This is already True for cugraph HITS + + if initial_hubs_guess_vertices is not None: + assert_CAI_type(initial_hubs_guess_vertices, "initial_hubs_guess_vertices") + + cai_initial_hubs_guess_vertices_ptr = \ + initial_hubs_guess_vertices.__cuda_array_interface__["data"][0] + + initial_hubs_guess_vertices_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_initial_hubs_guess_vertices_ptr, + len(initial_hubs_guess_vertices), + get_c_type_from_numpy_type(initial_hubs_guess_vertices.dtype)) + + if initial_hubs_guess_values is not None: + assert_CAI_type(initial_hubs_guess_values, "initial_hubs_guess_values") + + cai_initial_hubs_guess_values_ptr = \ + initial_hubs_guess_values.__cuda_array_interface__["data"][0] + + initial_hubs_guess_values_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_initial_hubs_guess_values_ptr, + len(initial_hubs_guess_values), + get_c_type_from_numpy_type(initial_hubs_guess_values.dtype)) + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_hits_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + + error_code = cugraph_hits(c_resource_handle_ptr, + c_graph_ptr, + tol, + max_iter, + initial_hubs_guess_vertices_view_ptr, + initial_hubs_guess_values_view_ptr, + normalized, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_mg_hits") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \ + cugraph_hits_result_get_vertices(result_ptr) + cdef cugraph_type_erased_device_array_view_t* hubs_ptr = \ + cugraph_hits_result_get_hubs(result_ptr) + cdef cugraph_type_erased_device_array_view_t* authorities_ptr = \ + cugraph_hits_result_get_authorities(result_ptr) + + cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr) + cupy_hubs = copy_to_cupy_array(c_resource_handle_ptr, hubs_ptr) + cupy_authorities = copy_to_cupy_array(c_resource_handle_ptr, + authorities_ptr) + + cugraph_hits_result_free(result_ptr) + + if initial_hubs_guess_vertices is not None: + cugraph_type_erased_device_array_view_free( + initial_hubs_guess_vertices_view_ptr) + + if initial_hubs_guess_values is not None: + cugraph_type_erased_device_array_view_free( + initial_hubs_guess_values_view_ptr) + + return (cupy_vertices, cupy_hubs, cupy_authorities) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index ec45b234d92..9879a9a7be3 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -28,7 +28,7 @@ from pylibcugraph._cugraph_c.error cimport ( from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, cugraph_type_erased_device_array_view_create, - cugraph_type_erased_device_array_free, + cugraph_type_erased_device_array_view_free, ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -81,8 +81,9 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, Maximum number of vertices in generated path compress_result : bool_t - If true, the third return device array contains the sizes for each path, - otherwise outputs empty device array. + If true, the paths are unpadded and a third return device array contains + the sizes for each path, otherwise the paths are padded and the third + return device array is empty. 
p : double The return factor p represents the likelihood of backtracking to a node @@ -109,7 +110,7 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, >>> import pylibcugraph, cupy, numpy >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32) >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32) - >>> seeds = cupy.asarrray([0, 0, 1], dtype=numpy.int32) + >>> seeds = cupy.asarray([0, 0, 1], dtype=numpy.int32) >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32) >>> resource_handle = pylibcugraph.experimental.ResourceHandle() >>> graph_props = pylibcugraph.experimental.GraphProperties( @@ -172,7 +173,8 @@ def EXPERIMENTAL__node2vec(EXPERIMENTAL__ResourceHandle resource_handle, cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) cupy_path_sizes = copy_to_cupy_array(c_resource_handle_ptr, path_sizes_ptr) - + cugraph_random_walk_result_free(result_ptr) + cugraph_type_erased_device_array_view_free(seed_view_ptr) return (cupy_paths, cupy_weights, cupy_path_sizes) diff --git a/python/pylibcugraph/pylibcugraph/resource_handle.pyx b/python/pylibcugraph/pylibcugraph/resource_handle.pyx index a323751f2fb..77e3eca36b1 100644 --- a/python/pylibcugraph/pylibcugraph/resource_handle.pyx +++ b/python/pylibcugraph/pylibcugraph/resource_handle.pyx @@ -18,6 +18,8 @@ from pylibcugraph._cugraph_c.resource_handle cimport ( cugraph_create_resource_handle, cugraph_free_resource_handle, ) +#from cugraph.dask.traversal cimport mg_bfs as c_bfs +from pylibcugraph cimport resource_handle as c_resource_handle cdef class EXPERIMENTAL__ResourceHandle: @@ -25,8 +27,18 @@ cdef class EXPERIMENTAL__ResourceHandle: RAII-stye resource handle class to manage individual create/free calls and the corresponding pointer to a cugraph_resource_handle_t """ - def __cinit__(self): - self.c_resource_handle_ptr = cugraph_create_resource_handle(NULL) + def __cinit__(self, handle=None): + cdef void* handle_ptr = NULL + cdef size_t handle_size_t + if handle is not 
None: + # FIXME: rather than assume a RAFT handle here, consider something + # like a factory function in cugraph (which already has a RAFT + # dependency and makes RAFT assumptions) that takes a RAFT handle + # and constructs/returns a ResourceHandle + handle_size_t = handle + handle_ptr = handle_size_t + + self.c_resource_handle_ptr = cugraph_create_resource_handle(handle_ptr) # FIXME: check for error def __dealloc__(self): diff --git a/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py new file mode 100644 index 00000000000..4cf4b70e476 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import cupy as cp +import numpy as np +import cudf +from pylibcugraph.experimental import (MGGraph, + ResourceHandle, + GraphProperties, + uniform_neighborhood_sampling, + ) + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= +# fixtures used in this test module are defined in conftest.py + + +# ============================================================================= +# Tests +# ============================================================================= + + +def check_edges(result, srcs, dsts, weights, num_verts, num_edges, num_seeds): + result_srcs, result_dsts, result_labels, result_indices = result + h_src_arr = srcs.get() + h_dst_arr = dsts.get() + h_wgt_arr = weights.get() + + h_result_srcs = result_srcs.get() + h_result_dsts = result_dsts.get() + h_result_labels = result_labels.get() + h_result_indices = result_indices.get() + + # Following the C validation, we will check that all edges are part of the + # graph + M = np.zeros((num_verts, num_verts), dtype=np.float64) + + for idx in range(num_edges): + M[h_src_arr[idx]][h_dst_arr[idx]] = h_wgt_arr[idx] + + for edge in range(h_result_srcs): + assert M[h_result_srcs[edge]][h_result_dsts[edge]] > 0.0 + found = False + for j in range(num_seeds): + # Revise, this is not correct + found = found or (h_result_labels[edge] == h_result_indices[j]) + + +# TODO: Refactor after creating a helper within conftest.py to pass in an +# mg_graph_objs instance +@pytest.mark.skip(reason="pylibcugraph MG test infra not complete") +def test_neighborhood_sampling_cupy(): + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_symmetric=False, is_multigraph=False) + + device_srcs = cp.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=np.int32) + device_dsts = cp.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=np.int32) + device_weights = cp.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 
7.2, 3.2], + dtype=np.float32) + start_list = cp.asarray([2, 2], dtype=np.int32) + info_list = cp.asarray([0, 1], dtype=np.int32) + fanout_vals = cp.asarray([1, 2], dtype=np.int32) + + mg = MGGraph(resource_handle, + graph_props, + device_srcs, + device_dsts, + device_weights, + store_transposed=True, + num_edges=8, + do_expensive_check=False) + + result = uniform_neighborhood_sampling(resource_handle, + mg, + start_list, + info_list, + fanout_vals, + with_replacement=True, + do_expensive_check=False) + + check_edges(result, device_srcs, device_dsts, device_weights, 6, 8, 2) + + +@pytest.mark.skip(reason="pylibcugraph MG test infra not complete") +def test_neighborhood_sampling_cudf(): + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_symmetric=False, is_multigraph=False) + + device_srcs = cudf.Series([0, 1, 1, 2, 2, 2, 3, 4], dtype=np.int32) + device_dsts = cudf.Series([1, 3, 4, 0, 1, 3, 5, 5], dtype=np.int32) + device_weights = cudf.Series([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], + dtype=np.float32) + start_list = cudf.Series([2, 2], dtype=np.int32) + info_list = cudf.Series([0, 1], dtype=np.int32) + fanout_vals = cudf.Series([1, 2], dtype=np.int32) + + mg = MGGraph(resource_handle, + graph_props, + device_srcs, + device_dsts, + device_weights, + store_transposed=True, + num_edges=8, + do_expensive_check=False) + + result = uniform_neighborhood_sampling(resource_handle, + mg, + start_list, + info_list, + fanout_vals, + with_replacement=True, + do_expensive_check=False) + + check_edges(result, device_srcs, device_dsts, device_weights, 6, 8, 2) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py index 6525393a647..19871780aeb 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_node2vec.py @@ -14,6 +14,16 @@ import pytest import cupy as cp import numpy as np +from pylibcugraph.experimental import 
(ResourceHandle, + GraphProperties, + SGGraph, + node2vec) +from cugraph.tests import utils +import cugraph + + +COMPRESSED = [False, True] +LINE = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"small_line.csv" # ============================================================================= @@ -60,6 +70,88 @@ }, } + +# ============================================================================= +# Test helpers +# ============================================================================= +def _get_param_args(param_name, param_values): + """ + Returns a tuple of (, ) which can be applied + as the args to pytest.mark.parametrize(). The pytest.param list also + contains param id string formed from teh param name and values. + """ + return (param_name, + [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) + + +def _run_node2vec(src_arr, + dst_arr, + wgt_arr, + seeds, + num_vertices, + num_edges, + max_depth, + compressed_result, + p, + q, + renumbered): + """ + Builds a graph from the input arrays and runs node2vec using the other args + to this function, then checks the output for validity. 
+ """ + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_symmetric=False, is_multigraph=False) + G = SGGraph(resource_handle, graph_props, src_arr, dst_arr, wgt_arr, + store_transposed=False, renumber=renumbered, + do_expensive_check=True) + + (paths, weights, sizes) = node2vec(resource_handle, G, seeds, max_depth, + compressed_result, p, q) + + num_seeds = len(seeds) + + # Validating results of node2vec by checking each path + M = np.zeros((num_vertices, num_vertices), dtype=np.float64) + + h_src_arr = src_arr.get() + h_dst_arr = dst_arr.get() + h_wgt_arr = wgt_arr.get() + h_paths = paths.get() + h_weights = weights.get() + + for i in range(num_edges): + M[h_src_arr[i]][h_dst_arr[i]] = h_wgt_arr[i] + + if compressed_result: + path_offsets = np.zeros(num_seeds + 1, dtype=np.int32) + path_offsets[0] = 0 + for i in range(num_seeds): + path_offsets[i + 1] = path_offsets[i] + sizes.get()[i] + + for i in range(num_seeds): + for j in range(path_offsets[i], (path_offsets[i + 1] - 1)): + actual_wgt = h_weights[j - i] + expected_wgt = M[h_paths[j]][h_paths[j + 1]] + if pytest.approx(expected_wgt, 1e-4) != actual_wgt: + s = h_paths[j] + d = h_paths[j+1] + raise ValueError(f"Edge ({s},{d}) has wgt {actual_wgt}, " + f"should have been {expected_wgt}") + else: + max_path_length = int(len(paths) / num_seeds) + for i in range(num_seeds): + for j in range(max_path_length - 1): + curr_idx = i * max_path_length + j + next_idx = i * max_path_length + j + 1 + if (h_paths[next_idx] != num_vertices): + actual_wgt = h_weights[i * (max_path_length - 1) + j] + expected_wgt = M[h_paths[curr_idx]][h_paths[next_idx]] + if pytest.approx(expected_wgt, 1e-4) != actual_wgt: + s = h_paths[j] + d = h_paths[j+1] + raise ValueError(f"Edge ({s},{d}) has wgt {actual_wgt}" + f", should have been {expected_wgt}") + # ============================================================================= # Pytest fixtures # 
============================================================================= @@ -67,13 +159,107 @@ # ============================================================================= -# Tests +# Tests adapted from libcugraph # ============================================================================= +def test_node2vec_short(): + num_edges = 8 + num_vertices = 6 + src = cp.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=np.int32) + dst = cp.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=np.int32) + wgt = cp.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], + dtype=np.float32) + seeds = cp.asarray([0, 0], dtype=np.int32) + max_depth = 4 + + _run_node2vec(src, dst, wgt, seeds, num_vertices, num_edges, max_depth, + False, 0.8, 0.5, False) + + +def test_node2vec_short_dense(): + num_edges = 8 + num_vertices = 6 + src = cp.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=np.int32) + dst = cp.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=np.int32) + wgt = cp.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], + dtype=np.float32) + seeds = cp.asarray([2, 3], dtype=np.int32) + max_depth = 4 + + _run_node2vec(src, dst, wgt, seeds, num_vertices, num_edges, max_depth, + False, 0.8, 0.5, False) -@pytest.mark.parametrize("compress_result", [True, False]) -def test_node2vec(sg_graph_objs, compress_result): - from pylibcugraph.experimental import node2vec +def test_node2vec_short_sparse(): + num_edges = 8 + num_vertices = 6 + src = cp.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=np.int32) + dst = cp.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=np.int32) + wgt = cp.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], + dtype=np.float32) + seeds = cp.asarray([2, 3], dtype=np.int32) + max_depth = 4 + + _run_node2vec(src, dst, wgt, seeds, num_vertices, num_edges, max_depth, + True, 0.8, 0.5, False) + + +@pytest.mark.parametrize(*_get_param_args("compress_result", [True, False])) +@pytest.mark.parametrize(*_get_param_args("renumbered", [True, False])) +def test_node2vec_karate(compress_result, renumbered): + num_edges = 156 + 
num_vertices = 34 + src = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, + 2, 3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, + 32, 7, 12, 13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, + 33, 32, 33, 32, 33, 32, 33, 33, 32, 33, 32, 33, 25, 27, + 29, 32, 33, 25, 27, 31, 31, 29, 33, 33, 31, 33, 32, 33, + 32, 33, 32, 33, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 8, 8, 8, 9, 13, 14, + 14, 15, 15, 18, 18, 19, 20, 20, 22, 22, 23, 23, 23, 23, + 23, 24, 24, 24, 25, 26, 26, 27, 28, 28, 29, 29, 30, 30, + 31, 31, 32], + dtype=np.int32) + dst = cp.asarray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, + 4, 5, 5, 5, 6, 8, 8, 8, 9, 13, 14, 14, 15, 15, 18, 18, + 19, 20, 20, 22, 22, 23, 23, 23, 23, 23, 24, 24, 24, 25, + 26, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 1, 2, + 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2, 3, + 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, + 12, 13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, + 33, 32, 33, 32, 33, 33, 32, 33, 32, 33, 25, 27, 29, 32, + 33, 25, 27, 31, 31, 29, 33, 33, 31, 33, 32, 33, 32, 33, + 32, 33, 33], + dtype=np.int32) + wgt = cp.asarray([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0], + dtype=np.float32) + seeds = cp.asarray([12, 28, 20, 23, 15, 26], dtype=np.int32) + max_depth = 5 + + _run_node2vec(src, dst, wgt, seeds, num_vertices, num_edges, max_depth, + compress_result, 0.8, 0.5, renumbered) + + +# ============================================================================= +# Tests +# ============================================================================= +@pytest.mark.parametrize(*_get_param_args("compress_result", [True, False])) +def test_node2vec(sg_graph_objs, compress_result): (g, resource_handle, ds_name) = sg_graph_objs (seeds, expected_paths, expected_weights, expected_path_sizes, max_depth) \ @@ -102,9 +288,6 @@ def test_node2vec(sg_graph_objs, compress_result): # up with weights array assert len(actual_path_sizes) == num_paths expected_walks = sum(exp_path_sizes) - num_paths - # FIXME: When using multiple seeds, paths are connected via the weights - # array, there should not be a weight connecting the end of a path with - # the beginning of another. PR #2089 will resolve this. 
# Verify the number of walks was equal to path sizes - num paths assert len(actual_weights) == expected_walks else: @@ -129,3 +312,64 @@ def test_node2vec(sg_graph_objs, compress_result): assert actual_path_sizes[i] == exp_path_sizes[i] assert actual_paths[path_start] == seeds[i] path_start += actual_path_sizes[i] + + +@pytest.mark.parametrize(*_get_param_args("graph_file", [LINE])) +@pytest.mark.parametrize(*_get_param_args("renumber", COMPRESSED)) +def test_node2vec_renumber_cudf(graph_file, renumber): + from cudf import read_csv, Series + + cu_M = read_csv(graph_file, delimiter=' ', + dtype=['int32', 'int32', 'float32'], header=None) + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2", + renumber=renumber) + src_arr = G.edgelist.edgelist_df['src'] + dst_arr = G.edgelist.edgelist_df['dst'] + wgt_arr = G.edgelist.edgelist_df['weights'] + seeds = Series([8, 0, 7, 1, 6, 2], dtype="int32") + max_depth = 4 + num_seeds = 6 + + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_symmetric=False, is_multigraph=False) + G = SGGraph(resource_handle, graph_props, src_arr, dst_arr, wgt_arr, + store_transposed=False, renumber=renumber, + do_expensive_check=True) + + (paths, weights, sizes) = node2vec(resource_handle, G, seeds, max_depth, + False, 0.8, 0.5) + + for i in range(num_seeds): + if paths[i * max_depth] != seeds[i]: + raise ValueError("vertex_path {} start did not match seed \ + vertex".format(paths)) + + +@pytest.mark.parametrize(*_get_param_args("graph_file", [LINE])) +@pytest.mark.parametrize(*_get_param_args("renumber", COMPRESSED)) +def test_node2vec_renumber_cupy(graph_file, renumber): + import cupy as cp + import numpy as np + + src_arr = cp.asarray([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32) + dst_arr = cp.asarray([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int32) + wgt_arr = cp.asarray([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + dtype=np.float32) + seeds = cp.asarray([8, 0, 7, 1, 6, 2], 
dtype=np.int32) + max_depth = 4 + num_seeds = 6 + + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_symmetric=False, is_multigraph=False) + G = SGGraph(resource_handle, graph_props, src_arr, dst_arr, wgt_arr, + store_transposed=False, renumber=renumber, + do_expensive_check=True) + + (paths, weights, sizes) = node2vec(resource_handle, G, seeds, max_depth, + False, 0.8, 0.5) + + for i in range(num_seeds): + if paths[i * max_depth] != seeds[i]: + raise ValueError("vertex_path {} start did not match seed \ + vertex".format(paths)) diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx new file mode 100644 index 00000000000..cf1701f04b9 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx @@ -0,0 +1,179 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + data_type_id_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_free, + cugraph_type_erased_host_array_view_t, + cugraph_type_erased_host_array_view_create +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_uniform_neighbor_sample, + cugraph_sample_result_t, + cugraph_sample_result_get_sources, + cugraph_sample_result_get_destinations, + cugraph_sample_result_get_start_labels, + cugraph_sample_result_get_index, + cugraph_sample_result_get_counts, + cugraph_sample_result_free, +) +from pylibcugraph.resource_handle cimport ( + EXPERIMENTAL__ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, + EXPERIMENTAL__MGGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + assert_CAI_type, + assert_AI_type, + get_c_type_from_numpy_type, +) + + +def EXPERIMENTAL__uniform_neighborhood_sampling(EXPERIMENTAL__ResourceHandle resource_handle, + EXPERIMENTAL__MGGraph input_graph, + start_list, + labels_list, + h_fan_out, + bool_t with_replacement, + bool_t do_expensive_check): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. + + Parameters + ---------- + resource_handle: ResourceHandle + Handle to the underlying device and host resources needed for + referencing data and running algorithms. + + input_graph: MGGraph + The input graph, for Multi-GPU operations. 
+ +    start_list: device array type +        Device array containing the list of starting vertices for sampling. + +    labels_list: device array type +        Device array containing the starting labels for reorganizing the +        results after sending the input to different callers. + +    h_fan_out: numpy array type +        Host array containing the branching out (fan-out) degrees per +        starting vertex for each hop level. + +    with_replacement: bool +        If true, sampling procedure is done with replacement (the same vertex +        can be selected multiple times in the same step). + +    do_expensive_check: bool +        If True, performs more extensive tests on the inputs to ensure +        validity, at the expense of increased run time. + +    Returns +    ------- +    A tuple of device arrays, where the first and second items in the tuple +    are device arrays containing the starting and ending vertices of each +    walk respectively, the third item in the tuple is a device array +    containing the start labels, the fourth item in the tuple is a device +    array containing the indices for reconstructing paths. 
+ + """ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr + + assert_CAI_type(start_list, "start_list") + assert_CAI_type(labels_list, "labels_list") + assert_AI_type(h_fan_out, "h_fan_out") + + cdef cugraph_sample_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_start_ptr = \ + start_list.__cuda_array_interface__["data"][0] + cdef uintptr_t cai_labels_ptr = \ + labels_list.__cuda_array_interface__["data"][0] + cdef uintptr_t ai_fan_out_ptr = \ + h_fan_out.__array_interface__["data"][0] + + cdef cugraph_type_erased_device_array_view_t* start_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_start_ptr, + len(start_list), + get_c_type_from_numpy_type(start_list.dtype)) + cdef cugraph_type_erased_device_array_view_t* start_labels_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_labels_ptr, + len(labels_list), + get_c_type_from_numpy_type(labels_list.dtype)) + cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ + cugraph_type_erased_host_array_view_create( + ai_fan_out_ptr, + len(h_fan_out), + get_c_type_from_numpy_type(h_fan_out.dtype)) + + error_code = cugraph_uniform_neighbor_sample(c_resource_handle_ptr, + c_graph_ptr, + start_ptr, + start_labels_ptr, + fan_out_ptr, + with_replacement, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "uniform_nbr_sample") + + # TODO: counts is a part of the output, but another copy_to_cupy array + # with appropriate host array types would likely be required. 
Also + # potential memory leak until this is covered + cdef cugraph_type_erased_device_array_view_t* src_ptr = \ + cugraph_sample_result_get_sources(result_ptr) + cdef cugraph_type_erased_device_array_view_t* dst_ptr = \ + cugraph_sample_result_get_destinations(result_ptr) + cdef cugraph_type_erased_device_array_view_t* labels_ptr = \ + cugraph_sample_result_get_start_labels(result_ptr) + cdef cugraph_type_erased_device_array_view_t* index_ptr = \ + cugraph_sample_result_get_index(result_ptr) + # cdef cugraph_type_erased_host_array_view_t* counts_ptr = \ + # cugraph_sample_result_get_counts(result_ptr) + + cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) + cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) + cupy_labels = copy_to_cupy_array(c_resource_handle_ptr, labels_ptr) + cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) + # cupy_counts = copy_to_cupy_array(c_resource_handle_ptr, counts_ptr) + + return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices) + # return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices, cupy_counts) diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index 1a357f048d4..3f508b85fbb 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -33,6 +33,8 @@ cdef assert_success(cugraph_error_code_t code, cdef assert_CAI_type(obj, var_name, allow_None=*) +cdef assert_AI_type(obj, var_name, allow_None=*) + cdef get_numpy_type_from_c_type(data_type_id_t c_type) cdef get_c_type_from_numpy_type(numpy_type) diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index 0905cf1594d..54b39dc6843 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -63,6 +63,18 @@ cdef assert_CAI_type(obj, var_name, allow_None=False): raise TypeError(msg) +cdef assert_AI_type(obj, var_name, 
allow_None=False): + if allow_None: + if obj is None: + return + msg = f"{var_name} must be None or support __array_interface__" + else: + msg = f"{var_name} does not support __array_interface__" + + if not(hasattr(obj, "__array_interface__")): + raise TypeError(msg) + + cdef get_numpy_type_from_c_type(data_type_id_t c_type): if c_type == data_type_id_t.INT32: return numpy.int32