diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 8863a753fc7..c71a1af4808 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
---
name: Bug report
-about: Create a report to help us improve
+about: Create a report to help us improve cuGraph
title: "[BUG]"
labels: "? - Needs Triage, bug"
assignees: ''

@@ -10,29 +10,19 @@ assignees: ''
**Describe the bug**
A clear and concise description of what the bug is.

-**To Reproduce**
-Steps to reproduce the behavior:
-1. Go to '...'
-2. Click on '....'
-3. Scroll down to '....'
-4. See error
+**Steps/Code to reproduce bug**
+Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve it more quickly.

**Expected behavior**
A clear and concise description of what you expected to happen.

-**Screenshots**
-If applicable, add screenshots to help explain your problem.
+**Environment overview (please complete the following information)**
+ - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
+ - Method of cuGraph install: [conda, Docker, or from source]
+ - If method of install is [Docker], provide `docker pull` & `docker run` commands used

-**Desktop (please complete the following information):**
- - OS: [e.g. iOS]
- - Browser [e.g. chrome, safari]
- - Version [e.g. 22]
-
-**Smartphone (please complete the following information):**
- - Device: [e.g. iPhone6]
- - OS: [e.g. iOS8.1]
- - Browser [e.g. stock browser, safari]
- - Version [e.g. 22]
+**Environment details**
+Please run and paste the output of the `cugraph/print_env.sh` script here, to gather any other relevant environment details.

**Additional context**
Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/documentation_request.md b/.github/ISSUE_TEMPLATE/documentation_request.md
new file mode 100644
index 00000000000..595a87e191e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation_request.md
@@ -0,0 +1,35 @@
+---
+name: Documentation request
+about: Report incorrect or needed documentation
+title: "[DOC]"
+labels: "? - Needs Triage, doc"
+assignees: ''
+
+---
+
+## Report incorrect documentation
+
+**Location of incorrect documentation**
+Provide links and line numbers if applicable.
+
+**Describe the problems or issues found in the documentation**
+A clear and concise description of what you found to be incorrect.
+
+**Steps taken to verify documentation is incorrect**
+List any steps you have taken:
+
+**Suggested fix for documentation**
+Detail proposed changes to fix the documentation if you have any.
+
+---
+
+## Report needed documentation
+
+**Report needed documentation**
+A clear and concise description of what documentation you believe is needed and why.
+
+**Describe the documentation you'd like**
+A clear and concise description of what you want to happen.
+
+**Steps taken to search for needed documentation**
+List any steps you have taken:
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index c3b1a9ac71d..e5e02a4cb2d 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -8,7 +8,7 @@ assignees: ''

---

**Is your feature request related to a problem? Please describe.**
-A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+A clear and concise description of what the problem is. Ex. I wish I could use cuGraph to do [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

@@ -16,7 +16,5 @@ A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

-**Task List**
-A clear list of task should be called out

**Additional context**
-Add any other context or screenshots about the feature request here.
+Add any other context, code examples, or references to existing implementations for the feature request here.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
index cc2d5cb79ad..a9b590525aa 100644
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -1,6 +1,6 @@
---
name: Question
-about: Ask a Question
+about: Ask a Question about cuGraph
title: "[QST]"
labels: "? - Needs Triage, question"
assignees: ''
diff --git a/.gitignore b/.gitignore
index 517ceab566b..30bcd5a845d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -76,4 +76,7 @@ cpp/doxygen/html

# Raft symlink
python/cugraph/raft
-python/_external_repositories/
\ No newline at end of file
+python/_external_repositories/
+
+# created by Dask tests
+python/dask-worker-space
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5118b9c9059..5036d07e005 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,9 +1,102 @@
+# cuGraph 0.15.0 (26 Aug 2020)
+
+## New Features
+- PR #940 Add MG Batch BC
+- PR #937 Add wrapper for gunrock HITS algorithm
+- PR #939 Updated Notebooks to include new features and benchmarks
+- PR #944 MG pagerank (dask)
+- PR #947 MG pagerank (CUDA)
+- PR #826 Bipartite Graph python API
+- PR #963 Renumbering refactor, add multi GPU support
+- PR #964 MG BFS (CUDA)
+- PR #990 MG Consolidation
+- PR #993 Add persistent Handle for Comms
+- PR #979 Add hypergraph implementation to convert DataFrames into Graphs
+- PR #1010 MG BFS (dask)
+- PR #1018 MG personalized pagerank
+- PR #1047 Updated select tests to use new dataset list that includes asymmetric directed graph
+- PR #1090 Add experimental Leiden function
+- PR #1077 Updated/added copyright notices, added copyright CI check from cuml
+- PR #1100 Add support for new build process (Project Flash)
+- PR #1093 New benchmarking notebook
+
+## Improvements
+- PR #898 Add Edge Betweenness Centrality, and endpoints to BC
+- PR #913 Eliminate `rmm.device_array` usage
+- PR #903 Add short commit hash to conda package
+- PR #920 modify bfs test, update graph number_of_edges, update storage of transposedAdjList in Graph
+- PR #933 Update mg_degree to use raft, add python tests
+- PR #930 rename test_utils.h to utilities/test_utils.hpp and remove thrust dependency
+- PR #934 Update conda dev environment.yml dependencies to 0.15
+- PR #942 Removed references to deprecated RMM headers.
+- PR #941 Regression python/cudf fix
+- PR #945 Simplified benchmark --no-rmm-reinit option, updated default options
+- PR #946 Install meta packages for dependencies
+- PR #952 Updated get_test_data.sh to also (optionally) download and install datasets for benchmark runs
+- PR #953 fix setting RAFT_DIR from the RAFT_PATH env var
+- PR #954 Update cuGraph error handling to use RAFT
+- PR #968 Add build script for CI benchmark integration
+- PR #959 Add support for uint32_t and int64_t types for BFS (cpp side)
+- PR #962 Update dask pagerank
+- PR #975 Upgrade GitHub template
+- PR #976 Fix error in Graph.edges(), update cuDF rename() calls
+- PR #977 Update force_atlas2 to call on_train_end after iterating
+- PR #980 Replace nvgraph Spectral Clustering (SC) functionality with RAFT SC
+- PR #987 Move graph out of experimental namespace
+- PR #984 Removing codecov until we figure out how to interpret failures that block CI
+- PR #985 Add raft handle to BFS, BC and edge BC
+- PR #991 Update conda upload versions for new supported CUDA/Python
+- PR #988 Add clang and clang tools to the conda env
+- PR #997 Update setup.cfg to run pytests under cugraph tests directory only
+- PR #1007 Add tolerance support to MG Pagerank and fix
+- PR #1009 Update benchmarks script to include requirements used
+- PR #1014 Fix benchmarks script variable name
+- PR #1021 Update cuGraph to use RAFT CUDA utilities
+- PR #1019 Remove deprecated CUDA library calls
+- PR #1024 Updated conda environment YML files
+- PR #1026 update chunksize for mnmg, remove files and unused code
+- PR #1028 Update benchmarks script to use ASV_LABEL
+- PR #1030 MG directory org and documentation
+- PR #1020 Updated Louvain to honor max_level, ECG now calls Louvain for 1 level, then full run.
+- PR #1031 MG notebook
+- PR #1034 Expose resolution (gamma) parameter in Louvain
+- PR #1037 Centralize test main function and replace usage of deprecated `cnmem_memory_resource`
+- PR #1041 Use S3 bucket directly for benchmark plugin
+- PR #1056 Fix MG BFS performance
+- PR #1062 Compute max_vertex_id in mnmg local data computation
+- PR #1068 Remove unused thirdparty code
+- PR #1105 Update `master` references to `main`
+
+## Bug Fixes
+- PR #936 Update Force Atlas 2 doc and wrapper
+- PR #938 Quote conda installs to avoid bash interpretation
+- PR #966 Fix build error (debug mode)
+- PR #983 Fix offset calculation in COO to CSR
+- PR #989 Fix issue with incorrect docker image being used in local build script
+- PR #992 Fix unrenumber of predecessor
+- PR #1008 Fix for cudf updates disabling iteration of Series/Columns/Index
+- PR #1012 Fix Local build script README
+- PR #1017 Fix more mg bugs
+- PR #1022 Fix support for using a cudf.DataFrame with a MG graph
+- PR #1025 Explicitly skip raft test folder for pytest 6.0.0
+- PR #1027 Fix documentation
+- PR #1033 Fix repartition error in big datasets, updated coroutine, fixed warnings
+- PR #1036 Fixed benchmarks for new renumbering API, updated comments, added quick test-only benchmark run to CI
+- PR #1040 Fix spectral clustering renumbering issue
+- PR #1057 Updated raft dependency to pull fixes on cusparse selection in CUDA 11
+- PR #1066 Update cugunrock to not build for unsupported CUDA architectures
+- PR #1069 Fixed CUDA 11 Pagerank crash by replacing CUB's SpMV with raft's.
+- PR #1083 Fix NBs to run in nightly test run, update renumbering text, cleanup
+- PR #1087 Updated benchmarks README to better describe how to get plugin, added rapids-pytest-benchmark plugin to conda dev environments
+- PR #1101 Removed unnecessary device-to-host copy which caused a performance regression
+- PR #1106 Added new release.ipynb to notebook test skip list
+
# cuGraph 0.14.0 (03 Jun 2020)

## New Features
- PR #756 Add Force Atlas 2 layout
- PR #822 Added new functions in python graph class, similar to networkx
-- PR #840 OPG degree
+- PR #840 MG degree
- PR #875 UVM notebook
- PR #881 Raft integration infrastructure

@@ -24,7 +117,7 @@
- PR #807 Updating the Python docs
- PR #817 Add native Betweenness Centrality with sources subset
- PR #818 Initial version of new "benchmarks" folder
-- PR #820 OPG infra and all-gather smoke test
+- PR #820 MG infra and all-gather smoke test
- PR #823 Remove gdf column from nvgraph
- PR #829 Updated README and CONTRIBUTIOIN docs
- PR #831 Updated Notebook - Added K-Truss, ECG, and Betweenness Centrality

@@ -41,6 +134,7 @@
- PR #874 Update setup.py to use custom clean command
- PR #876 Add BFS C++ tests
- PR #878 Updated build script
+- PR #887 Updates test to common datasets
- PR #879 Add docs build script to repository
- PR #880 Remove remaining gdf_column references
- PR #882 Add Force Atlas 2 to benchmarks

@@ -49,6 +143,7 @@
- PR #897 Remove RMM ALLOC calls
- PR #899 Update include paths to remove deleted cudf headers
- PR #906 Update Louvain notebook
+- PR #948 Move doc customization scripts to Jenkins

## Bug Fixes
- PR #927 Update scikit learn dependency

@@ -65,11 +160,13 @@
- PR #860 Fix all Notebooks
- PR #870 Fix Louvain
- PR #889 Added missing conftest.py file to benchmarks dir
-- PR #896 opg dask infrastructure fixes
+- PR #896 mg dask infrastructure fixes
- PR #907 Fix bfs directed missing vertices
- PR #911 Env and changelog update
- PR #923 Updated pagerank with @afender 's temp fix for double-free crash
- PR #928 Fix scikit learn test install to work with libgcc-ng 7.3
+- PR #935 Merge
+- PR #956 Use new gpuCI image in local build script

# cuGraph 0.13.0 (31 Mar 2020)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 54c931bdae5..ddd4fd0f9f4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -13,8 +13,8 @@ __Style Formatting Tools:__
* `flake8` version 3.5.0+

-
-## 1) File an Issue for the RAPIDS cuGraph team to work
+
+## 1) File an Issue for the RAPIDS cuGraph team to work
To file an issue, go to the RAPIDS cuGraph [issue](https://github.com/rapidsai/cugraph/issues/new/choose) page an select the appropriate issue type. Once an issue is filed the RAPIDS cuGraph team will evaluate and triage the issue. If you believe the issue needs priority attention, please include that in the issue to notify the team.

***Bug Report***

@@ -36,8 +36,8 @@ There are several ways to ask questions, including [Stack Overflow]( https://sta
- describing your question

-
-## 2) Propose a New Feature and Implement It
+
+## 2) Propose a New Feature and Implement It

We love when people want to get involved, and if you have a suggestion for a new feature or enhancement and want to be the one doing the development work, we fully encourage that.

@@ -46,8 +46,8 @@ We love when people want to get involved, and if you have a suggestion for a new
- Once we agree that the plan looks good, go ahead and implement it
- Follow the [code contributions](#code-contributions) guide below.

-
-## 3) You want to implement a feature or bug-fix for an outstanding issue
+
+## 3) You want to implement a feature or bug-fix for an outstanding issue
- Find an open Issue, and post that you would like to work that issues
- Once we agree that the plan looks good, go ahead and implement it
- Follow the [code contributions](#code-contributions) guide below.

@@ -55,8 +55,8 @@ We love when people want to get involved, and if you have a suggestion for a new
If you need more context on a particular issue, please ask.

----
-
-# So you want to contribute code
+
+# So you want to contribute code

**TL;DR General Development Process**
1. Read the documentation on [building from source](SOURCEBUILD.md) to learn how to setup, and validate, the development environment

@@ -74,11 +74,14 @@ If you need more context on a particular issue, please ask.

Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications!

+**The _FIXME_ comment**
+
+Use the _FIXME_ comment to capture technical debt. It should not be used to flag bugs since those need to be cleaned up before code is submitted.
+We are implementing a script to count and track the number of FIXMEs in the code. Usage of TODO or any other tag will not be accepted.

-## Fork a private copy of cuGraph
-
+## Fork a private copy of cuGraph
The RAPIDS cuGraph repo cannot directly be modified. Contributions must come in the form of a *Pull Request* from a forked version of cugraph. GitHub as a nice write up ion the process: https://help.github.com/en/github/getting-started-with-github/fork-a-repo

1. Fork the cugraph repo to your GitHub account

@@ -92,7 +95,8 @@ Read the section on [building cuGraph from source](SOURCEBUILD.md) to validate t
```git remote add upstream https://github.com/rapidsai/cugraph.git```

3. Checkout the latest branch
-cuGraph only allows contribution to the current branch and not main or a future branch. PLease check the [cuGraph](https://github.com/rapidsai/cugraph) page for the name of the current branch.
+cuGraph only allows contribution to the current branch and not main or a future branch. Please check the [cuGraph](https://github.com/rapidsai/cugraph) page for the name of the current branch.
+
```git checkout branch-x.x```

4. Code .....
diff --git a/Dockerfile b/Dockerfile
index 53169427136..de0b1e8c10b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-# built from https://github.com/rapidsai/cudf/blob/master/Dockerfile
+# built from https://github.com/rapidsai/cudf/blob/main/Dockerfile
FROM cudf

ADD src /cugraph/src
diff --git a/PRTAGS.md b/PRTAGS.md
index 91c47e035a4..8ec23ea30ac 100644
--- a/PRTAGS.md
+++ b/PRTAGS.md
@@ -8,5 +8,5 @@ PR = Pull Request
| WIP | _Work In Progress_ - Within the RAPIDS cuGraph team, we try to open a PR when development starts. This allows other to review code as it is being developed and provide feedback before too much code needs to be refactored. It also allows process to be tracked |
| skip-ci | _Do Not Run CI_ - This flag prevents CI from being run. It is good practice to include this with the **WIP** tag since code is typically not at a point where it will pass CI. |
| skip ci | same as above |
-| API-REVIEW | This tag request a code review just of the API portion of the code - This is benificial to ensure that all required arguments are captured. Doing this early can save from having to refactor later. |
-| REVIEW | The code is ready for a full code review. Only code that has passed a code review is merged into the baseline |
\ No newline at end of file
+| API-REVIEW | This tag requests a code review just of the API portion of the code - This is beneficial to ensure that all required arguments are captured. Doing this early can save from having to refactor later. |
+| REVIEW | The code is ready for a full code review. Only code that has passed a code review is merged into the baseline |
diff --git a/README.md b/README.md
index f745ea1a0e3..45405d902bf 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

[![Build Status](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cugraph/job/branches/job/cugraph-branch-pipeline/badge/icon)](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cugraph/job/branches/job/cugraph-branch-pipeline/)

-The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerated graph algorithms that process data found in [GPU DataFrames](https://github.com/rapidsai/cudf). The vision of cuGraph is _to make graph analysis ubiquitous to the point that users just think in terms of analysis and not technologies or frameworks_. To realize that vision, cuGraph operators, at the Python layer, on GPU DataFrames, allowing for seamless passing of data between ETL tasks in [cuDF](https://github.com/rapidsai/cudf) and machine learning tasks in [cuML](https://github.com/rapidsai/cuml). Data scientist familiar with Python will quickly pick up how cuGraph integrates with the Pandas-like API of cuDF. Likewise, user familiar with NetworkX will quickly reconnize the NetworkX-like API provided in cuGraph, with the goal being to allow existing code to be ported with minimal effort into RAPIDS. For users familiar with C++/CUDA and graph structures, a C++ API is also provided. However, there is less type and structure checking at the C++ layer.
+The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerated graph algorithms that process data found in [GPU DataFrames](https://github.com/rapidsai/cudf). The vision of cuGraph is _to make graph analysis ubiquitous to the point that users just think in terms of analysis and not technologies or frameworks_. To realize that vision, cuGraph operates, at the Python layer, on GPU DataFrames, thereby allowing for seamless passing of data between ETL tasks in [cuDF](https://github.com/rapidsai/cudf) and machine learning tasks in [cuML](https://github.com/rapidsai/cuml). Data scientists familiar with Python will quickly pick up how cuGraph integrates with the Pandas-like API of cuDF. Likewise, users familiar with NetworkX will quickly recognize the NetworkX-like API provided in cuGraph, with the goal to allow existing code to be ported with minimal effort into RAPIDS. For users familiar with C++/CUDA and graph structures, a C++ API is also provided. However, there is less type and structure checking at the C++ layer.

For more project details, see [rapids.ai](https://rapids.ai/).

@@ -10,59 +10,62 @@ The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerat

-```markdown
+```python
import cugraph
+import cudf

# read data into a cuDF DataFrame using read_csv
-gdf = cudf.read_csv("graph_data.csv", names=["src", "dst"], dtype=["int32", "int32"] )
+gdf = cudf.read_csv("graph_data.csv", names=["src", "dst"], dtype=["int32", "int32"])

# We now have data as edge pairs
-# create a Graph using the source (src) and destination (dst) vertex pairs the GDF
+# create a Graph using the source (src) and destination (dst) vertex pairs
G = cugraph.Graph()
G.from_cudf_edgelist(gdf, source='src', destination='dst')

# Let's now get the PageRank score of each vertex by calling cugraph.pagerank
-gdf_page = cugraph.pagerank(G)
+df_page = cugraph.pagerank(G)

# Let's look at the PageRank Score (only do this on small graphs)
-for i in range(len(gdf_page)):
-    print("vertex " + str(gdf_page['vertex'][i]) +
-          " PageRank is " + str(gdf_page['pagerank'][i]))
+for i in range(len(df_page)):
+    print("vertex " + str(df_page['vertex'].iloc[i]) +
+          " PageRank is " + str(df_page['pagerank'].iloc[i]))
```

## Supported Algorithms

-| Category | Algorithm | Sacle | Notes
+| Category | Algorithm | Scale | Notes
| ------------ | -------------------------------------- | ------------ | ------------------- |
| Centrality | | | |
| | Katz | Single-GPU | |
| | Betweenness Centrality | Single-GPU | |
+| | Edge Betweenness Centrality | Single-GPU | |
| Community | | | |
+| | Leiden | Single-GPU | |
| | Louvain | Single-GPU | |
| | Ensemble Clustering for Graphs | Single-GPU | |
| | Spectral-Clustering - Balanced Cut | Single-GPU | |
-| | Spectral-Clustering | Single-GPU | |
+| | Spectral-Clustering - Modularity | Single-GPU | |
| | Subgraph Extraction | Single-GPU | |
| | Triangle Counting | Single-GPU | |
+| | K-Truss | Single-GPU | |
| Components | | | |
| | Weakly Connected Components | Single-GPU | |
| | Strongly Connected Components | Single-GPU | |
| Core | | | |
| | K-Core | Single-GPU | |
| | Core Number | Single-GPU | |
-| | K-Truss | Single-GPU | |
| Layout | | | |
| | Force Atlas 2 | Single-GPU | |
| Link Analysis| | | |
-| | Pagerank | Single-GPU | Multi-GPU on DGX avaible |
-| | Personal Pagerank | Single-GPU | |
+| | Pagerank | Multiple-GPU | limited to 2 billion vertices |
+| | Personal Pagerank | Multiple-GPU | limited to 2 billion vertices |
+| | HITS | Single-GPU | leverages Gunrock |
| Link Prediction | | | |
-| | Jacard Similarity | Single-GPU | |
-| | Weighted Jacard Similarity | Single-GPU | |
+| | Jaccard Similarity | Single-GPU | |
+| | Weighted Jaccard Similarity | Single-GPU | |
| | Overlap Similarity | Single-GPU | |
| Traversal | | | |
-| | Breadth First Search (BFS) | Single-GPU | |
+| | Breadth First Search (BFS) | Multiple-GPU | limited to 2 billion vertices |
| | Single Source Shortest Path (SSSP) | Single-GPU | |
| Structure | | | |
| | Renumbering | Single-GPU | Also for multiple columns |

@@ -78,26 +81,25 @@ for i in range(len(gdf_page)):

## cuGraph Notice

The current version of cuGraph has some limitations:

-- Vertex IDs need to be 32-bit integers.
+- Vertex IDs need to be 32-bit integers (that restriction is going away in 0.16)
- Vertex IDs are expected to be contiguous integers starting from 0.
-- If the starting index is not zero, cuGraph will add disconnected vertices to fill in the missing range. (Auto-) Renumbering fixes this issue

-cuGraph provides the renumber function to mitigate this problem. Input vertex IDs for the renumber function can be any type, can be non-contiguous, and can start from an arbitrary number. The renumber function maps the provided input vertex IDs to 32-bit contiguous integers starting from 0. cuGraph still requires the renumbered vertex IDs to be representable in 32-bit integers. These limitations are being addressed and will be fixed soon.
+cuGraph provides the renumber function to mitigate this problem, which is by default automatically called when data is added to a graph. Input vertex IDs for the renumber function can be any type, can be non-contiguous, can be multiple columns, and can start from an arbitrary number. The renumber function maps the provided input vertex IDs to 32-bit contiguous integers starting from 0. cuGraph still requires the renumbered vertex IDs to be representable in 32-bit integers. These limitations are being addressed and will be fixed soon.

-cuGraph provides an auto-renumbering feature, enabled by default, during Graph creating. Renumbered vertices are automaticaly un-renumbered.
+Additionally, when using the auto-renumbering feature, vertices are automatically un-renumbered in results (a short sketch of this flow follows this README diff).

-cuGraph is constantly being updatred and improved. Please see the [Transition Guide](TRANSITIONGUIDE.md) if errors are encountered with newer versions
+cuGraph is constantly being updated and improved. Please see the [Transition Guide](TRANSITIONGUIDE.md) if errors are encountered with newer versions

## Graph Sizes and GPU Memory Size

-The amount of memory required is dependent on the graph structure and the analytics being executed. As a simple rule of thumb, the amount of GPU memory should be about twice the size of the data size. That gives overhead for the CSV reader and other transform functions. There are ways around the rule but using smaller data chunks.
-
-
-| Size | Recomended GPU Memory |
-|-------------------|-----------------------|
-| 500 million edges | 32GB |
-| 250 million edges | 16 GB |
+The amount of memory required is dependent on the graph structure and the analytics being executed. As a simple rule of thumb, the amount of GPU memory should be about twice the size of the data size. That gives overhead for the CSV reader and other transform functions. There are ways around the rule by using smaller data chunks.

+| Size | Recommended GPU Memory |
+|-------------------|------------------------|
+| 500 million edges | 32 GB |
+| 250 million edges | 16 GB |

+Managed memory for oversubscription can also be used to exceed the above memory limitations. See the recent blog on _Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs_: https://medium.com/rapids-ai/tackling-large-graphs-with-rapids-cugraph-and-unified-virtual-memory-b5b69a065d4

## Getting cuGraph

@@ -108,35 +110,33 @@ There are 3 ways to get cuGraph :
3. [Build from Source](#source)

-
-## Quick Start
+
+## Quick Start

Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph.

-
-### Conda
+### Conda

It is easy to install cuGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download).
Install and update cuGraph using the conda command:

```bash
-# CUDA 10.0
-conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=10.0
-
# CUDA 10.1
conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=10.1

# CUDA 10.2
conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=10.2
+
+# CUDA 11.0
+conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph cudatoolkit=11.0
```

-Note: This conda installation only applies to Linux and Python versions 3.6/3.7.
+Note: This conda installation only applies to Linux and Python versions 3.7/3.8.

-
-### Build from Source and Contributing
+### Build from Source and Contributing

Please see our [guide for building cuGraph from source](SOURCEBUILD.md)

@@ -153,7 +153,7 @@

Python API documentation can be generated from [docs](docs) directory.

## Open GPU Data Science

-The RAPIDS suite of open source software libraries aim to enable execution of end-to-end data science and analytics pipelines entirely on GPUs. It relies on NVIDIA® CUDA® primitives for low-level compute optimization, but exposing that GPU parallelism and high-bandwidth memory speed through user-friendly Python interfaces.
+The RAPIDS suite of open source software libraries aims to enable execution of end-to-end data science and analytics pipelines entirely on GPUs. It relies on NVIDIA® CUDA® primitives for low-level compute optimization, while exposing that GPU parallelism and high-bandwidth memory speed through user-friendly Python interfaces.
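To make the renumbering notes in the README diff above concrete, here is a minimal sketch of the auto-renumbering flow. It only uses calls that appear elsewhere in this diff (`from_cudf_edgelist` with `renumber=True`, and `cugraph.pagerank`); the sample vertex IDs are illustrative.

```python
import cudf
import cugraph

# An edge list whose vertex IDs are non-contiguous and do not start at 0
gdf = cudf.DataFrame({"src": [1001, 1003, 1005],
                      "dst": [1003, 1005, 1001]})

# renumber=True (the default) maps these IDs to contiguous 32-bit
# integers starting at 0 before the graph is built
G = cugraph.Graph()
G.from_cudf_edgelist(gdf, source="src", destination="dst", renumber=True)

# Results are automatically un-renumbered, so the 'vertex' column of the
# PageRank result contains the original IDs (1001, 1003, 1005)
df_page = cugraph.pagerank(G)
print(df_page)
```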

diff --git a/SOURCEBUILD.md b/SOURCEBUILD.md
index 23beee55f07..29aa20ad522 100644
--- a/SOURCEBUILD.md
+++ b/SOURCEBUILD.md
@@ -12,7 +12,7 @@ __Compiler__:
* `cmake` version 3.12+

__CUDA:__
-* CUDA 10.0+
+* CUDA 10.1+
* NVIDIA driver 396.44+
* Pascal architecture or better

@@ -47,8 +47,7 @@ __Create the conda development environment__

```bash
# create the conda environment (assuming in base `cugraph` directory)
-# for CUDA 10
-conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.0.yml
+
# for CUDA 10.1
conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.1.yml

@@ -56,6 +55,9 @@ conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10
# for CUDA 10.2
conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.2.yml

+# for CUDA 11
+conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.0.yml
+
# activate the environment
conda activate cugraph_dev

@@ -68,15 +70,15 @@ conda deactivate

```bash
-# for CUDA 10
-conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.0.yml
-
# for CUDA 10.1
conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.1.yml

# for CUDA 10.2
conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda10.2.yml

+# for CUDA 11
+conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.0.yml
+
conda activate cugraph_dev
```

@@ -200,7 +202,7 @@ Run either the C++ or the Python tests with datasets

make test
```

-Note: This conda installation only applies to Linux and Python versions 3.6/3.7.
+Note: This conda installation only applies to Linux and Python versions 3.7/3.8.

### Building and Testing on a gpuCI image locally

@@ -226,8 +228,8 @@ Next the env_vars.sh file needs to be edited

vi ./etc/conda/activate.d/env_vars.sh

#!/bin/bash
-export PATH=/usr/local/cuda-10.0/bin:$PATH # or cuda-10.2 if using CUDA 10.2
-export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64:$LD_LIBRARY_PATH # or cuda-10.2 if using CUDA 10.2
+export PATH=/usr/local/cuda-10.1/bin:$PATH # or cuda-10.2 if using CUDA 10.2
+export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64:$LD_LIBRARY_PATH # or cuda-10.2 if using CUDA 10.2
```

```
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 7aa581d14bb..0190b2870de 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -15,13 +15,15 @@ directory under the root of the `cuGraph` source tree.
* cugraph built and installed (or `cugraph` sources and built C++ extensions available on `PYTHONPATH`)

-* rapids-pytest-benchmark pytest plugin (`conda install -c rlratzel
+* rapids-pytest-benchmark pytest plugin (`conda install -c rapidsai
  rapids-pytest-benchmark`)
-  * NOTE: the `rlratzel` channel is temporary! This plugin will eventually be
-    moved to a more standard channel
-* specific datasets installed in /datasets (see benchmark sources in
-  this dir for details)
+* The benchmark datasets downloaded and installed in /datasets. Run the
+script below from the /datasets directory:
+```
+cd /datasets
+./get_test_data.sh --benchmark
+```

## Usage (Python)
### Python

@@ -33,6 +35,7 @@ directory under the root of the `cuGraph` source tree.
## Examples
### Python
+_**NOTE: these commands must be run from the `/benchmarks` directory.**_
* Run all the benchmarks and print their names on a separate line (`-v`), and generate a report to stdout
```
(rapids) user@machine:/cugraph/benchmarks> pytest -v
diff --git a/benchmarks/bench_algos.py b/benchmarks/bench_algos.py
index 91dc8fbb0fa..9be636ca480 100644
--- a/benchmarks/bench_algos.py
+++ b/benchmarks/bench_algos.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import pytest

import pytest_benchmark

@@ -17,6 +30,7 @@ def setFixtureParamNames(*args, **kwargs):
        pass

import cugraph
+from cugraph.structure.number_map import NumberMap
from cugraph.tests import utils

import rmm

@@ -47,12 +61,26 @@ def createGraph(csvFileName, graphType=None):
                       renumber=True)


+# Record the current RMM settings so reinitialize() will be called only when a
+# change is needed (RMM defaults both values to False). This allows the
+# --no-rmm-reinit option to prevent reinitialize() from being called at all
+# (see conftest.py for details).
+RMM_SETTINGS = {"managed_mem": False,
+                "pool_alloc": False}
+
+
def reinitRMM(managed_mem, pool_alloc):
-    rmm.reinitialize(
-        managed_memory=managed_mem,
-        pool_allocator=pool_alloc,
-        initial_pool_size=2 << 27
-    )
+
+    if (managed_mem != RMM_SETTINGS["managed_mem"]) or \
+       (pool_alloc != RMM_SETTINGS["pool_alloc"]):
+
+        rmm.reinitialize(
+            managed_memory=managed_mem,
+            pool_allocator=pool_alloc,
+            initial_pool_size=2 << 27
+        )
+        RMM_SETTINGS.update(managed_mem=managed_mem,
+                            pool_alloc=pool_alloc)


###############################################################################
@@ -78,8 +106,7 @@ def edgelistCreated(request):
    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
    csvFileName = request.param[0]
-    if len(request.param) > 1:
-        reinitRMM(request.param[1], request.param[2])
+    reinitRMM(request.param[1], request.param[2])
    return utils.read_csv_file(csvFileName)

@@ -92,8 +119,7 @@ def graphWithAdjListComputed(request):
    """
    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
    csvFileName = request.param[0]
-    if len(request.param) > 1:
-        reinitRMM(request.param[1], request.param[2])
+    reinitRMM(request.param[1], request.param[2])

    G = createGraph(csvFileName, cugraph.structure.graph.Graph)
    G.view_adj_list()

@@ -109,8 +135,7 @@ def anyGraphWithAdjListComputed(request):
    """
    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
    csvFileName = request.param[0]
-    if len(request.param) > 1:
-        reinitRMM(request.param[1], request.param[2])
+    reinitRMM(request.param[1], request.param[2])

    G = createGraph(csvFileName)
    G.view_adj_list()

@@ -126,8 +151,7 @@ def anyGraphWithTransposedAdjListComputed(request):
    """
    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
    csvFileName = request.param[0]
-    if len(request.param) > 1:
-        reinitRMM(request.param[1], request.param[2])
+    reinitRMM(request.param[1], request.param[2])

    G = createGraph(csvFileName)
    G.view_transposed_adj_list()

@@ -164,9 +188,7 @@ def bench_create_digraph(gpubenchmark, edgelistCreated):

@pytest.mark.ETL
def bench_renumber(gpubenchmark, edgelistCreated):
-    gpubenchmark(cugraph.renumber,
-                 edgelistCreated["0"],  # src
-                 edgelistCreated["1"])  # dst
+    gpubenchmark(NumberMap.renumber, edgelistCreated, "0", "1")


def bench_pagerank(gpubenchmark, anyGraphWithTransposedAdjListComputed):

@@ -233,3 +255,9 @@ def bench_graph_degrees(gpubenchmark, anyGraphWithAdjListComputed):
def bench_betweenness_centrality(gpubenchmark, anyGraphWithAdjListComputed):
    gpubenchmark(cugraph.betweenness_centrality,
                 anyGraphWithAdjListComputed, k=10, seed=123)
+
+
+def bench_edge_betweenness_centrality(gpubenchmark,
+                                      anyGraphWithAdjListComputed):
+    gpubenchmark(cugraph.edge_betweenness_centrality,
+                 anyGraphWithAdjListComputed, k=10, seed=123)
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index ea5be7212dc..8ab0c5a57b4 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -1,8 +1,4 @@
# pytest customizations specific to these benchmarks
-import sys
-from os import path
-import importlib
-

def pytest_addoption(parser):
    parser.addoption("--no-rmm-reinit", action="store_true", default=False,

@@ -11,21 +7,19 @@ def pytest_addoption(parser):


def pytest_sessionstart(session):
-    # if the --no-rmm-reinit option is given, import the benchmark's "params"
-    # module and change the FIXTURE_PARAMS accordingly.
+    # if the --no-rmm-reinit option is given, set (or add to) the CLI "mark
+    # expression" (-m) the markers for no managedmem and no poolallocator. This
+    # will cause the RMM reinit() function to not be called.
    if session.config.getoption("no_rmm_reinit"):
-        paramsPyFile = path.join(path.dirname(path.abspath(__file__)),
-                                 "params.py")
+        newMarkexpr = "managedmem_off and poolallocator_off"
+        currentMarkexpr = session.config.getoption("markexpr")

-        # A simple "import" statement will not find the modules here (unless if
-        # this package is on the import path) since pytest evaluates this from
-        # a different location.
-        spec = importlib.util.spec_from_file_location("params", paramsPyFile)
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
+        if ("managedmem" in currentMarkexpr) or \
+           ("poolallocator" in currentMarkexpr):
+            raise RuntimeError("managedmem and poolallocator markers cannot "
+                               "be used with --no-rmm-reinit")

-        module.FIXTURE_PARAMS = module.NO_RMMREINIT_FIXTURE_PARAMS
+        if currentMarkexpr:
+            newMarkexpr = f"({currentMarkexpr}) and ({newMarkexpr})"

-        # If "benchmarks.params" is registered in sys.modules, all future
-        # imports of the module will simply refer to this one.
-        sys.modules["benchmarks.params"] = module
+        session.config.option.markexpr = newMarkexpr
diff --git a/benchmarks/params.py b/benchmarks/params.py
index cab0210ba23..2d1d3ea4acc 100644
--- a/benchmarks/params.py
+++ b/benchmarks/params.py
@@ -1,3 +1,15 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from itertools import product

import pytest

@@ -58,8 +70,10 @@ def genFixtureParamsProduct(*args):
# FIXME: write and use mechanism described here for specifying datasets:
#        https://docs.rapids.ai/maintainers/datasets
-# FIXME: rlr: soc-twitter-2010.csv crashes with OOM error on my HP-Z8!
+# FIXME: rlr: soc-twitter-2010.csv crashes with OOM error on my RTX-8000
UNDIRECTED_DATASETS = [
+    pytest.param("../datasets/karate.csv",
+                 marks=[pytest.mark.tiny, pytest.mark.undirected]),
    pytest.param("../datasets/csv/undirected/hollywood.csv",
                 marks=[pytest.mark.small, pytest.mark.undirected]),
    pytest.param("../datasets/csv/undirected/europe_osm.csv",

@@ -88,16 +102,7 @@ def genFixtureParamsProduct(*args):
                 marks=[pytest.mark.poolallocator_off]),
]

-ALL_FIXTURE_PARAMS = genFixtureParamsProduct(
-    (DIRECTED_DATASETS + UNDIRECTED_DATASETS, "ds"),
-    (MANAGED_MEMORY, "mm"),
-    (POOL_ALLOCATOR, "pa"))
-
-NO_RMMREINIT_FIXTURE_PARAMS = genFixtureParamsProduct(
-    (DIRECTED_DATASETS +
-     UNDIRECTED_DATASETS, "ds"))
-
-# conftest.py will switch this to NO_RMMREINIT_FIXTURE_PARAMS
-# if the --no-rmm-reinit option is passed.
-# See conftest.py for details
-FIXTURE_PARAMS = ALL_FIXTURE_PARAMS
+FIXTURE_PARAMS = genFixtureParamsProduct(
+    (DIRECTED_DATASETS + UNDIRECTED_DATASETS, "ds"),
+    (MANAGED_MEMORY, "mm"),
+    (POOL_ALLOCATOR, "pa"))
diff --git a/benchmarks/pytest.ini b/benchmarks/pytest.ini
index fb4e43965d6..06a67a06040 100644
--- a/benchmarks/pytest.ini
+++ b/benchmarks/pytest.ini
@@ -1,9 +1,9 @@
[pytest]
addopts =
-    -x
    --benchmark-warmup=on
    --benchmark-warmup-iterations=1
    --benchmark-min-rounds=3
+    --benchmark-columns="min, max, mean, stddev, outliers, gpu_mem, rounds"

markers =
    managedmem_on: RMM managed memory enabled

@@ -12,6 +12,7 @@ markers =
    poolallocator_off: RMM pool allocator disabled
    ETL: benchmarks for ETL steps
    small: small datasets
+    tiny: tiny datasets
    directed: directed datasets
    undirected: undirected datasets
diff --git a/build.sh b/build.sh
index 94c37cf20bb..e0557344384 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ HELP="$0 [ ...] [ ...]
 default action (no args) is to build and install 'libcugraph' then 'cugraph' targets
"

-LIBCUGRAPH_BUILD_DIR=${REPODIR}/cpp/build
+LIBCUGRAPH_BUILD_DIR=${LIBCUGRAPH_BUILD_DIR:=${REPODIR}/cpp/build}
CUGRAPH_BUILD_DIR=${REPODIR}/python/build
BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR} ${CUGRAPH_BUILD_DIR}"

@@ -116,7 +116,7 @@ if (( ${NUMARGS} == 0 )) || hasArg cugraph; then
    cd ${REPODIR}/python
    if [[ ${INSTALL_TARGET} != "" ]]; then
-        python setup.py build_ext --inplace
+        python setup.py build_ext --inplace --library-dir=${LIBCUGRAPH_BUILD_DIR}
        python setup.py install
    else
        python setup.py build_ext --inplace --library-dir=${LIBCUGRAPH_BUILD_DIR}
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
new file mode 100644
index 00000000000..49a6362a904
--- /dev/null
+++ b/ci/benchmark/build.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+##########################################
+# cuGraph Benchmark test script for CI   #
+##########################################
+
+set -e
+set -o pipefail
+NUMARGS=$#
+ARGS=$*
+
+function logger {
+  echo -e "\n>>>> $@\n"
+}
+
+function hasArg {
+    (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
+}
+
+function cleanup {
+  logger "Removing datasets and temp files..."
+  rm -rf $WORKSPACE/datasets/test
+  rm -rf $WORKSPACE/datasets/benchmark
+  rm -f testoutput.txt
+}
+
+# Set cleanup trap for Jenkins
+if [ ! -z "$JENKINS_HOME" ] ; then
+  logger "Jenkins environment detected, setting cleanup trap..."
+  trap cleanup EXIT
+fi
+
+# Set path, build parallel level, and CUDA version
+cd $WORKSPACE
+export PATH=/conda/bin:/usr/local/cuda/bin:$PATH
+export PARALLEL_LEVEL=4
+export CUDA_REL=${CUDA_VERSION%.*}
+export HOME=$WORKSPACE
+export GIT_DESCRIBE_TAG=`git describe --tags`
+export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
+
+# Set Benchmark Vars
+export DATASETS_DIR=${WORKSPACE}/datasets
+export BENCHMARKS_DIR=${WORKSPACE}/benchmarks
+
+##########################################
+# Environment Setup                      #
+##########################################
+
+# TODO: Delete build section when artifacts are available
+
+logger "Check environment..."
+env
+
+logger "Check GPU usage..."
+nvidia-smi
+
+logger "Activate conda env..."
+source activate rapids
+
+# Enter dependencies to be shown in ASV tooltips.
+CUGRAPH_DEPS=(cudf rmm)
+LIBCUGRAPH_DEPS=(cudf rmm)
+
+logger "conda install required packages"
+conda install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaults \
+      "cudf=${MINOR_VERSION}" \
+      "rmm=${MINOR_VERSION}" \
+      "cudatoolkit=$CUDA_REL" \
+      "dask-cudf=${MINOR_VERSION}" \
+      "dask-cuda=${MINOR_VERSION}" \
+      "ucx-py=${MINOR_VERSION}" \
+      "rapids-build-env=${MINOR_VERSION}" \
+      rapids-pytest-benchmark
+
+# Install the master version of dask and distributed
+logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps"
+pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps
+
+logger "pip install git+https://github.com/dask/dask.git --upgrade --no-deps"
+pip install "git+https://github.com/dask/dask.git" --upgrade --no-deps
+
+logger "Check versions..."
+python --version
+$CC --version
+$CXX --version
+conda list
+
+##########################################
+# Build cuGraph                          #
+##########################################
+
+logger "Build libcugraph..."
+$WORKSPACE/build.sh clean libcugraph cugraph
+
+##########################################
+# Run Benchmarks                         #
+##########################################
+
+logger "Downloading Datasets for Benchmarks..."
+cd $DATASETS_DIR
+bash ./get_test_data.sh --benchmark
+ERRORCODE=$((ERRORCODE | $?))
+# Exit if dataset download failed
+if (( ${ERRORCODE} != 0 )); then
+    exit ${ERRORCODE}
+fi
+
+# Concatenate dependency arrays, convert to JSON array,
+# and remove duplicates.
+X=("${CUGRAPH_DEPS[@]}" "${LIBCUGRAPH_DEPS[@]}")
+DEPS=$(printf '%s\n' "${X[@]}" | jq -R . | jq -s 'unique')
+
+# Build object with k/v pairs of "dependency:version"
+DEP_VER_DICT=$(jq -n '{}')
+for DEP in $(echo "${DEPS}" | jq -r '.[]'); do
+    VER=$(conda list | grep "^${DEP}" | awk '{print $2"-"$3}')
+    DEP_VER_DICT=$(echo "${DEP_VER_DICT}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. + { ($DEP): $VER }')
+done
+
+# Pass in an array of dependencies to get a dict of "dependency:version"
+function getReqs() {
+    local DEPS_ARR=("$@")
+    local REQS="{}"
+    for DEP in "${DEPS_ARR[@]}"; do
+        VER=$(echo "${DEP_VER_DICT}" | jq -r --arg DEP "${DEP}" '.[$DEP]')
+        REQS=$(echo "${REQS}" | jq -c --arg DEP "${DEP}" --arg VER "${VER}" '. + { ($DEP): $VER }')
+    done
+
+    echo "${REQS}"
+}
+
+REQS=$(getReqs "${CUGRAPH_DEPS[@]}")
+
+BENCHMARK_META=$(jq -n \
+  --arg NODE "${ASV_LABEL}" \
+  --arg BRANCH "branch-${MINOR_VERSION}" \
+  --argjson REQS "${REQS}" '
+  {
+    "machineName": $NODE,
+    "commitBranch": $BRANCH,
+    "requirements": $REQS
+  }
+')
+
+echo "Benchmark meta:"
+echo "${BENCHMARK_META}" | jq "."
+
+logger "Running Benchmarks..."
+cd $BENCHMARKS_DIR
+set +e
+time pytest -v -m "small and managedmem_on and poolallocator_on" \
+    --benchmark-gpu-device=0 \
+    --benchmark-gpu-max-rounds=3 \
+    --benchmark-asv-output-dir="${S3_ASV_DIR}" \
+    --benchmark-asv-metadata="${BENCHMARK_META}"
+
+EXITCODE=$?
+
+# The reqs below can be passed as requirements for
+# C++ benchmarks in the future.
+# REQS=$(getReqs "${LIBCUGRAPH_DEPS[@]}")
+
+set -e
+JOBEXITCODE=0
diff --git a/ci/checks/changelog.sh b/ci/checks/changelog.sh
index 6cd869d1171..73921f6bf19 100755
--- a/ci/checks/changelog.sh
+++ b/ci/checks/changelog.sh
@@ -1,20 +1,20 @@
#!/bin/bash
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
############################
# cuGraph CHANGELOG Tester #
############################

-# Checkout master for comparison
-git checkout --quiet master
+# Checkout main for comparison
+git checkout --force --quiet main

# Switch back to tip of PR branch
-git checkout --quiet current-pr-branch
+git checkout --force --quiet current-pr-branch

# Ignore errors during searching
set +e

# Get list of modified files between matster and PR branch
-CHANGELOG=`git diff --name-only master...current-pr-branch | grep CHANGELOG.md`
+CHANGELOG=`git diff --name-only main...current-pr-branch | grep CHANGELOG.md`

# Check if CHANGELOG has PR ID
PRNUM=`cat CHANGELOG.md | grep "$PR_ID"`
RETVAL=0
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
new file mode 100644
index 00000000000..cb7f6d1d360
--- /dev/null
+++ b/ci/checks/copyright.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import re
+import argparse
+import io
+import os
+import git_helpers
+
+FilesToCheck = [
+    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
+    re.compile(r"CMakeLists[.]txt$"),
+    re.compile(r"CMakeLists_standalone[.]txt$"),
+    re.compile(r"setup[.]cfg$"),
+    re.compile(r"[.]flake8[.]cython$"),
+    re.compile(r"meta[.]yaml$")
+]
+
+# this will break starting at year 10000, which is probably OK :)
+CheckSimple = re.compile(r"Copyright \(c\) (\d{4}), NVIDIA CORPORATION")
+CheckDouble = re.compile(
+    r"Copyright \(c\) (\d{4})-(\d{4}), NVIDIA CORPORATION")
+
+
+def checkThisFile(f):
+    # This check covers things like symlinks which point to files that DNE
+    if not(os.path.exists(f)):
+        return False
+    if git_helpers and git_helpers.isFileEmpty(f):
+        return False
+    for checker in FilesToCheck:
+        if checker.search(f):
+            return True
+    return False
+
+
+def getCopyrightYears(line):
+    res = CheckSimple.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(1)))
+    res = CheckDouble.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(2)))
+    return (None, None)
+
+
+def replaceCurrentYear(line, start, end):
+    # first turn a simple regex into double (if applicable). then update years
+    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
+    res = CheckDouble.sub(
+        r"Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, end),
+        res)
+    return res
+
+
+def checkCopyright(f, update_current_year):
+    """
+    Checks for copyright headers and their years
+    """
+    errs = []
+    thisYear = datetime.datetime.now().year
+    lineNum = 0
+    crFound = False
+    yearMatched = False
+    with io.open(f, "r", encoding="utf-8") as fp:
+        lines = fp.readlines()
+    for line in lines:
+        lineNum += 1
+        start, end = getCopyrightYears(line)
+        if start is None:
+            continue
+        crFound = True
+        if start > end:
+            e = [f, lineNum, "First year after second year in the copyright "
+                 "header (manual fix required)", None]
+            errs.append(e)
+        if thisYear < start or thisYear > end:
+            e = [f, lineNum, "Current year not included in the "
+                 "copyright header", None]
+            if thisYear < start:
+                e[-1] = replaceCurrentYear(line, thisYear, end)
+            if thisYear > end:
+                e[-1] = replaceCurrentYear(line, start, thisYear)
+            errs.append(e)
+        else:
+            yearMatched = True
+    fp.close()
+    # copyright header itself not found
+    if not crFound:
+        e = [f, 0, "Copyright header missing or formatted incorrectly "
+             "(manual fix required)", None]
+        errs.append(e)
+    # even if the year matches a copyright header, make the check pass
+    if yearMatched:
+        errs = []
+
+    if update_current_year:
+        errs_update = [x for x in errs if x[-1] is not None]
+        if len(errs_update) > 0:
+            print("File: {}. Changing line(s) {}".format(
+                f, ', '.join(str(x[1]) for x in errs if x[-1] is not None)))
+            for _, lineNum, __, replacement in errs_update:
+                lines[lineNum - 1] = replacement
+            with io.open(f, "w", encoding="utf-8") as out_file:
+                for new_line in lines:
+                    out_file.write(new_line)
+        errs = [x for x in errs if x[-1] is None]
+
+    return errs
+
+
+def getAllFilesUnderDir(root, pathFilter=None):
+    retList = []
+    for (dirpath, dirnames, filenames) in os.walk(root):
+        for fn in filenames:
+            filePath = os.path.join(dirpath, fn)
+            if pathFilter(filePath):
+                retList.append(filePath)
+    return retList
+
+
+def checkCopyright_main():
+    """
+    Checks for copyright headers in all the modified files. In case of local
+    repo, this script will just look for uncommitted files and in case of CI
+    it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch"
+    """
+    retVal = 0
+
+    argparser = argparse.ArgumentParser(
+        description="Checks for a consistent copyright header")
+    argparser.add_argument("--update-current-year", dest='update_current_year',
+                           action="store_true", required=False, help="If set, "
+                           "update the current year if a header is already "
+                           "present and well formatted.")
+    argparser.add_argument("--git-modified-only", dest='git_modified_only',
+                           action="store_true", required=False, help="If set, "
+                           "only files seen as modified by git will be "
+                           "processed.")
+
+    (args, dirs) = argparser.parse_known_args()
+    if args.git_modified_only:
+        files = git_helpers.modifiedFiles(pathFilter=checkThisFile)
+    else:
+        files = []
+        for d in [os.path.abspath(d) for d in dirs]:
+            if not(os.path.isdir(d)):
+                raise ValueError(f"{d} is not a directory.")
+            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
+
+    errors = []
+    for f in files:
+        errors += checkCopyright(f, args.update_current_year)
+
+    if len(errors) > 0:
+        print("Copyright headers incomplete in some of the files!")
+        for e in errors:
+            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
+        print("")
+        n_fixable = sum(1 for e in errors if e[-1] is not None)
+        path_parts = os.path.abspath(__file__).split(os.sep)
+        file_from_repo = os.sep.join(path_parts[path_parts.index("ci"):])
+        if n_fixable > 0:
+            print("You can run {} --update-current-year to fix {} of these "
+                  "errors.\n".format(file_from_repo, n_fixable))
+        retVal = 1
+    else:
+        print("Copyright check passed")
+
+    return retVal
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(checkCopyright_main())
diff --git a/ci/checks/style.sh b/ci/checks/style.sh
index fa933e41410..696f566a96a 100755
--- a/ci/checks/style.sh
+++ b/ci/checks/style.sh
@@ -1,11 +1,17 @@
#!/bin/bash
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
########################
# cuGraph Style Tester #
########################

-# Ignore errors and set path
-set +e
+# Assume this script is run from the root of the cugraph repo
+
+# Make failing commands visible when used in a pipeline and allow the script to
+# continue on errors, but use ERRORCODE to still allow any failing command to be
+# captured for returning a final status code. This allows all style checks to
+# take place to provide a more comprehensive list of style violations.
+set -o pipefail
+ERRORCODE=0
PATH=/conda/bin:$PATH

# Activate common conda env
source activate gdf

@@ -13,11 +19,12 @@ source activate gdf
# Run flake8 and get results/return code
FLAKE=`flake8 --config=python/.flake8 python`
-FLAKE_RETVAL=$?
+ERRORCODE=$((ERRORCODE | $?))

# Run clang-format and check for a consistent code format
CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1`
CLANG_FORMAT_RETVAL=$?
+ERRORCODE=$((ERRORCODE | ${CLANG_FORMAT_RETVAL}))

# Output results if failure otherwise show pass
if [ "$FLAKE" != "" ]; then

@@ -36,8 +43,19 @@
else
    echo -e "\n\n>>>> PASSED: clang format check\n\n"
fi

-RETVALS=($FLAKE_RETVAL $CLANG_FORMAT_RETVAL)
-IFS=$'\n'
-RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1`
+# Check for copyright headers in the files modified currently
+#COPYRIGHT=`env PYTHONPATH=ci/utils python ci/checks/copyright.py cpp python benchmarks ci 2>&1`
+COPYRIGHT=`env PYTHONPATH=ci/utils python ci/checks/copyright.py --git-modified-only 2>&1`
+CR_RETVAL=$?
+ERRORCODE=$((ERRORCODE | ${CR_RETVAL})) + +# Output results if failure otherwise show pass +if [ "$CR_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" + echo -e "$COPYRIGHT" + echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: copyright check\n\n" +fi -exit $RETVAL +exit ${ERRORCODE} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index dfbbbffc73b..2cdb77bbbc2 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. ######################################### # cuGraph CPU conda build script for CI # ######################################### @@ -20,10 +20,6 @@ export HOME=$WORKSPACE # Switch to project root; also root of repo checkout cd $WORKSPACE -# Get latest tag and number of commits since tag -export GIT_DESCRIBE_TAG=`git describe --abbrev=0 --tags` -export GIT_DESCRIBE_NUMBER=`git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count` - # If nightly build, append current YYMMDD to version if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then export VERSION_SUFFIX=`date +%y%m%d` diff --git a/ci/cpu/cugraph/build_cugraph.sh b/ci/cpu/cugraph/build_cugraph.sh index 874488ff020..70f5baee230 100755 --- a/ci/cpu/cugraph/build_cugraph.sh +++ b/ci/cpu/cugraph/build_cugraph.sh @@ -1,9 +1,25 @@ #!/usr/bin/env bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e if [ "$BUILD_CUGRAPH" == "1" ]; then echo "Building cugraph" CUDA_REL=${CUDA_VERSION%.*} - - conda build conda/recipes/cugraph --python=$PYTHON + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + conda build conda/recipes/cugraph --python=$PYTHON + else + conda build conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON + fi fi diff --git a/ci/cpu/cugraph/upload-anaconda.sh b/ci/cpu/cugraph/upload-anaconda.sh index e729972cf43..9601905d6c4 100755 --- a/ci/cpu/cugraph/upload-anaconda.sh +++ b/ci/cpu/cugraph/upload-anaconda.sh @@ -1,13 +1,22 @@ #!/bin/bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
set -e -if [ "$UPLOAD_CUGRAPH" == "1" ]; then +if [[ "$BUILD_CUGRAPH" == "1" && "$UPLOAD_CUGRAPH" == "1" ]]; then export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output` - SOURCE_BRANCH=master # Have to label all CUDA versions due to the compatibility to work with any CUDA if [ "$LABEL_MAIN" == "1" ]; then @@ -22,8 +31,7 @@ if [ "$UPLOAD_CUGRAPH" == "1" ]; then test -e ${UPLOADFILE} - # Restrict uploads to master branch - if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then + if [ ${BUILD_MODE} != "branch" ]; then echo "Skipping upload" return 0 fi diff --git a/ci/cpu/libcugraph/build_libcugraph.sh b/ci/cpu/libcugraph/build_libcugraph.sh index b728c130d0e..e5ff77d7db9 100755 --- a/ci/cpu/libcugraph/build_libcugraph.sh +++ b/ci/cpu/libcugraph/build_libcugraph.sh @@ -1,9 +1,25 @@ #!/usr/bin/env bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e if [ "$BUILD_LIBCUGRAPH" == '1' ]; then echo "Building libcugraph" CUDA_REL=${CUDA_VERSION%.*} - - conda build conda/recipes/libcugraph + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + conda build conda/recipes/libcugraph + else + conda build --dirty --no-remove-work-dir conda/recipes/libcugraph + fi fi diff --git a/ci/cpu/libcugraph/upload-anaconda.sh b/ci/cpu/libcugraph/upload-anaconda.sh index 11316dc5b1f..8cd71070778 100755 --- a/ci/cpu/libcugraph/upload-anaconda.sh +++ b/ci/cpu/libcugraph/upload-anaconda.sh @@ -1,23 +1,31 @@ #!/bin/bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. set -e -if [ "$UPLOAD_LIBCUGRAPH" == "1" ]; then +if [[ "$BUILD_LIBCUGRAPH" == "1" && "$UPLOAD_LIBCUGRAPH" == "1" ]]; then CUDA_REL=${CUDA_VERSION%.*} export UPLOADFILE=`conda build conda/recipes/libcugraph --output` - SOURCE_BRANCH=master LABEL_OPTION="--label main" echo "LABEL_OPTION=${LABEL_OPTION}" test -e ${UPLOADFILE} - # Restrict uploads to master branch - if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then + if [ ${BUILD_MODE} != "branch" ]; then echo "Skipping upload" return 0 fi diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 2abc137662c..ee471329b35 100644 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -1,15 +1,30 @@ #!/usr/bin/env bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
-export BUILD_CUGRAPH=1
-export BUILD_LIBCUGRAPH=1
+if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
+  # If Project Flash is not activated, always build both
+  export BUILD_CUGRAPH=1
+  export BUILD_LIBCUGRAPH=1
+fi
 
-if [[ "$CUDA" == "10.0" ]]; then
+if [[ "$CUDA" == "10.1" ]]; then
   export UPLOAD_CUGRAPH=1
 else
   export UPLOAD_CUGRAPH=0
 fi
 
-if [[ "$PYTHON" == "3.6" ]]; then
+if [[ "$PYTHON" == "3.7" ]]; then
   export UPLOAD_LIBCUGRAPH=1
 else
   export UPLOAD_LIBCUGRAPH=0
diff --git a/ci/docs/build.sh b/ci/docs/build.sh
index 1bf8b6b569a..71ad79419a0 100644
--- a/ci/docs/build.sh
+++ b/ci/docs/build.sh
@@ -61,15 +61,3 @@ done
 mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcugraph/$BRANCH_VERSION
 mv $PROJECT_WORKSPACE/docs/build/html/* $DOCS_WORKSPACE/api/cugraph/$BRANCH_VERSION
 
-# Customize HTML documentation
-./update_symlinks.sh $NIGHTLY_VERSION
-./customization/lib_map.sh
-
-
-for PROJECT in ${PROJECTS[@]}; do
-  echo ""
-  echo "Customizing: $PROJECT"
-  ./customization/customize_docs_in_folder.sh api/$PROJECT/ $NIGHTLY_VERSION
-  git add $DOCS_WORKSPACE/api/$PROJECT/*
-done
-
diff --git a/ci/getGTestTimes.sh b/ci/getGTestTimes.sh
index b2c3c7718e0..8a3752d76e2 100755
--- a/ci/getGTestTimes.sh
+++ b/ci/getGTestTimes.sh
@@ -1,4 +1,16 @@
 #!/bin/bash
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 # This script will print the gtest results sorted by runtime. This will print
 # the results two ways: first by printing all tests sorted by runtime, then by
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 78c020375d9..3cef2e56877 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
########################################## # cuGraph GPU build & testscript for CI # ########################################## @@ -57,21 +57,19 @@ source activate gdf logger "conda install required packages" conda install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaults \ - cudf=${MINOR_VERSION} \ - rmm=${MINOR_VERSION} \ - networkx>=2.3 \ - python-louvain \ - cudatoolkit=$CUDA_REL \ - dask>=2.12.0 \ - distributed>=2.12.0 \ - dask-cudf=${MINOR_VERSION} \ - dask-cuda=${MINOR_VERSION} \ - scikit-learn=0.23.0 \ - nccl>=2.5 \ - ucx-py=${MINOR_VERSION} \ - libcypher-parser \ - ipython=7.3* \ - jupyterlab + "cudf=${MINOR_VERSION}" \ + "rmm=${MINOR_VERSION}" \ + "cudatoolkit=$CUDA_REL" \ + "dask-cudf=${MINOR_VERSION}" \ + "dask-cuda=${MINOR_VERSION}" \ + "ucx-py=${MINOR_VERSION}" \ + "rapids-build-env=$MINOR_VERSION.*" \ + "rapids-notebook-env=$MINOR_VERSION.*" \ + rapids-pytest-benchmark + +# https://docs.rapids.ai/maintainers/depmgmt/ +# conda remove --force rapids-build-env rapids-notebook-env +# conda install "your-pkg=1.0.0" # Install the master version of dask and distributed logger "pip install git+https://github.com/dask/distributed.git --upgrade --no-deps" @@ -91,8 +89,10 @@ conda list # BUILD - Build libcugraph and cuGraph from source ################################################################################ -logger "Build libcugraph..." -$WORKSPACE/build.sh clean libcugraph cugraph +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + logger "Build libcugraph..." + $WORKSPACE/build.sh clean libcugraph cugraph +fi ################################################################################ # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index 491458df5ce..247eb328d2e 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -1,4 +1,16 @@ #!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. #RAPIDS_DIR=/rapids NOTEBOOKS_DIR=${WORKSPACE}/notebooks @@ -11,7 +23,7 @@ TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) # Add notebooks that should be skipped here # (space-separated list of filenames without paths) -SKIPNBS="uvm.ipynb" +SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb" ## Check env env diff --git a/ci/local/README.md b/ci/local/README.md index c20a073e833..28bbe3590ea 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -25,7 +25,7 @@ where: Example Usage: `bash build.sh -r ~/rapids/cugraph -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6` -For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page. +For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. 
Style Check: ```bash @@ -51,6 +51,7 @@ The docker image will generate build artifacts in a folder on your machine locat The script will build your repository and run all tests. If any tests fail, it dumps the user into the docker container itself to allow you to debug from within the container. If all the tests pass as expected the container exits and is automatically removed. Remember to exit the container if tests fail and you do not wish to debug within the container itself. +If you would like to rerun the tests after changing some code in the container, run `bash ci/gpu/build.sh`. ### Container File Structure diff --git a/ci/local/build.sh b/ci/local/build.sh index c6f7f1a51e2..51b9380a311 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -1,6 +1,21 @@ #!/bin/bash - -DOCKER_IMAGE="gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6" +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +GIT_DESCRIBE_TAG=`git describe --tags` +MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" @@ -139,4 +154,4 @@ docker run --rm -it ${GPU_OPTS} \ -v "$PASSWD_FILE":/etc/passwd:ro \ -v "$GROUP_FILE":/etc/group:ro \ --cap-add=SYS_PTRACE \ - "${DOCKER_IMAGE}" bash -c "${COMMAND}" \ No newline at end of file + "${DOCKER_IMAGE}" bash -c "${COMMAND}" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b9faa5cbf1f..d853c3693c6 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,7 +1,16 @@ #!/bin/bash -######################## -# RMM Version Updater # -######################## +# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
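The version updater below derives the current version components from the newest `vMAJOR.MINOR.PATCH` git tag and computes the next tags before rewriting files. The same derivation in Python, as a sketch (the tag literal stands in for the `git tag | sort --version-sort | tail -n 1` lookup used by the script):

```python
import re

# Sketch of the tag parsing done by ci/release/update-version.sh; assumes the
# most recent tag looks like "v0.15.0" (illustrative value).
tag = "v0.15.0"
major, minor, patch = (int(x) for x in
                       re.match(r"v(\d+)\.(\d+)\.(\d+)", tag).groups())

current_short_tag = f"{major}.{minor}"    # e.g. "0.15", used in sed patterns
next_short_tag = f"{major}.{minor + 1}"   # e.g. "0.16"
next_full_tag = f"{major}.{minor + 1}.0"  # e.g. "0.16.0"
print(current_short_tag, next_short_tag, next_full_tag)
```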
## Usage # bash update-version.sh @@ -17,6 +26,7 @@ CURRENT_TAG=`git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | CURRENT_MAJOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}'` CURRENT_MINOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}'` CURRENT_PATCH=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}'` +CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$((CURRENT_MAJOR + 1)) NEXT_MINOR=$((CURRENT_MINOR + 1)) NEXT_PATCH=$((CURRENT_PATCH + 1)) @@ -51,3 +61,11 @@ sed_runner 's/'"CUGRAPH VERSION .* LANGUAGES C CXX CUDA)"'/'"CUGRAPH VERSION ${N # RTD update sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py + +for FILE in conda/environments/*.yml; do + sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/ucx-py=${CURRENT_SHORT_TAG}/ucx-py=${NEXT_SHORT_TAG}/g" ${FILE}; +done diff --git a/ci/test.sh b/ci/test.sh index 37ec2fcc956..fde9bbb3d8d 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,4 +1,16 @@ #!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # note: do not use set -e in order to allow all gtest invocations to take place, # and instead keep track of exit status and exit with an overall exit status @@ -45,7 +57,12 @@ else fi fi -cd ${CUGRAPH_ROOT}/cpp/build +if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then + cd ${CUGRAPH_ROOT}/cpp/build +else + export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$LD_LIBRARY_PATH" + cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build +fi for gt in gtests/*; do test_name=$(basename $gt) @@ -54,9 +71,22 @@ for gt in gtests/*; do ERRORCODE=$((ERRORCODE | $?)) done -echo "Python py.test for cuGraph..." +if [[ "$PROJECT_FLASH" == "1" ]]; then + echo "Installing libcugraph..." + conda install -c $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ libcugraph + export LIBCUGRAPH_BUILD_DIR="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build" + echo "Build cugraph..." + $WORKSPACE/build.sh cugraph +fi + +echo "Python pytest for cuGraph..." cd ${CUGRAPH_ROOT}/python -py.test --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term +pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=cugraph/raft +ERRORCODE=$((ERRORCODE | $?)) + +echo "Python benchmarks for cuGraph (running as tests)..." 
+cd ${CUGRAPH_ROOT}/benchmarks
+pytest -v -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable
 ERRORCODE=$((ERRORCODE | $?))
 
 exit ${ERRORCODE}
diff --git a/ci/utils/git_helpers.py b/ci/utils/git_helpers.py
new file mode 100644
index 00000000000..83ad73fe283
--- /dev/null
+++ b/ci/utils/git_helpers.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import subprocess
+import os
+import re
+
+
+def isFileEmpty(f):
+    return os.stat(f).st_size == 0
+
+
+def __git(*opts):
+    """Runs a git command and returns its output"""
+    cmd = "git " + " ".join(list(opts))
+    ret = subprocess.check_output(cmd, shell=True)
+    return ret.decode("UTF-8")
+
+
+def __gitdiff(*opts):
+    """Runs a git diff command with no pager set"""
+    return __git("--no-pager", "diff", *opts)
+
+
+def branch():
+    """Returns the name of the current branch"""
+    name = __git("rev-parse", "--abbrev-ref", "HEAD")
+    name = name.rstrip()
+    return name
+
+
+def uncommittedFiles():
+    """
+    Returns a list of all changed files that are not yet committed. This
+    includes both staged and unstaged modifications; untracked files are
+    ignored.
+    """
+    files = __git("status", "-u", "-s")
+    ret = []
+    for f in files.splitlines():
+        f = f.strip(" ")
+        f = re.sub(r"\s+", " ", f)
+        tmp = f.split(" ", 1)
+        # only consider staged files or uncommitted files
+        # in other words, ignore untracked files
+        if tmp[0] == "M" or tmp[0] == "A":
+            ret.append(tmp[1])
+    return ret
+
+
+def changedFilesBetween(b1, b2):
+    """Returns a list of files changed between branches b1 and b2"""
+    current = branch()
+    __git("checkout", "--quiet", b1)
+    __git("checkout", "--quiet", b2)
+    files = __gitdiff("--name-only", "--ignore-submodules", "%s...%s" %
+                      (b1, b2))
+    __git("checkout", "--quiet", current)
+    return files.splitlines()
+
+
+def changesInFileBetween(file, b1, b2, pathFilter=None):
+    """Filters the changed lines to a file between the branches b1 and b2"""
+    current = branch()
+    __git("checkout", "--quiet", b1)
+    __git("checkout", "--quiet", b2)
+    diffs = __gitdiff("--ignore-submodules", "-w", "--minimal", "-U0",
+                      "%s...%s" % (b1, b2), "--", file)
+    __git("checkout", "--quiet", current)
+    lines = []
+    for line in diffs.splitlines():
+        if pathFilter is None or pathFilter(line):
+            lines.append(line)
+    return lines
+
+
+def modifiedFiles(pathFilter=None):
+    """
+    If inside a CI env (i.e. currentBranch=current-pr-branch and the env-var
+    PR_TARGET_BRANCH is defined), then lists out all files modified between
+    these 2 branches. Else, lists out all the uncommitted files in the current
+    branch.
+
+    Such a utility function is helpful when running checker scripts as part of
+    cmake as well as the CI process. This way, during development, only the
+    files touched (but not yet committed) by devs are checked, while during
+    the CI process ALL files modified by the dev, as submitted in the PR, are
+    checked, all using the same script.
+ """ + if "PR_TARGET_BRANCH" in os.environ and branch() == "current-pr-branch": + allFiles = changedFilesBetween(os.environ["PR_TARGET_BRANCH"], + branch()) + else: + allFiles = uncommittedFiles() + files = [] + for f in allFiles: + if pathFilter is None or pathFilter(f): + files.append(f) + return files + + +def listAllFilesInDir(folder): + """Utility function to list all files/subdirs in the input folder""" + allFiles = [] + for root, dirs, files in os.walk(folder): + for name in files: + allFiles.append(os.path.join(root, name)) + return allFiles + + +def listFilesToCheck(filesDirs, pathFilter=None): + """ + Utility function to filter the input list of files/dirs based on the input + pathFilter method and returns all the files that need to be checked + """ + allFiles = [] + for f in filesDirs: + if os.path.isfile(f): + if pathFilter is None or pathFilter(f): + allFiles.append(f) + elif os.path.isdir(f): + files = listAllFilesInDir(f) + for f_ in files: + if pathFilter is None or pathFilter(f_): + allFiles.append(f_) + return allFiles diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index f7b9774c6fd..8c86baeaa09 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -1,4 +1,16 @@ #!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. MAGIC_OVERRIDE_CODE=" def my_run_line_magic(*args, **kwargs): diff --git a/ci/utils/nbtestlog2junitxml.py b/ci/utils/nbtestlog2junitxml.py index 15b362e4b70..e9712253b0e 100644 --- a/ci/utils/nbtestlog2junitxml.py +++ b/ci/utils/nbtestlog2junitxml.py @@ -1,3 +1,16 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Generate a junit-xml file from parsing a nbtest log import re diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000000..c0a3a2fba2b --- /dev/null +++ b/codecov.yml @@ -0,0 +1,5 @@ +#Configuration File for CodeCov +coverage: + status: + project: off + patch: off diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml index 40e4da01244..eb987f326c8 100644 --- a/conda/environments/cugraph_dev_cuda10.1.yml +++ b/conda/environments/cugraph_dev_cuda10.1.yml @@ -5,21 +5,22 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.14.* -- nvstrings=0.14.* -- rmm=0.14.* +- cudf=0.15.* +- rmm=0.15.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.14* -- dask-cudf=0.14* +- dask-cuda=0.15* +- dask-cudf=0.15* - nccl>=2.5 -- ucx-py=0.14* +- ucx-py=0.15* - scipy - networkx - python-louvain - cudatoolkit=10.1 +- clang=8.0.1 +- clang-tools=8.0.1 - cmake>=3.12 -- python>=3.6,<3.8 +- python>=3.6,<3.9 - notebook>=0.5.0 - boost - cython>=0.29,<0.30 @@ -35,3 +36,4 @@ dependencies: - recommonmark - pip - libcypher-parser +- rapids-pytest-benchmark diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml index 6625d6c711c..028e0fce1a4 100644 --- a/conda/environments/cugraph_dev_cuda10.2.yml +++ b/conda/environments/cugraph_dev_cuda10.2.yml @@ -5,21 +5,22 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.14.* -- nvstrings=0.14.* -- rmm=0.14.* +- cudf=0.15.* +- rmm=0.15.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.14* -- dask-cudf=0.14* +- dask-cuda=0.15* +- dask-cudf=0.15* - nccl>=2.5 -- ucx-py=0.14* +- ucx-py=0.15* - scipy - networkx - python-louvain - cudatoolkit=10.2 +- clang=8.0.1 +- clang-tools=8.0.1 - cmake>=3.12 -- python>=3.6,<3.8 +- python>=3.6,<3.9 - notebook>=0.5.0 - boost - cython>=0.29,<0.30 @@ -35,3 +36,4 @@ dependencies: - recommonmark - pip - libcypher-parser +- rapids-pytest-benchmark diff --git a/conda/environments/cugraph_dev_cuda10.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml similarity index 70% rename from conda/environments/cugraph_dev_cuda10.0.yml rename to conda/environments/cugraph_dev_cuda11.0.yml index 83e98d90437..bc3b84badf2 100644 --- a/conda/environments/cugraph_dev_cuda10.0.yml +++ b/conda/environments/cugraph_dev_cuda11.0.yml @@ -5,21 +5,22 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.14.* -- nvstrings=0.14.* -- rmm=0.14.* +- cudf=0.15.* +- rmm=0.15.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.14* -- dask-cudf=0.14* +- dask-cuda=0.15* +- dask-cudf=0.15* - nccl>=2.5 -- ucx-py=0.14* +- ucx-py=0.15* - scipy - networkx - python-louvain -- cudatoolkit=10.0 +- cudatoolkit=11.0 +- clang=8.0.1 +- clang-tools=8.0.1 - cmake>=3.12 -- python>=3.6,<3.8 +- python>=3.6,<3.9 - notebook>=0.5.0 - boost - cython>=0.29,<0.30 @@ -35,3 +36,4 @@ dependencies: - recommonmark - pip - libcypher-parser +- rapids-pytest-benchmark diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 4be2ef4014d..1a32fd2a4b1 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -4,7 +4,6 @@ # conda build -c nvidia -c rapidsai -c conda-forge -c defaults . {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %} {% set py_version=environ.get('CONDA_PY', 36) %} package: name: cugraph @@ -14,8 +13,8 @@ source: path: ../../.. build: - number: {{ git_revision_count }} - string: py{{ py_version }}_{{ git_revision_count }} + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - CC - CXX diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index 2d0f81dd27a..22731102110 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -4,18 +4,17 @@ # conda build -c nvidia -c rapidsai -c conda-forge -c defaults . {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %} {% set cuda_version='.'.join(environ.get('CUDA', '9.2').split('.')[:2]) %} package: name: libcugraph version: {{ version }} source: - path: ../../.. + git_url: ../../.. build: - number: {{ git_revision_count }} - string: cuda{{ cuda_version }}_{{ git_revision_count }} + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - CC - CXX diff --git a/conda_build.sh b/conda_build.sh index 14e3fae1e1f..4643e302f5c 100755 --- a/conda_build.sh +++ b/conda_build.sh @@ -8,7 +8,7 @@ conda build -c nvidia -c rapidsai -c rapidsai-nightly/label/cuda${CUDA_REL} -c c if [ "$UPLOAD_PACKAGE" == '1' ]; then export UPLOADFILE=`conda build -c nvidia -c rapidsai -c conda-forge -c defaults --python=${PYTHON} conda/recipes/cugraph --output` - SOURCE_BRANCH=master + SOURCE_BRANCH=main test -e ${UPLOADFILE} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d948b27a939..70d7edf99a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,6 +1,5 @@ #============================================================================= -# Copyright 2018 BlazingDB, Inc. -# Copyright 2018 Percy Camilo Triveño Aucahuasi +# Copyright (c) 2018-2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,7 +16,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(CUGRAPH VERSION 0.14.0 LANGUAGES C CXX CUDA) +project(CUGRAPH VERSION 0.15.0 LANGUAGES C CXX CUDA) ################################################################################################### # - build type ------------------------------------------------------------------------------------ @@ -104,13 +103,6 @@ set(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags") option(BUILD_TESTS "Configure CMake to build tests" ON) -option(BUILD_MPI "Build with MPI" OFF) -if (BUILD_MPI) - find_package(MPI REQUIRED) - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_C_COMPILE_FLAGS}") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MPI_CXX_COMPILE_FLAGS}") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_CXX_LINK_FLAGS}") -endif(BUILD_MPI) ################################################################################################### # - cmake modules --------------------------------------------------------------------------------- @@ -194,24 +186,52 @@ if (RMM_INCLUDE AND RMM_LIBRARY) endif (RMM_INCLUDE AND RMM_LIBRARY) ################################################################################################### -# - External Projects ----------------------------------------------------------------------------- - -# https://cmake.org/cmake/help/v3.0/module/ExternalProject.html -include(ExternalProject) +# - Fetch Content ----------------------------------------------------------------------------- +include(FetchContent) # - CUB -set(CUB_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub CACHE STRING "Path to cub repo") -set(CUB_INCLUDE_DIR ${CUB_DIR}/src/cub CACHE STRING "Path to cub includes") +message("Fetching CUB") -ExternalProject_Add(cub - GIT_REPOSITORY https://github.com/NVlabs/cub.git - GIT_TAG v1.8.0 - PREFIX ${CUB_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" +FetchContent_Declare( + cub + GIT_REPOSITORY https://github.com/thrust/cub.git + GIT_TAG 1.9.10 + GIT_SHALLOW true ) +FetchContent_GetProperties(cub) +if(NOT cub_POPULATED) + FetchContent_Populate(cub) + # We are not using the cub CMake targets, so no need to call `add_subdirectory()`. +endif() +set(CUB_INCLUDE_DIR "${cub_SOURCE_DIR}") + +# - THRUST +message("Fetching Thrust") + +FetchContent_Declare( + thrust + GIT_REPOSITORY https://github.com/thrust/thrust.git + GIT_TAG 1.9.10 + GIT_SHALLOW true +) + +FetchContent_GetProperties(thrust) +if(NOT thrust_POPULATED) + FetchContent_Populate(thrust) + # We are not using the thrust CMake targets, so no need to call `add_subdirectory()`. 
+endif() +set(THRUST_INCLUDE_DIR "${thrust_SOURCE_DIR}") + + + + +################################################################################################### +# - External Projects ----------------------------------------------------------------------------- + +# https://cmake.org/cmake/help/v3.0/module/ExternalProject.html +include(ExternalProject) + # - CUHORNET set(CUHORNET_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuhornet CACHE STRING "Path to cuhornet repo") set(CUHORNET_INCLUDE_DIR ${CUHORNET_DIR}/src/cuhornet CACHE STRING "Path to cuhornet includes") @@ -219,7 +239,7 @@ set(CUHORNET_INCLUDE_DIR ${CUHORNET_DIR}/src/cuhornet CACHE STRING "Path to cuho ExternalProject_Add(cuhornet GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git - GIT_TAG master + GIT_TAG main PREFIX ${CUHORNET_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -232,12 +252,18 @@ set(CUGUNROCK_DIR ${CMAKE_CURRENT_BINARY_DIR}/cugunrock CACHE STRING ExternalProject_Add(cugunrock GIT_REPOSITORY https://github.com/rapidsai/cugunrock.git - GIT_TAG fea_full_bc # provide a branch, a tag, or even a commit hash + GIT_TAG main PREFIX ${CUGUNROCK_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= -DGPU_ARCHS="" -DGUNROCK_BUILD_SHARED_LIBS=OFF -DGUNROCK_BUILD_TESTS=OFF + -DCUDA_AUTODETECT_GENCODE=FALSE + -DGUNROCK_GENCODE_SM60=TRUE + -DGUNROCK_GENCODE_SM61=TRUE + -DGUNROCK_GENCODE_SM70=TRUE + -DGUNROCK_GENCODE_SM72=TRUE + -DGUNROCK_GENCODE_SM75=TRUE BUILD_BYPRODUCTS ${CUGUNROCK_DIR}/lib/libgunrock.a ) @@ -263,7 +289,7 @@ endif(NOT NCCL_PATH) if(DEFINED ENV{RAFT_PATH}) message(STATUS "RAFT_PATH environment variable detected.") message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}") - set(RAFT_DIR ENV{RAFT_PATH}) + set(RAFT_DIR "$ENV{RAFT_PATH}") ExternalProject_Add(raft DOWNLOAD_COMMAND "" @@ -278,14 +304,14 @@ else(DEFINED ENV{RAFT_PATH}) ExternalProject_Add(raft GIT_REPOSITORY https://github.com/rapidsai/raft.git - GIT_TAG e003de27fc4e4a096337f184dddbd7942a68bb5c + GIT_TAG 099e2b874b05555a78bed1666fa2d22f784e56a7 PREFIX ${RAFT_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "") # Redefining RAFT_DIR so it coincides with the one inferred by env variable. 
- set(RAFT_DIR ${RAFT_DIR}/src/raft/ CACHE STRING "Path to RAFT repo") + set(RAFT_DIR "${RAFT_DIR}/src/raft/") endif(DEFINED ENV{RAFT_PATH}) @@ -301,13 +327,14 @@ link_directories( "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}") add_library(cugraph SHARED - src/comms/mpi/comms_mpi.cpp src/db/db_object.cu src/db/db_parser_integration_test.cu src/db/db_operators.cu - src/utilities/cusparse_helper.cu + src/utilities/spmv_1D.cu src/structure/graph.cu src/link_analysis/pagerank.cu + src/link_analysis/pagerank_1D.cu + src/link_analysis/gunrock_hits.cpp src/traversal/bfs.cu src/traversal/sssp.cu src/link_prediction/jaccard.cu @@ -318,25 +345,17 @@ add_library(cugraph SHARED src/community/spectral_clustering.cu src/community/louvain.cpp src/community/louvain_kernels.cu + src/community/leiden.cpp + src/community/leiden_kernels.cu src/community/ktruss.cu src/community/ECG.cu src/community/triangles_counting.cu src/community/extract_subgraph_by_vertex.cu src/cores/core_number.cu src/traversal/two_hop_neighbors.cu - src/utilities/cusparse_helper.cu src/components/connectivity.cu src/centrality/katz_centrality.cu src/centrality/betweenness_centrality.cu - src/nvgraph/kmeans.cu - src/nvgraph/lanczos.cu - src/nvgraph/spectral_matrix.cu - src/nvgraph/modularity_maximization.cu - src/nvgraph/nvgraph_cusparse.cpp - src/nvgraph/nvgraph_cublas.cpp - src/nvgraph/nvgraph_lapack.cu - src/nvgraph/nvgraph_vector_kernels.cu - src/nvgraph/partition.cu ) # @@ -346,20 +365,17 @@ add_library(cugraph SHARED add_dependencies(cugraph cugunrock) add_dependencies(cugraph raft) -if (BUILD_MPI) - add_compile_definitions(ENABLE_OPG=1) -endif (BUILD_MPI) - ################################################################################################### # - include paths --------------------------------------------------------------------------------- target_include_directories(cugraph PRIVATE + "${CUB_INCLUDE_DIR}" + "${THRUST_INCLUDE_DIR}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" "${LIBCYPHERPARSER_INCLUDE}" "${Boost_INCLUDE_DIRS}" "${RMM_INCLUDE}" "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" - "${CUB_INCLUDE_DIR}" "${CUHORNET_INCLUDE_DIR}/hornet/include" "${CUHORNET_INCLUDE_DIR}/hornetsnest/include" "${CUHORNET_INCLUDE_DIR}/xlib/include" @@ -367,7 +383,6 @@ target_include_directories(cugraph "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CUGUNROCK_DIR}/include" "${NCCL_INCLUDE_DIRS}" - "${MPI_CXX_INCLUDE_PATH}" "${RAFT_DIR}/cpp/include" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" @@ -377,7 +392,7 @@ target_include_directories(cugraph # - link libraries -------------------------------------------------------------------------------- target_link_libraries(cugraph PRIVATE - ${RMM_LIBRARY} gunrock ${NVSTRINGS_LIBRARY} cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARIES}) + ${RMM_LIBRARY} gunrock cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARIES}) if(OpenMP_CXX_FOUND) target_link_libraries(cugraph PRIVATE diff --git a/cpp/cmake/Modules/FindNCCL.cmake b/cpp/cmake/Modules/FindNCCL.cmake index 16ca4458a7f..0f673707444 100644 --- a/cpp/cmake/Modules/FindNCCL.cmake +++ b/cpp/cmake/Modules/FindNCCL.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2020, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
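The conda recipe changes above replace the commit-count build number with conda-build's GIT_DESCRIBE values, so the package build string now embeds the git hash. Roughly, the string is assembled like this (a sketch; the literals are illustrative, conda-build supplies the real values):

```python
# Sketch of the conda package build string after the meta.yaml changes above.
py_version = "37"                 # from CONDA_PY
git_describe_hash = "gabc1234"    # from GIT_DESCRIBE_HASH (hypothetical)
git_describe_number = "42"        # commits since the last tag

build_string = "py{}_{}_{}".format(py_version, git_describe_hash,
                                   git_describe_number)
print(build_string)  # py37_gabc1234_42
```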
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index ece827475ee..5241043fe88 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -17,6 +17,7 @@
 
 #include
 #include
+#include
 
 namespace cugraph {
 
@@ -28,6 +29,7 @@ namespace cugraph {
 * when the tolerance decreases and/or alpha increases toward the limiting value of 1.
 * The user is free to use default values or to provide inputs for the initial guess,
 * tolerance and maximum number of iterations.
+ *
 * @throws cugraph::logic_error with a custom message when an error occurs.
@@ -38,7 +40,9 @@ namespace cugraph {
 32-bit)
 * @tparam WT Type of edge weights. Supported value : float or double.
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ the multi GPU version will be selected.
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 information as a transposed adjacency list (CSC). Edge weights are not used for this algorithm.
 * @param[in] alpha The damping factor alpha represents the probability to follow an outgoing edge, standard value is 0.85. Thus, 1.0-alpha is the probability to “teleport” to a
@@ -48,36 +52,38 @@ namespace cugraph {
 * @param[in] pagerank Array of size V. Should contain the initial guess if has_guess=true. In this case the initial guess cannot be the vector of 0s. Memory is provided and owned by the caller.
- * @param[in] personalization_subset_size (optional) The number of vertices for to personalize.
- Initialized to 0 by default.
- * @param[in] personalization_subset (optional) Array of size personalization_subset_size containing
- vertices for running personalized pagerank. Initialized to nullptr by default. Memory is provided
- and owned by the caller.
- * @param[in] personalization_values (optional) Array of size personalization_subset_size containing
- values associated with personalization_subset vertices. Initialized to nullptr by default. Memory
- is provided and owned by the caller.
- * @param[in] tolerance Set the tolerance the approximation, this parameter should be a
- small magnitude value.
+ * @param[in] personalization_subset_size (optional) Supported on single-GPU, on the roadmap for
+ Multi-GPU. The number of vertices to personalize. Initialized to 0 by default.
+ * @param[in] personalization_subset (optional) Supported on single-GPU, on the roadmap for
+ Multi-GPU. Array of size personalization_subset_size containing vertices for running personalized
+ pagerank. Initialized to nullptr by default. Memory is provided and owned by the caller.
+ * @param[in] personalization_values (optional) Supported on single-GPU, on the roadmap for
+ Multi-GPU. Array of size personalization_subset_size containing values associated with
+ personalization_subset vertices. Initialized to nullptr by default. Memory is provided and owned by
+ the caller.
+ * @param[in] tolerance Supported on single-GPU. Set the tolerance of the approximation,
+ this parameter should be a small magnitude value.
 * The lower the tolerance the better the approximation. If this
- value is 0.0f, cuGRAPH will use the default value which is 1.0E-5.
+ value is 0.0f, cuGraph will use the default value which is 1.0E-5.
 * Setting too small a tolerance can lead to non-convergence due to numerical roundoff. Usually values between 0.01 and 0.00001 are acceptable.
 * @param[in] max_iter (optional) The maximum number of iterations before an answer is returned.
 This can be used to limit the execution time and do an early exit before the solver reaches the convergence tolerance.
- * If this value is lower or equal to 0 cuGRAPH will use the
+ * If this value is lower than or equal to 0, cuGraph will use the
 default value, which is 500.
- * @param[in] has_guess (optional) This parameter is used to notify cuGRAPH if it
- should use a user-provided initial guess. False means the user does not have a guess, in this case
- cuGRAPH will use a uniform vector set to 1/V.
- * If the value is True, cuGRAPH will read the pagerank parameter
+ * @param[in] has_guess (optional) Supported on single-GPU. This parameter is used to
+ notify cuGraph if it should use a user-provided initial guess. False means the user does not have a
+ guess; in this case cuGraph will use a uniform vector set to 1/V.
+ * If the value is True, cuGraph will read the pagerank parameter
 and use this as an initial guess.
 * @param[out] *pagerank The PageRank : pagerank[i] is the PageRank of vertex i. Memory remains provided and owned by the caller.
 *
 */
 template <typename VT, typename ET, typename WT>
-void pagerank(experimental::GraphCSCView<VT, ET, WT> const &graph,
+void pagerank(raft::handle_t const &handle,
+              GraphCSCView<VT, ET, WT> const &graph,
               WT *pagerank,
               VT personalization_subset_size = 0,
               VT *personalization_subset = nullptr,
@@ -106,7 +112,7 @@ void pagerank(experimental::GraphCSCView<VT, ET, WT> const &graph,
 * caller
 */
 template <typename VT, typename ET, typename WT>
-void jaccard(experimental::GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result);
+void jaccard(GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result);
 
 /**
 * @brief Compute jaccard similarity coefficient for selected vertex pairs
 *
@@ -130,7 +136,7 @@ void jaccard(experimental::GraphCSRView<VT, ET, WT> const &graph, WT const *weig
 * caller
 */
 template <typename VT, typename ET, typename WT>
-void jaccard_list(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void jaccard_list(GraphCSRView<VT, ET, WT> const &graph,
                   WT const *weights,
                   ET num_pairs,
                   VT const *first,
@@ -156,7 +162,7 @@ void jaccard_list(experimental::GraphCSRView<VT, ET, WT> const &graph,
 * caller
 */
 template <typename VT, typename ET, typename WT>
-void overlap(experimental::GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result);
+void overlap(GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result);
 
 /**
 * @brief Compute overlap coefficient for select pairs of vertices
 *
@@ -180,7 +186,7 @@ void overlap(experimental::GraphCSRView<VT, ET, WT> const &graph, WT const *weig
 * caller
 */
 template <typename VT, typename ET, typename WT>
-void overlap_list(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void overlap_list(GraphCSRView<VT, ET, WT> const &graph,
                   WT const *weights,
                   ET num_pairs,
                   VT const *first,
@@ -203,7 +209,7 @@ void overlap_list(experimental::GraphCSRView<VT, ET, WT> const &graph,
 * @tparam WT Type of edge weights. Supported values : float or
 * double.
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the
+ * @param[in] graph cuGraph graph descriptor, should contain the
 * connectivity information as a COO. Graph is considered undirected. Edge weights are used for this
 * algorithm and set to 1 by default.
 * @param[out] pos Device array (2, n) containing x-axis and y-axis
@@ -241,7 +247,7 @@ void overlap_list(experimental::GraphCSRView<VT, ET, WT> const &graph,
 *
 */
 template <typename VT, typename ET, typename WT>
-void force_atlas2(experimental::GraphCOOView<VT, ET, WT> &graph,
+void force_atlas2(GraphCOOView<VT, ET, WT> &graph,
                   float *pos,
                   const int max_iter = 500,
                   float *x_start = nullptr,
@@ -267,39 +273,87 @@ void force_atlas2(experimental::GraphCOOView<VT, ET, WT> &graph,
 *
 * The current implementation does not support a weighted graph.
 *
- * @throws cugraph::logic_error with a custom message when an error
- * occurs.
+ * @throws cugraph::logic_error if `result == nullptr` or
+ * `number_of_sources < 0` or `number_of_sources !=0 and sources == nullptr`.
+ * @tparam vertex_t Type of vertex identifiers. Supported value : int
+ * (signed, 32-bit)
+ * @tparam edge_t Type of edge identifiers. Supported value : int
+ * (signed, 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or
+ * double.
+ * @tparam result_t Type of computed result. Supported values : float or
+ * double
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the
+ * handle, the multi GPU version will be selected.
+ * @param[in] graph cuGRAPH graph descriptor, should contain the
+ * connectivity information as a CSR
+ * @param[out] result Device array of centrality scores
+ * @param[in] normalized If true, return normalized scores, if false return
+ * unnormalized scores.
+ * @param[in] endpoints If true, include endpoints of paths in score, if false
+ * do not
+ * @param[in] weight If specified, device array of weights for each edge
+ * @param[in] k If specified, number of vertex samples defined in the
+ * vertices array.
+ * @param[in] vertices If specified, host array of vertex ids to estimate
+ * betweenness; these vertices will serve as sources for the traversal
+ * algorithm to obtain shortest path counters.
+ * @param[in] total_number_of_source_used If specified, use this number to normalize results
+ * when using subsampling; it allows accumulation of results across multiple calls.
 *
- * @tparam VT Type of vertex identifiers. Supported value : int (signed,
- * 32-bit)
- * @tparam ET Type of edge identifiers. Supported value : int (signed,
- * 32-bit)
- * @tparam WT Type of edge weights. Supported values : float or double.
- * @tparam result_t Type of computed result. Supported values : float or double
- * (double only supported in default implementation)
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+void betweenness_centrality(const raft::handle_t &handle,
+                            GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+                            result_t *result,
+                            bool normalized = true,
+                            bool endpoints = false,
+                            weight_t const *weight = nullptr,
+                            vertex_t k = 0,
+                            vertex_t const *vertices = nullptr);
+
+/**
+ * @brief Compute edge betweenness centrality for a graph
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
- * information as a CSR
- * @param[out] result Device array of centrality scores
- * @param[in] normalized If true, return normalized scores, if false return unnormalized
- * scores.
- * @param[in] endpoints If true, include endpoints of paths in score, if false do not
- * @param[in] weight If specified, device array of weights for each edge
- * @param[in] k If specified, number of vertex samples defined in the vertices
- * array.
- * @param[in] vertices If specified, host array of vertex ids to estimate betweenness
- * centrality, these vertices will serve as sources for the traversal algorihtm to obtain
- * shortest path counters.
+ * Betweenness centrality of an edge is the sum of the fraction of all-pairs shortest paths that
+ * pass through this edge. The weight parameter is currently not supported
+ *
+ * @throws cugraph::logic_error if `result == nullptr` or
+ * `number_of_sources < 0` or `number_of_sources !=0 and sources == nullptr` or `endpoints ==
+ * true`.
+ * @tparam vertex_t Type of vertex identifiers. Supported value : int
+ * (signed, 32-bit)
+ * @tparam edge_t Type of edge identifiers. Supported value : int
+ * (signed, 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or
+ * double.
+ * @tparam result_t Type of computed result. Supported values : float or
+ * double
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the
+ * handle, the multi GPU version will be selected.
+ * @param[in] graph cuGraph graph descriptor, should contain the
+ * connectivity information as a CSR
+ * @param[out] result Device array of centrality scores
+ * @param[in] normalized If true, return normalized scores, if false return
+ * unnormalized scores.
+ * @param[in] weight If specified, device array of weights for each edge
+ * @param[in] k If specified, number of vertex samples defined in the
+ * vertices array.
+ * @param[in] vertices If specified, host array of vertex ids to estimate
+ * betweenness; these vertices will serve as sources for the traversal
+ * algorithm to obtain shortest path counters.
+ * @param[in] total_number_of_source_used If specified, use this number to normalize results
+ * when using subsampling; it allows accumulation of results across multiple calls.
 *
 */
-template <typename VT, typename ET, typename WT, typename result_t>
-void betweenness_centrality(experimental::GraphCSRView<VT, ET, WT> const &graph,
-                            result_t *result,
-                            bool normalized = true,
-                            bool endpoints = false,
-                            WT const *weight = nullptr,
-                            VT k = 0,
-                            VT const *vertices = nullptr);
+template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+void edge_betweenness_centrality(const raft::handle_t &handle,
+                                 GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+                                 result_t *result,
+                                 bool normalized = true,
+                                 weight_t const *weight = nullptr,
+                                 vertex_t k = 0,
+                                 vertex_t const *vertices = nullptr);
 
 enum class cugraph_cc_t {
   CUGRAPH_WEAK = 0,  ///> Weakly Connected Components
@@ -330,14 +384,14 @@ enum class cugraph_cc_t {
 * @tparam ET Type of edge identifiers. Supported value : int (signed, 32-bit)
 * @tparam WT Type of edge weights. Supported values : float or double.
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 * information as a CSR
 * @param[in] connectivity_type STRONG or WEAK
 * @param[out] labels Device array of component labels (labels[i] indicates the label
 * associated with vertex id i.
 */
 template <typename VT, typename ET, typename WT>
-void connected_components(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void connected_components(GraphCSRView<VT, ET, WT> const &graph,
                           cugraph_cc_t connectivity_type,
                           VT *labels);
@@ -358,7 +412,7 @@ void connected_components(experimental::GraphCSRView<VT, ET, WT> const &graph,
 * 32-bit)
 * @tparam WT Type of edge weights. Supported values : float or double.
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 * information as a COO
 * @param[in] k The order of the truss
 * @param[in] mr Memory resource used to allocate the returned graph
 *
 */
 template <typename VT, typename ET, typename WT>
-std::unique_ptr<experimental::GraphCOO<VT, ET, WT>> k_truss_subgraph(
-  experimental::GraphCOOView<VT, ET, WT> const &graph,
+std::unique_ptr<GraphCOO<VT, ET, WT>> k_truss_subgraph(
+  GraphCOOView<VT, ET, WT> const &graph,
   int k,
   rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource());
@@ -384,7 +438,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> k_truss_subgraph(
 * @tparam WT Type of edge weights. Supported values : float or double.
 * @tparam result_t Type of computed result. Supported values : float
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 * information as a CSR
 * @param[out] result Device array of centrality scores
 * @param[in] alpha Attenuation factor with a default value of 0.1. Alpha is set to
@@ -404,7 +458,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> k_truss_subgraph(
 * @param[in] normalized If True normalize the resulting katz centrality values
 */
 template <typename VT, typename ET, typename WT, typename result_t>
-void katz_centrality(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void katz_centrality(GraphCSRView<VT, ET, WT> const &graph,
                      result_t *result,
                      double alpha,
                      int max_iter,
@@ -415,14 +469,14 @@
 /**
 * @brief Compute the Core Number for the nodes of the graph G
 *
- * @param[in] graph cuGRAPH graph descriptor with a valid edgeList or adjList
+ * @param[in] graph cuGraph graph descriptor with a valid edgeList or adjList
 * @param[out] core_number Populated by the core number of every vertex in the graph
 *
 * @throws cugraph::logic_error when an error occurs.
 */
/* ----------------------------------------------------------------------------*/
 template <typename VT, typename ET, typename WT>
-void core_number(experimental::GraphCSRView<VT, ET, WT> const &graph, VT *core_number);
+void core_number(GraphCSRView<VT, ET, WT> const &graph, VT *core_number);
 
 /**
 * @brief Compute K Core of the graph G
 *
@@ -435,7 +489,7 @@ void core_number(experimental::GraphCSRView<VT, ET, WT> const &graph, VT *core_n
 * 32-bit)
 * @tparam WT Type of edge weights. Supported values : float or double.
 *
- * @param[in] graph cuGRAPH graph in coordinate format
+ * @param[in] graph cuGraph graph in coordinate format
 * @param[in] k Order of the core. This value must not be negative.
 * @param[in] vertex_id User specified vertex identifiers for which core number values
 * are supplied
 * @param[in] core_number User supplied core number values corresponding to vertex_id
 * @param[in] num_vertex_ids Number of elements in vertex_id/core_number arrays
 * @param[in] mr Memory resource used to allocate the returned graph
 *
 * @param[out] out_graph Unique pointer to K Core subgraph in COO format
 */
 template <typename VT, typename ET, typename WT>
-std::unique_ptr<GraphCOO<VT, ET, WT>> k_core(
-  experimental::GraphCOOView<VT, ET, WT> const &graph,
+std::unique_ptr<GraphCOO<VT, ET, WT>> k_core(
+  GraphCOOView<VT, ET, WT> const &graph,
   int k,
   VT const *vertex_id,
   VT const *core_number,
@@ -472,8 +526,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> k_core(
 * @return Graph in COO format
 */
 template <typename VT, typename ET, typename WT>
-std::unique_ptr<GraphCOO<VT, ET, WT>> get_two_hop_neighbors(
-  experimental::GraphCSRView<VT, ET, WT> const &graph);
+std::unique_ptr<GraphCOO<VT, ET, WT>> get_two_hop_neighbors(GraphCSRView<VT, ET, WT> const &graph);
 
 /**
 * @Synopsis Performs a single source shortest path traversal of a graph starting from a vertex.
 *
@@ -486,7 +539,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> get_two_hop_neighbors(
 * 32-bit)
 * @tparam WT Type of edge weights. Supported values : float or double.
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 * information as a CSR
 *
 * @param[out] distances If set to a valid pointer, array of size V populated by distance
@@ -500,7 +553,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> get_two_hop_neighbors(
 *
 */
 template <typename VT, typename ET, typename WT>
-void sssp(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void sssp(GraphCSRView<VT, ET, WT> const &graph,
           WT *distances,
           VT *predecessors,
           const VT source_vertex);
@@ -519,7 +572,9 @@ void sssp(experimental::GraphCSRView<VT, ET, WT> const &graph,
 * 32-bit)
 * @tparam WT Type of edge weights. Supported values : int (signed, 32-bit)
 *
- * @param[in] graph cuGRAPH graph descriptor, should contain the connectivity
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ the multi GPU version will be selected.
+ * @param[in] graph cuGraph graph descriptor, should contain the connectivity
 * information as a CSR
 *
 * @param[out] distances If set to a valid pointer, this is populated by distance of
@@ -535,41 +590,96 @@ void sssp(GraphCSRView<VT, ET, WT> const &graph,
 *
 * @param[in] directed Treat the input graph as directed
 *
- * @throws cugraph::logic_error when an error occurs.
+ * @param[in] mg_batch If set to true, use the SG BFS path when comms are initialized.
+ *
 */
 template <typename VT, typename ET, typename WT>
-void bfs(experimental::GraphCSRView<VT, ET, WT> const &graph,
+void bfs(raft::handle_t const &handle,
+         GraphCSRView<VT, ET, WT> const &graph,
          VT *distances,
         VT *predecessors,
         double *sp_counters,
         const VT start_vertex,
-        bool directed = true);
+        bool directed = true,
+        bool mg_batch = false);
 
 /**
 * @brief Louvain implementation
 *
- * Compute a clustering of the graph by minimizing modularity
+ * Compute a clustering of the graph by maximizing modularity
+ *
+ * Computed using the Louvain method described in:
+ *
+ * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
+ * community hierarchies in large networks, J Stat Mech P10008 (2008),
+ * http://arxiv.org/abs/0803.0476
 *
 * @throws cugraph::logic_error when an error occurs.
 *
- * @tparam VT Type of vertex identifiers.
+ * @tparam vertex_t Type of vertex identifiers.
 * Supported value : int (signed, 32-bit)
- * @tparam ET Type of edge identifiers.
+ * @tparam edge_t Type of edge identifiers.
 * Supported value : int (signed, 32-bit)
- * @tparam WT Type of edge weights. Supported values : float or double.
+ * @tparam weight_t Type of edge weights. Supported values : float or double.
 *
 * @param[in] graph input graph object (CSR)
 * @param[out] final_modularity modularity of the returned clustering
 * @param[out] num_level number of levels of the returned clustering
 * @param[out] clustering Pointer to device array where the clustering should be stored
 * @param[in] max_iter (optional) maximum number of iterations to run (default 100)
+ * @param[in] resolution (optional) The value of the resolution parameter to use.
+ * Called gamma in the modularity formula, this changes the size
+ * of the communities. Higher resolutions lead to more, smaller
+ * communities; lower resolutions lead to fewer, larger
+ * communities. (default 1)
+ *
 */
-template <typename VT, typename ET, typename WT>
-void louvain(experimental::GraphCSRView<VT, ET, WT> const &graph,
-             WT *final_modularity,
+template <typename vertex_t, typename edge_t, typename weight_t>
+void louvain(GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+             weight_t *final_modularity,
              int *num_level,
-             VT *louvain_parts,
-             int max_iter = 100);
+             vertex_t *louvain_parts,
+             int max_iter = 100,
+             weight_t resolution = weight_t{1});
+
+/**
+ * @brief Leiden implementation
+ *
+ * Compute a clustering of the graph by maximizing modularity using the Leiden improvements
+ * to the Louvain method.
+ *
+ * Computed using the Leiden method described in:
+ *
+ * Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to Leiden:
+ * guaranteeing well-connected communities. Scientific reports, 9(1), 5233.
+ * doi: 10.1038/s41598-019-41695-z
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam vertex_t Type of vertex identifiers.
+ * Supported value : int (signed, 32-bit)
+ * @tparam edge_t Type of edge identifiers.
+ * Supported value : int (signed, 32-bit)
+ * @tparam weight_t Type of edge weights. Supported values : float or double.
+ * + * @param[in] graph input graph object (CSR) + * @param[out] final_modularity modularity of the returned clustering + * @param[out] num_level number of levels of the returned clustering + * @param[out] clustering Pointer to device array where the clustering should be stored + * @param[in] max_iter (optional) maximum number of iterations to run (default 100) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more, smaller + * communities; lower resolutions lead to fewer, larger + * communities. (default 1) + */ +template +void leiden(GraphCSRView const &graph, + weight_t &final_modularity, + int &num_level, + vertex_t *leiden_parts, + int max_iter = 100, + weight_t resolution = weight_t{1}); /** * @brief Computes the ecg clustering of the given graph. @@ -596,12 +706,9 @@ void louvain(experimental::GraphCSRView const &graph, * written */ template -void ecg(experimental::GraphCSRView const &graph_csr, - WT min_weight, - VT ensemble_size, - VT *ecg_parts); +void ecg(GraphCSRView const &graph_csr, WT min_weight, VT ensemble_size, VT *ecg_parts); -namespace nvgraph { +namespace triangle { /** * @brief Count the number of triangles in the graph @@ -619,8 +726,10 @@ namespace nvgraph { * @return The number of triangles */ template -uint64_t triangle_count(experimental::GraphCSRView const &graph); +uint64_t triangle_count(GraphCSRView const &graph); +} // namespace triangle +namespace subgraph { /** * @brief Extract subgraph by vertices * * @@ -642,8 +751,9 @@ uint64_t triangle_count(experimental::GraphCSRView const &graph); * @param[out] result a graph in COO format containing the edges in the subgraph */ template -std::unique_ptr> extract_subgraph_vertex( - experimental::GraphCOOView const &graph, VT const *vertices, VT num_vertices); +std::unique_ptr> extract_subgraph_vertex(GraphCOOView const &graph, + VT const *vertices, + VT num_vertices); /** * @brief Wrapper function for Nvgraph balanced cut clustering @@ -663,11 +773,14 @@ std::unique_ptr> extract_subgraph_vertex( * @param[in] evs_max_iter The maximum number of iterations of the eigenvalue solver * @param[in] kmean_tolerance The tolerance to use for the kmeans solver * @param[in] kmean_max_iter The maximum number of iteration of the k-means solver - * @param[out] clustering Pointer to device memory where the resulting clustering will be - * stored + * @param[out] clustering Pointer to device memory where the resulting clustering will + * be stored */ +} // namespace subgraph + +namespace ext_raft { template -void balancedCutClustering(experimental::GraphCSRView const &graph, +void balancedCutClustering(GraphCSRView const &graph, VT num_clusters, VT num_eigen_vects, WT evs_tolerance, @@ -694,11 +807,11 @@ void balancedCutClustering(experimental::GraphCSRView const &graph, * @param[in] evs_max_iter The maximum number of iterations of the eigenvalue solver * @param[in] kmean_tolerance The tolerance to use for the kmeans solver * @param[in] kmean_max_iter The maximum number of iteration of the k-means solver - * @param[out] clustering Pointer to device memory where the resulting clustering will be - * stored + * @param[out] clustering Pointer to device memory where the resulting clustering will + * be stored */ template -void spectralModularityMaximization(experimental::GraphCSRView const &graph, +void spectralModularityMaximization(GraphCSRView const &graph, VT n_clusters, VT n_eig_vects, WT 
evs_tolerance, @@ -724,7 +837,7 @@ void spectralModularityMaximization(experimental::GraphCSRView const * @param[out] score Pointer to a float in which the result will be written */ template -void analyzeClustering_modularity(experimental::GraphCSRView const &graph, +void analyzeClustering_modularity(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score); @@ -746,7 +859,7 @@ void analyzeClustering_modularity(experimental::GraphCSRView const & * @param[out] score Pointer to a float in which the result will be written */ template -void analyzeClustering_edge_cut(experimental::GraphCSRView const &graph, +void analyzeClustering_edge_cut(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score); @@ -768,10 +881,50 @@ void analyzeClustering_edge_cut(experimental::GraphCSRView const &gr * @param[out] score Pointer to a float in which the result will be written */ template -void analyzeClustering_ratio_cut(experimental::GraphCSRView const &graph, +void analyzeClustering_ratio_cut(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score); -} // namespace nvgraph +} // namespace ext_raft + +namespace gunrock { +/** + * @brief Compute the HITS vertex values for a graph + * + * cuGraph uses the gunrock implementation of HITS + * + * @throws cugraph::logic_error on an error + * + * @tparam VT Type of vertex identifiers. + * Supported value : int (signed, 32-bit) + * @tparam ET Type of edge identifiers. + * Supported value : int (signed, 32-bit) + * @tparam WT Type of edge weights. + * Supported value : float + * + * @param[in] graph input graph object (CSR). Edge weights are not used + * for this algorithm. + * @param[in] max_iter Maximum number of iterations to run + * @param[in] tolerance Currently ignored. gunrock implementation runs + * the specified number of iterations and stops + * @param[in] starting_value Currently ignored. gunrock does not support it. + * @param[in] normalized Currently ignored, gunrock computes this as true + * @param[out] *hubs Device memory pointing to the node value based + * on outgoing links + * @param[out] *authorities Device memory pointing to the node value based + * on incoming links + * + */ +template +void hits(GraphCSRView const &graph, + int max_iter, + WT tolerance, + WT const *starting_value, + bool normalized, + WT *hubs, + WT *authorities); + +} // namespace gunrock + } // namespace cugraph diff --git a/cpp/include/comms_mpi.hpp b/cpp/include/comms_mpi.hpp deleted file mode 100644 index 7a17bdfea4c..00000000000 --- a/cpp/include/comms_mpi.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
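// Illustrative call sketch for the gunrock-backed HITS wrapper introduced above
// (not part of the patch). Per the parameter notes, tolerance, starting_value,
// and normalized are currently ignored, so only max_iter controls the run;
// `csr_view`, `hubs`, and `authorities` are hypothetical device-side names.
//
//   cugraph::gunrock::hits(csr_view, /*max_iter=*/50, /*tolerance=*/0.0f,
//                          /*starting_value=*/nullptr, /*normalized=*/true,
//                          hubs, authorities);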
- */ - -#pragma once -#if ENABLE_OPG -#include -#include -#endif -#include -namespace cugraph { -namespace experimental { - -enum class ReduceOp { SUM, MAX, MIN }; - -// basic info about the snmg env setup -class Comm { - private: - int _p{0}; - int _rank{0}; - bool _finalize_mpi{false}; - bool _finalize_nccl{false}; - - int _device_id{0}; - int _device_count{0}; - - int _sm_count_per_device{0}; - int _max_grid_dim_1D{0}; - int _max_block_dim_1D{0}; - int _l2_cache_size{0}; - int _shared_memory_size_per_sm{0}; - -#if ENABLE_OPG - MPI_Comm _mpi_comm{}; - ncclComm_t _nccl_comm{}; -#endif - - public: - Comm(){}; - Comm(int p); -#if ENABLE_OPG - Comm(ncclComm_t comm, int size, int rank); -#endif - ~Comm(); - int get_rank() const { return _rank; } - int get_p() const { return _p; } - int get_dev() const { return _device_id; } - int get_dev_count() const { return _device_count; } - int get_sm_count() const { return _sm_count_per_device; } - bool is_master() const { return (_rank == 0) ? true : false; } - - void barrier(); - - template - void allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const; - - template - void allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const; -}; - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/include/functions.hpp b/cpp/include/functions.hpp index db737a4f5a4..1e88acb54b7 100644 --- a/cpp/include/functions.hpp +++ b/cpp/include/functions.hpp @@ -15,70 +15,13 @@ */ #pragma once +#include #include #include namespace cugraph { -/** - * @brief Convert COO to CSR, unweighted - * - * Takes a list of edges in COOrdinate format and generates a CSR format. - * Note, if you want CSC format simply pass the src and dst arrays - * in the opposite order. - * - * @throws cugraph::logic_error when an error occurs. - * - * @tparam vertex_t type of vertex index - * @tparam edge_t type of edge index - * - * @param[in] num_edges Number of edges - * @param[in] src Device array containing original source vertices - * @param[in] dst Device array containing original dest vertices - * @param[out] offsets Device array containing the CSR offsets - * @param[out] indices Device array containing the CSR indices - * - * @return Number of unique vertices in the src and dst arrays - * - */ -template -vertex_t coo2csr( - edge_t num_edges, vertex_t const *src, vertex_t const *dst, edge_t **offsets, vertex_t **indices); - -/** - * @brief Convert COO to CSR, weighted - * - * Takes a list of edges in COOrdinate format and generates a CSR format. - * Note, if you want CSC format simply pass the src and dst arrays - * in the opposite order. - * - * @throws cugraph::logic_error when an error occurs. 
- * - * @tparam vertex_t type of vertex index - * @tparam edge_t type of edge index - * @tparam weight_t type of the edge weight - * - * @param[in] num_edges Number of edges - * @param[in] src Device array containing original source vertices - * @param[in] dst Device array containing original dest vertices - * @param[in] weights Device array containing original edge weights - * @param[out] offsets Device array containing the CSR offsets - * @param[out] indices Device array containing the CSR indices - * @param[out] csr_weights Device array containing the CSR edge weights - * - * @return Number of unique vertices in the src and dst arrays - * - */ -template -vertex_t coo2csr_weighted(edge_t num_edges, - vertex_t const *src, - vertex_t const *dst, - weight_t const *weights, - edge_t **offsets, - vertex_t **indices, - weight_t **csr_weights); - /** * @brief Convert COO to CSR * * @@ -90,15 +33,15 @@ vertex_t coo2csr_weighted(edge_t num_edges, * @tparam ET type of edge index * @tparam WT type of the edge weight * - * @param[in] graph cuGRAPH graph in coordinate format + * @param[in] graph cuGraph graph in coordinate format * @param[in] mr Memory resource used to allocate the returned graph * * @return Unique pointer to generate Compressed Sparse Row graph * */ template -std::unique_ptr> coo_to_csr( - experimental::GraphCOOView const &graph, +std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()); /** @@ -135,4 +78,24 @@ std::unique_ptr renumber_vertices( ET *map_size, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()); +/** + * @brief Broadcast using the handle communicator + * + * Use the handle's communicator to perform the broadcast. + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam value_t Type of the data to broadcast + * + * @param[out] value Pointer to the data + * @param[in] count Number of elements to broadcast + * + */ + +// FIXME: It would be better to expose it in RAFT +template +void comms_bcast(const raft::handle_t &handle, value_t *value, size_t count) +{ + handle.get_comms().bcast(value, count, 0, handle.get_stream()); +} } // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index d7b1a2838ac..9d42b4acdd7 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -14,16 +14,15 @@ * limitations under the License. */ #pragma once -#include +#include +#include +#include #include #include +#include #include -#include -#include - namespace cugraph { -namespace experimental { enum class PropType { PROP_UNDEF, PROP_FALSE, PROP_TRUE }; @@ -47,107 +46,133 @@ enum class DegreeDirection { /** * @brief Base class graphs, all but vertices and edges * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template +template class GraphViewBase { public: - WT *edge_data; ///< edge weight - Comm comm; + raft::handle_t *handle; + weight_t *edge_data; ///< edge weight GraphProperties prop; - VT number_of_vertices; - ET number_of_edges; + vertex_t number_of_vertices; + edge_t number_of_edges; + + vertex_t *local_vertices; + edge_t *local_edges; + vertex_t *local_offsets; /** * @brief Fill the identifiers array with the vertex identifiers. 
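// A minimal sketch of the comms_bcast helper added to functions.hpp above
// (illustrative, not part of the patch): rank 0's copy of a device buffer is
// propagated to all ranks, since the helper hardcodes root 0. Assumes the
// handle's communicator has been initialized and `d_value` is a hypothetical
// device pointer valid on every rank.
//
//   cugraph::comms_bcast(handle, d_value, /*count=*/1);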
* - * @param[out] identifier Pointer to device memory to store the vertex + * @param[out] identifiers Pointer to device memory to store the vertex * identifiers */ - void get_vertex_identifiers(VT *identifiers) const; - void set_communicator(Comm &comm_) { comm = comm_; } + void get_vertex_identifiers(vertex_t *identifiers) const; + + void set_local_data(vertex_t *vertices, edge_t *edges, vertex_t *offsets) + { + local_vertices = vertices; + local_edges = edges; + local_offsets = offsets; + } - GraphViewBase(WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : edge_data(edge_data_), - comm(), + void set_handle(raft::handle_t *handle_in) { handle = handle_in; } + + GraphViewBase(weight_t *edge_data, vertex_t number_of_vertices, edge_t number_of_edges) + : handle(nullptr), + edge_data(edge_data), prop(), - number_of_vertices(number_of_vertices_), - number_of_edges(number_of_edges_) + number_of_vertices(number_of_vertices), + number_of_edges(number_of_edges), + local_vertices(nullptr), + local_edges(nullptr), + local_offsets(nullptr) { } + bool has_data(void) const { return edge_data != nullptr; } }; /** * @brief A graph stored in COO (COOrdinate) format. * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCOOView : public GraphViewBase { +template +class GraphCOOView : public GraphViewBase { public: - VT *src_indices{nullptr}; ///< rowInd - VT *dst_indices{nullptr}; ///< colInd + vertex_t *src_indices{nullptr}; ///< rowInd + vertex_t *dst_indices{nullptr}; ///< colInd /** * @brief Computes degree(in, out, in+out) of all the nodes of a Graph * * @throws cugraph::logic_error when an error occurs. * - * @param[out] degree Device array of size V (V is number of vertices) initialized + * @param[out] degree Device array of size V (V is number of + * vertices) initialized * to zeros. Will contain the computed degree of every vertex. * @param[in] direction IN_PLUS_OUT, IN or OUT */ - void degree(ET *degree, DegreeDirection direction) const; + void degree(edge_t *degree, DegreeDirection direction) const; /** * @brief Default constructor */ - GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} + GraphCOOView() : GraphViewBase(nullptr, 0, 0) {} /** * @brief Wrap existing arrays representing an edge list in a Graph. * - * GraphCOOView does not own the memory used to represent this graph. This + * GraphCOOView does not own the memory used to represent this + * graph. This * function does not allocate memory. * - * @param source_indices This array of size E (number of edges) contains the index of the + * @param source_indices This array of size E (number of edges) + * contains the index of the * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the + * @param destination_indices This array of size E (number of edges) + * contains the index of the * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. + * @param edge_data This array size E (number of edges) contains + * the weight for each + * edge. This array can be null in which case the graph is considered + * unweighted. 
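// Illustrative sketch (not part of the patch): wrapping caller-owned device
// arrays in the non-owning view described above, then converting to an owning
// CSR graph with the coo_to_csr factory from functions.hpp. `src`, `dst`,
// `weights`, `num_verts`, and `num_edges` are hypothetical names.
//
//   cugraph::GraphCOOView<int, int, float> coo(src, dst, weights, num_verts, num_edges);
//   auto csr = cugraph::coo_to_csr(coo);  // std::unique_ptr<GraphCSR<int, int, float>>
//   auto csr_view = csr->view();          // non-owning view for algorithm calls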
* @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCOOView( - VT *src_indices_, VT *dst_indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), - src_indices(src_indices_), - dst_indices(dst_indices_) + GraphCOOView(vertex_t *src_indices, + vertex_t *dst_indices, + weight_t *edge_data, + vertex_t number_of_vertices, + edge_t number_of_edges) + : GraphViewBase(edge_data, number_of_vertices, number_of_edges), + src_indices(src_indices), + dst_indices(dst_indices) { } }; /** - * @brief Base class for graph stored in CSR (Compressed Sparse Row) format or CSC (Compressed + * @brief Base class for graph stored in CSR (Compressed Sparse Row) + * format or CSC (Compressed * Sparse Column) format * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCompressedSparseBaseView : public GraphViewBase { +template +class GraphCompressedSparseBaseView : public GraphViewBase { public: - ET *offsets{nullptr}; ///< CSR offsets - VT *indices{nullptr}; ///< CSR indices + edge_t *offsets{nullptr}; ///< CSR offsets + vertex_t *indices{nullptr}; ///< CSR indices /** * @brief Fill the identifiers in the array with the source vertex @@ -156,42 +181,53 @@ class GraphCompressedSparseBaseView : public GraphViewBase { * @param[out] src_indices Pointer to device memory to store the * source vertex identifiers */ - void get_source_indices(VT *src_indices) const; + void get_source_indices(vertex_t *src_indices) const; /** * @brief Computes degree(in, out, in+out) of all the nodes of a Graph * * @throws cugraph::logic_error when an error occurs. * - * @param[out] degree Device array of size V (V is number of vertices) initialized + * @param[out] degree Device array of size V (V is number of + * vertices) initialized * to zeros. Will contain the computed degree of every vertex. - * @param[in] x Integer value indicating type of degree calculation + * @param[in] direction Integer value indicating type of degree + * calculation * 0 : in+out degree * 1 : in-degree * 2 : out-degree */ - void degree(ET *degree, DegreeDirection direction) const; + void degree(edge_t *degree, DegreeDirection direction) const; /** * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This + * GraphCSRView does not own the memory used to represent this + * graph. This * function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * @param offsets This array of size V+1 (V is number of + * vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, + * E] (number of * edges). - * @param indices This array of size E contains the index of the destination for + * @param indices This array of size E contains the index of + * the destination for * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. 
+ * @param edge_data This array of size E (number of edges) + * contains the weight for + * each edge. This array can be null in which case the graph is considered + * unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCompressedSparseBaseView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphViewBase(edge_data_, number_of_vertices_, number_of_edges_), - offsets{offsets_}, - indices{indices_} + GraphCompressedSparseBaseView(edge_t *offsets, + vertex_t *indices, + weight_t *edge_data, + vertex_t number_of_vertices, + edge_t number_of_edges) + : GraphViewBase(edge_data, number_of_vertices, number_of_edges), + offsets{offsets}, + indices{indices} { } }; @@ -199,37 +235,49 @@ class GraphCompressedSparseBaseView : public GraphViewBase { /** * @brief A graph stored in CSR (Compressed Sparse Row) format. * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCSRView : public GraphCompressedSparseBaseView { +template +class GraphCSRView : public GraphCompressedSparseBaseView { public: /** * @brief Default constructor */ - GraphCSRView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSRView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) + { + } /** * @brief Wrap existing arrays representing adjacency lists in a Graph. - * GraphCSRView does not own the memory used to represent this graph. This + * GraphCSRView does not own the memory used to represent this + * graph. This * function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * @param offsets This array of size V+1 (V is number of + * vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, + * E] (number of * edges). - * @param indices This array of size E contains the index of the destination for + * @param indices This array of size E contains the index of + * the destination for * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. + * @param edge_data This array of size E (number of edges) + * contains the weight for + * each edge. This array can be null in which case the graph is considered + * unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSRView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + GraphCSRView(edge_t *offsets, + vertex_t *indices, + weight_t *edge_data, + vertex_t number_of_vertices, + edge_t number_of_edges) + : GraphCompressedSparseBaseView( + offsets, indices, edge_data, number_of_vertices, number_of_edges) { } }; @@ -237,57 +285,75 @@ class GraphCSRView : public GraphCompressedSparseBaseView { /** * @brief A graph stored in CSC (Compressed Sparse Column) format. 
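// Sketch of the degree query shared by the COO and compressed-sparse views
// (illustrative only, assuming the IN_PLUS_OUT enumerator spelled as in the
// documentation above). `degrees` is a hypothetical device array of one edge
// counter per vertex which, per that documentation, starts zero-initialized.
//
//   csr_view.degree(degrees, cugraph::DegreeDirection::IN_PLUS_OUT);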
* - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCSCView : public GraphCompressedSparseBaseView { +template +class GraphCSCView : public GraphCompressedSparseBaseView { public: /** * @brief Default constructor */ - GraphCSCView() : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) {} + GraphCSCView() + : GraphCompressedSparseBaseView(nullptr, nullptr, nullptr, 0, 0) + { + } /** - * @brief Wrap existing arrays representing transposed adjacency lists in a Graph. - * GraphCSCView does not own the memory used to represent this graph. This + * @brief Wrap existing arrays representing transposed adjacency lists in + * a Graph. + * GraphCSCView does not own the memory used to represent this + * graph. This * function does not allocate memory. * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * @param offsets This array of size V+1 (V is number of + * vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, + * E] (number of * edges). - * @param indices This array of size E contains the index of the destination for + * @param indices This array of size E contains the index of + * the destination for * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. + * @param edge_data This array of size E (number of edges) + * contains the weight for + * each edge. This array can be null in which case the graph is considered + * unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ - GraphCSCView( - ET *offsets_, VT *indices_, WT *edge_data_, VT number_of_vertices_, ET number_of_edges_) - : GraphCompressedSparseBaseView( - offsets_, indices_, edge_data_, number_of_vertices_, number_of_edges_) + GraphCSCView(edge_t *offsets, + vertex_t *indices, + weight_t *edge_data, + vertex_t number_of_vertices, + edge_t number_of_edges) + : GraphCompressedSparseBaseView( + offsets, indices, edge_data, number_of_vertices, number_of_edges) { } }; /** - * @brief TODO : Change this Take ownership of the provided graph arrays in COO format + * @brief TODO : Change this Take ownership of the provided graph arrays in + * COO format * - * @param source_indices This array of size E (number of edges) contains the index of the + * @param source_indices This array of size E (number of edges) contains + * the index of the * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the + * @param destination_indices This array of size E (number of edges) contains + * the index of the * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. + * @param edge_data This array size E (number of edges) contains + * the weight for each + * edge. This array can be null in which case the graph is considered + * unweighted. 
 * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph */ -template +template struct GraphCOOContents { - VT number_of_vertices; - ET number_of_edges; + vertex_t number_of_vertices; + edge_t number_of_edges; std::unique_ptr src_indices; std::unique_ptr dst_indices; std::unique_ptr edge_data; }; @@ -298,278 +364,291 @@ struct GraphCOOContents { * * This class will src_indices and dst_indicies (until moved) * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template +template class GraphCOO { - VT number_of_vertices_; - ET number_of_edges_; - rmm::device_buffer src_indices_{}; ///< rowInd - rmm::device_buffer dst_indices_{}; ///< colInd - rmm::device_buffer edge_data_{}; ///< CSR data + vertex_t number_of_vertices_p; + edge_t number_of_edges_p; + rmm::device_buffer src_indices_p{}; ///< rowInd + rmm::device_buffer dst_indices_p{}; ///< colInd + rmm::device_buffer edge_data_p{}; ///< CSR data public: /** * @brief Take ownership of the provided graph arrays in COO format * - * @param source_indices This array of size E (number of edges) contains the index of the - * source for each edge. Indices must be in the range [0, V-1]. - * @param destination_indices This array of size E (number of edges) contains the index of the - * destination for each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array size E (number of edges) contains the weight for each - * edge. This array can be null in which case the graph is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph + * @param has_data Whether or not the class has data, default = false + * @param stream Specify the cudaStream, default = null + * @param mr Specify the memory resource */ - GraphCOO(VT number_of_vertices, - ET number_of_edges, + GraphCOO(vertex_t number_of_vertices, + edge_t number_of_edges, bool has_data = false, cudaStream_t stream = nullptr, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : number_of_vertices_(number_of_vertices), - number_of_edges_(number_of_edges), - src_indices_(sizeof(VT) * number_of_edges, stream, mr), - dst_indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + : number_of_vertices_p(number_of_vertices), + number_of_edges_p(number_of_edges), + src_indices_p(sizeof(vertex_t) * number_of_edges, stream, mr), + dst_indices_p(sizeof(vertex_t) * number_of_edges, stream, mr), + edge_data_p((has_data ? 
sizeof(weight_t) * number_of_edges : 0), stream, mr) { } - GraphCOO(GraphCOOView const &graph, + GraphCOO(GraphCOOView const &graph, cudaStream_t stream = nullptr, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : number_of_vertices_(graph.number_of_vertices), - number_of_edges_(graph.number_of_edges), - src_indices_(graph.src_indices, graph.number_of_edges * sizeof(VT), stream, mr), - dst_indices_(graph.dst_indices, graph.number_of_edges * sizeof(VT), stream, mr) + : number_of_vertices_p(graph.number_of_vertices), + number_of_edges_p(graph.number_of_edges), + src_indices_p(graph.src_indices, graph.number_of_edges * sizeof(vertex_t), stream, mr), + dst_indices_p(graph.dst_indices, graph.number_of_edges * sizeof(vertex_t), stream, mr) { if (graph.has_data()) { - edge_data_ = - rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(WT), stream, mr}; + edge_data_p = + rmm::device_buffer{graph.edge_data, graph.number_of_edges * sizeof(weight_t), stream, mr}; } } - VT number_of_vertices(void) { return number_of_vertices_; } - ET number_of_edges(void) { return number_of_edges_; } - VT *src_indices(void) { return static_cast(src_indices_.data()); } - VT *dst_indices(void) { return static_cast(dst_indices_.data()); } - WT *edge_data(void) { return static_cast(edge_data_.data()); } + vertex_t number_of_vertices(void) { return number_of_vertices_p; } + edge_t number_of_edges(void) { return number_of_edges_p; } + vertex_t *src_indices(void) { return static_cast(src_indices_p.data()); } + vertex_t *dst_indices(void) { return static_cast(dst_indices_p.data()); } + weight_t *edge_data(void) { return static_cast(edge_data_p.data()); } - GraphCOOContents release() noexcept + GraphCOOContents release() noexcept { - VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; - return GraphCOOContents{ + vertex_t number_of_vertices = number_of_vertices_p; + edge_t number_of_edges = number_of_edges_p; + number_of_vertices_p = 0; + number_of_edges_p = 0; + return GraphCOOContents{ number_of_vertices, number_of_edges, - std::make_unique(std::move(src_indices_)), - std::make_unique(std::move(dst_indices_)), - std::make_unique(std::move(edge_data_))}; + std::make_unique(std::move(src_indices_p)), + std::make_unique(std::move(dst_indices_p)), + std::make_unique(std::move(edge_data_p))}; } - GraphCOOView view(void) noexcept + GraphCOOView view(void) noexcept { - return GraphCOOView( - src_indices(), dst_indices(), edge_data(), number_of_vertices_, number_of_edges_); + return GraphCOOView( + src_indices(), dst_indices(), edge_data(), number_of_vertices_p, number_of_edges_p); } - bool has_data(void) { return nullptr != edge_data_.data(); } + bool has_data(void) { return nullptr != edge_data_p.data(); } }; -template +template struct GraphSparseContents { - VT number_of_vertices; - ET number_of_edges; + vertex_t number_of_vertices; + edge_t number_of_edges; std::unique_ptr offsets; std::unique_ptr indices; std::unique_ptr edge_data; }; /** - * @brief Base class for constructted graphs stored in CSR (Compressed Sparse Row) format or + * @brief Base class for constructed graphs stored in CSR (Compressed + * Sparse Row) format or * CSC (Compressed Sparse Column) format * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template +template class 
GraphCompressedSparseBase { - VT number_of_vertices_{0}; - ET number_of_edges_{0}; - rmm::device_buffer offsets_{}; ///< CSR offsets - rmm::device_buffer indices_{}; ///< CSR indices - rmm::device_buffer edge_data_{}; ///< CSR data + vertex_t number_of_vertices_p{0}; + edge_t number_of_edges_p{0}; + rmm::device_buffer offsets_p{}; ///< CSR offsets + rmm::device_buffer indices_p{}; ///< CSR indices + rmm::device_buffer edge_data_p{}; ///< CSR data - bool has_data_{false}; + bool has_data_p{false}; public: /** * @brief Take ownership of the provided graph arrays in CSR/CSC format * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph + * @param has_data Whether or not the class has data, default = false + * @param stream Specify the cudaStream, default = null + * @param mr Specify the memory resource */ - GraphCompressedSparseBase(VT number_of_vertices, - ET number_of_edges, + GraphCompressedSparseBase(vertex_t number_of_vertices, + edge_t number_of_edges, bool has_data, cudaStream_t stream, rmm::mr::device_memory_resource *mr) - : number_of_vertices_(number_of_vertices), - number_of_edges_(number_of_edges), - offsets_(sizeof(ET) * (number_of_vertices + 1), stream, mr), - indices_(sizeof(VT) * number_of_edges, stream, mr), - edge_data_((has_data ? sizeof(WT) * number_of_edges : 0), stream, mr) + : number_of_vertices_p(number_of_vertices), + number_of_edges_p(number_of_edges), + offsets_p(sizeof(edge_t) * (number_of_vertices + 1), stream, mr), + indices_p(sizeof(vertex_t) * number_of_edges, stream, mr), + edge_data_p((has_data ? 
sizeof(weight_t) * number_of_edges : 0), stream, mr) { } - GraphCompressedSparseBase(GraphSparseContents &&contents) - : number_of_vertices_(contents.number_of_vertices), - number_of_edges_(contents.number_of_edges), - offsets_(std::move(*contents.offsets.release())), - indices_(std::move(*contents.indices.release())), - edge_data_(std::move(*contents.edge_data.release())) + GraphCompressedSparseBase(GraphSparseContents &&contents) + : number_of_vertices_p(contents.number_of_vertices), + number_of_edges_p(contents.number_of_edges), + offsets_p(std::move(*contents.offsets.release())), + indices_p(std::move(*contents.indices.release())), + edge_data_p(std::move(*contents.edge_data.release())) { } - VT number_of_vertices(void) { return number_of_vertices_; } - ET number_of_edges(void) { return number_of_edges_; } - ET *offsets(void) { return static_cast(offsets_.data()); } - VT *indices(void) { return static_cast(indices_.data()); } - WT *edge_data(void) { return static_cast(edge_data_.data()); } + vertex_t number_of_vertices(void) { return number_of_vertices_p; } + edge_t number_of_edges(void) { return number_of_edges_p; } + edge_t *offsets(void) { return static_cast(offsets_p.data()); } + vertex_t *indices(void) { return static_cast(indices_p.data()); } + weight_t *edge_data(void) { return static_cast(edge_data_p.data()); } - GraphSparseContents release() noexcept + GraphSparseContents release() noexcept { - VT number_of_vertices = number_of_vertices_; - ET number_of_edges = number_of_edges_; - number_of_vertices_ = 0; - number_of_edges_ = 0; - return GraphSparseContents{ + vertex_t number_of_vertices = number_of_vertices_p; + edge_t number_of_edges = number_of_edges_p; + number_of_vertices_p = 0; + number_of_edges_p = 0; + return GraphSparseContents{ number_of_vertices, number_of_edges, - std::make_unique(std::move(offsets_)), - std::make_unique(std::move(indices_)), - std::make_unique(std::move(edge_data_))}; + std::make_unique(std::move(offsets_p)), + std::make_unique(std::move(indices_p)), + std::make_unique(std::move(edge_data_p))}; } - bool has_data(void) { return nullptr != edge_data_.data(); } + bool has_data(void) { return nullptr != edge_data_p.data(); } }; /** - * @brief A constructed graph stored in CSR (Compressed Sparse Row) format. + * @brief A constructed graph stored in CSR (Compressed Sparse Row) + * format. * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCSR : public GraphCompressedSparseBase { +template +class GraphCSR : public GraphCompressedSparseBase { public: /** * @brief Default constructor */ - GraphCSR() : GraphCompressedSparseBase() {} + GraphCSR() : GraphCompressedSparseBase() {} /** * @brief Take ownership of the provided graph arrays in CSR format * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of - * edges). - * @param indices This array of size E contains the index of the destination for - * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. 
 * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph + * @param has_data Whether or not the class has data, default = false + * @param stream Specify the cudaStream, default = null + * @param mr Specify the memory resource */ - GraphCSR(VT number_of_vertices_, - ET number_of_edges_, + GraphCSR(vertex_t number_of_vertices_, + edge_t number_of_edges_, bool has_data_ = false, cudaStream_t stream = nullptr, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : GraphCompressedSparseBase( + : GraphCompressedSparseBase( number_of_vertices_, number_of_edges_, has_data_, stream, mr) { } - GraphCSR(GraphSparseContents &&contents) - : GraphCompressedSparseBase(std::move(contents)) + GraphCSR(GraphSparseContents &&contents) + : GraphCompressedSparseBase(std::move(contents)) { } - GraphCSRView view(void) noexcept + GraphCSRView view(void) noexcept { - return GraphCSRView(GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); + return GraphCSRView( + GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); } }; /** - * @brief A constructed graph stored in CSC (Compressed Sparse Column) format. + * @brief A constructed graph stored in CSC (Compressed Sparse Column) + * format. * - * @tparam VT Type of vertex id - * @tparam ET Type of edge id - * @tparam WT Type of weight + * @tparam vertex_t Type of vertex id + * @tparam edge_t Type of edge id + * @tparam weight_t Type of weight */ -template -class GraphCSC : public GraphCompressedSparseBase { +template +class GraphCSC : public GraphCompressedSparseBase { public: /** * @brief Default constructor */ - GraphCSC() : GraphCompressedSparseBase() {} + GraphCSC() : GraphCompressedSparseBase() {} /** * @brief Take ownership of the provided graph arrays in CSR format * - * @param offsets This array of size V+1 (V is number of vertices) contains the - * offset of adjacency lists of every vertex. Offsets must be in the range [0, E] (number of + * @param offsets This array of size V+1 (V is number of + * vertices) contains the + * offset of adjacency lists of every vertex. Offsets must be in the range [0, + * E] (number of * edges). - * @param indices This array of size E contains the index of the destination for + * @param indices This array of size E contains the index of + * the destination for * each edge. Indices must be in the range [0, V-1]. - * @param edge_data This array of size E (number of edges) contains the weight for - * each edge. This array can be null in which case the graph is considered unweighted. 
 * @param number_of_vertices The number of vertices in the graph * @param number_of_edges The number of edges in the graph + * @param has_data Whether or not the class has data, default = false + * @param stream Specify the cudaStream, default = null + * @param mr Specify the memory resource */ - GraphCSC(VT number_of_vertices_, - ET number_of_edges_, - bool has_data_ = false, + GraphCSC(vertex_t number_of_vertices_in, + edge_t number_of_edges_in, + bool has_data_in = false, cudaStream_t stream = nullptr, rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource()) - : GraphCompressedSparseBase( - number_of_vertices_, number_of_edges_, has_data_, stream, mr) + : GraphCompressedSparseBase( + number_of_vertices_in, number_of_edges_in, has_data_in, stream, mr) { } - GraphCSC(GraphSparseContents &&contents) - : GraphCompressedSparseBase(contents) + GraphCSC(GraphSparseContents &&contents) + : GraphCompressedSparseBase(contents) { } - GraphCSCView view(void) noexcept + GraphCSCView view(void) noexcept { - return GraphCSCView(GraphCompressedSparseBase::offsets(), - GraphCompressedSparseBase::indices(), - GraphCompressedSparseBase::edge_data(), - GraphCompressedSparseBase::number_of_vertices(), - GraphCompressedSparseBase::number_of_edges()); + return GraphCSCView( + GraphCompressedSparseBase::offsets(), + GraphCompressedSparseBase::indices(), + GraphCompressedSparseBase::edge_data(), + GraphCompressedSparseBase::number_of_vertices(), + GraphCompressedSparseBase::number_of_edges()); } }; -} // namespace experimental +template +struct invalid_idx; + +template +struct invalid_idx< + T, + typename std::enable_if_t::value && std::is_signed::value>> + : std::integral_constant { +}; + +template +struct invalid_idx< + T, + typename std::enable_if_t::value && std::is_unsigned::value>> + : std::integral_constant::max()> { +}; + +template +struct invalid_vertex_id : invalid_idx { +}; + +template +struct invalid_edge_id : invalid_idx { +}; } // namespace cugraph diff --git a/cpp/include/utilities/error.hpp b/cpp/include/utilities/error.hpp new file mode 100644 index 00000000000..e44e2c910ea --- /dev/null +++ b/cpp/include/utilities/error.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cugraph { + +/** + * @brief Exception thrown when a logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * CUGRAPH_EXPECTS and CUGRAPH_FAIL macros. 
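// Usage sketch for the two macros defined below (illustrative, not part of the
// patch); the format string plus optional arguments follow raft's SET_ERROR_MSG
// conventions, and the invalid_vertex_id sentinel comes from the traits just
// added at the end of graph.hpp above.
//
//   CUGRAPH_EXPECTS(offsets != nullptr, "Invalid input argument: offsets is null");
//   CUGRAPH_EXPECTS(v != cugraph::invalid_vertex_id<int>::value, "Invalid vertex id");
//   if (unsupported_case) { CUGRAPH_FAIL("Unsupported graph configuration"); }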
+ * + */ +struct logic_error : public raft::exception { + explicit logic_error(char const* const message) : raft::exception(message) {} + explicit logic_error(std::string const& message) : raft::exception(message) {} +}; + +} // namespace cugraph + +/** + * @brief Macro for checking (pre-)conditions that throws an exception when a condition is false + * + * @param[in] cond Expression that evaluates to true or false + * @param[in] fmt String literal description of the reason that cond is expected to be true with + * optional format tags + * @throw cugraph::logic_error if the condition evaluates to false. + */ +#define CUGRAPH_EXPECTS(cond, fmt, ...) \ + do { \ + if (!(cond)) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, "cuGraph failure at ", fmt, ##__VA_ARGS__); \ + throw cugraph::logic_error(msg); \ + } \ + } while (0) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * @param[in] fmt String literal description of the reason that this code path is erroneous with + * optional format tags + * @throw always throws cugraph::logic_error + */ +#define CUGRAPH_FAIL(fmt, ...) \ + do { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, "cuGraph failure at ", fmt, ##__VA_ARGS__); \ + throw cugraph::logic_error(msg); \ + } while (0) diff --git a/cpp/src/centrality/betweenness_centrality.cu b/cpp/src/centrality/betweenness_centrality.cu index 5948c6f9ec9..8ff62f7ddb6 100644 --- a/cpp/src/centrality/betweenness_centrality.cu +++ b/cpp/src/centrality/betweenness_centrality.cu @@ -18,148 +18,207 @@ #include +#include + #include #include +#include +#include -#include - +#include #include "betweenness_centrality.cuh" +#include "betweenness_centrality_kernels.cuh" namespace cugraph { namespace detail { +namespace { +template +void betweenness_centrality_impl(raft::handle_t const &handle, + GraphCSRView const &graph, + result_t *result, + bool normalize, + bool endpoints, + weight_t const *weight, + vertex_t number_of_sources, + vertex_t const *sources, + vertex_t total_number_of_sources) +{ + // Current Implementation relies on BFS + // FIXME: For SSSP version + // Brandes Algorithm expects non negative weights for the accumulation + bool is_edge_betweenness = false; + verify_betweenness_centrality_input( + result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); + cugraph::detail::BC bc(handle, graph); + bc.configure( + result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); + bc.compute(); + bc.rescale_by_total_sources_used(total_number_of_sources); +} -template -void BC::setup() +template +void edge_betweenness_centrality_impl(raft::handle_t const &handle, + GraphCSRView const &graph, + result_t *result, + bool normalize, + weight_t const *weight, + vertex_t number_of_sources, + vertex_t const *sources, + vertex_t total_number_of_sources) { - // --- Set up parameters from graph adjList --- - number_of_vertices = graph.number_of_vertices; - number_of_edges = graph.number_of_edges; - offsets_ptr = graph.offsets; - indices_ptr = graph.indices; + // Current Implementation relies on BFS + // FIXME: For SSSP version + // Brandes Algorithm expects non negative weights for the accumulation + bool is_edge_betweenness = true; + bool endpoints = false; + verify_betweenness_centrality_input( + result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); + cugraph::detail::BC bc(handle, graph); + bc.configure( + result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); + 
bc.compute(); + // NOTE: As of 07/2020 NetworkX does not apply rescaling based on number + // of sources + // bc.rescale_by_total_sources_used(total_number_of_sources); } +template +vertex_t get_total_number_of_sources(raft::handle_t const &handle, vertex_t local_number_of_sources) +{ + vertex_t total_number_of_sources_used = local_number_of_sources; + if (handle.comms_initialized()) { + rmm::device_scalar d_number_of_sources(local_number_of_sources, handle.get_stream()); + handle.get_comms().allreduce(d_number_of_sources.data(), + d_number_of_sources.data(), + 1, + raft::comms::op_t::SUM, + handle.get_stream()); + total_number_of_sources_used = d_number_of_sources.value(handle.get_stream()); + // CUDA_TRY( + // cudaMemcpy(&total_number_of_sources_used, data, sizeof(vertex_t), cudaMemcpyDeviceToHost)); + } + return total_number_of_sources_used; +} +} // namespace -template -void BC::configure(result_t *_betweenness, - bool _normalized, - bool _endpoints, - WT const *_weights, - VT const *_sources, - VT _number_of_sources) +template +void verify_betweenness_centrality_input(result_t *result, + bool is_edge_betweenness, + bool normalize, + bool endpoints, + weight_t const *weights, + vertex_t const number_of_sources, + vertex_t const *sources) +{ + static_assert(std::is_same::value, "vertex_t should be int"); + static_assert(std::is_same::value, "edge_t should be int"); + static_assert(std::is_same::value || std::is_same::value, + "weight_t should be float or double"); + static_assert(std::is_same::value || std::is_same::value, + "result_t should be float or double"); + + CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: betweenness pointer is NULL"); + CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be non-negative."); + if (number_of_sources != 0) { + CUGRAPH_EXPECTS(sources != nullptr, + "Sources cannot be NULL if number_of_sources is different from 0."); + } + if (is_edge_betweenness) { + CUGRAPH_EXPECTS(!endpoints, "Endpoints is not supported for edge betweenness centrality."); + } +} + +template +void BC::setup() +{ + number_of_vertices_ = graph_.number_of_vertices; + number_of_edges_ = graph_.number_of_edges; + offsets_ptr_ = graph_.offsets; + indices_ptr_ = graph_.indices; +} + +template +void BC::configure(result_t *betweenness, + bool is_edge_betweenness, + bool normalized, + bool endpoints, + weight_t const *weights, + vertex_t const *sources, + vertex_t number_of_sources) { // --- Bind betweenness output vector to internal --- - betweenness = _betweenness; - normalized = _normalized; - endpoints = _endpoints; - sources = _sources; - number_of_sources = _number_of_sources; - edge_weights_ptr = _weights; + betweenness_ = betweenness; + normalized_ = normalized; + endpoints_ = endpoints; + sources_ = sources; + number_of_sources_ = number_of_sources; + edge_weights_ptr_ = weights; + is_edge_betweenness_ = is_edge_betweenness; // --- Working data allocation --- - distances_vec.resize(number_of_vertices); - predecessors_vec.resize(number_of_vertices); - sp_counters_vec.resize(number_of_vertices); - deltas_vec.resize(number_of_vertices); - - distances = distances_vec.data().get(); - predecessors = predecessors_vec.data().get(); - sp_counters = sp_counters_vec.data().get(); - deltas = deltas_vec.data().get(); + initialize_work_vectors(); + initialize_pointers_to_vectors(); // --- Get Device Information --- - CUDA_TRY(cudaGetDevice(&device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&max_grid_dim_1D, cudaDevAttrMaxGridDimX, device_id)); - 
CUDA_TRY(cudaDeviceGetAttribute(&max_block_dim_1D, cudaDevAttrMaxBlockDimX, device_id)); + initialize_device_information(); // --- Confirm that configuration went through --- - configured = true; + configured_ = true; } -// Dependecy Accumulation: McLaughlin and Bader, 2018 -// NOTE: Accumulation kernel might not scale well, as each thread is handling -// all the edges for each node, an approach similar to the traversal -// bucket (i.e. BFS / SSSP) system might enable speed up -// NOTE: Shortest Path counter can increase extremely fast, thus double are used -// however, the user might want to get the result back in float -// we delay casting the result until dependecy accumulation -template -__global__ void accumulation_kernel(result_t *betweenness, - VT number_vertices, - VT const *indices, - ET const *offsets, - VT *distances, - double *sp_counters, - double *deltas, - VT source, - VT depth) +template +void BC::initialize_work_vectors() { - for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < number_vertices; - tid += gridDim.x * blockDim.x) { - VT w = tid; - double dsw = 0; - double sw = sp_counters[w]; - if (distances[w] == depth) { // Process nodes at this depth - ET edge_start = offsets[w]; - ET edge_end = offsets[w + 1]; - ET edge_count = edge_end - edge_start; - for (ET edge_idx = 0; edge_idx < edge_count; ++edge_idx) { // Visit neighbors - VT v = indices[edge_start + edge_idx]; - if (distances[v] == distances[w] + 1) { - double factor = (static_cast(1) + deltas[v]) / sp_counters[v]; - dsw += sw * factor; - } - } - deltas[w] = dsw; - } - } + distances_vec_.resize(number_of_vertices_); + predecessors_vec_.resize(number_of_vertices_); + sp_counters_vec_.resize(number_of_vertices_); + deltas_vec_.resize(number_of_vertices_); } -template -void BC::accumulate(result_t *betweenness, - VT *distances, - double *sp_counters, - double *deltas, - VT source, - VT max_depth) +template +void BC::initialize_pointers_to_vectors() { - dim3 grid, block; - block.x = max_block_dim_1D; - grid.x = min(max_grid_dim_1D, (number_of_edges / block.x + 1)); - // Step 1) Dependencies (deltas) are initialized to 0 before starting - thrust::fill(rmm::exec_policy(stream)->on(stream), - deltas, - deltas + number_of_vertices, - static_cast(0)); - // Step 2) Process each node, -1 is used to notify unreached nodes in the sssp - for (VT depth = max_depth; depth > 0; --depth) { - accumulation_kernel<<>>(betweenness, - number_of_vertices, - graph.indices, - graph.offsets, - distances, - sp_counters, - deltas, - source, - depth); - } + distances_ = distances_vec_.data().get(); + predecessors_ = predecessors_vec_.data().get(); + sp_counters_ = sp_counters_vec_.data().get(); + deltas_ = deltas_vec_.data().get(); +} - thrust::transform(rmm::exec_policy(stream)->on(stream), - deltas, - deltas + number_of_vertices, - betweenness, - betweenness, - thrust::plus()); +template +void BC::initialize_device_information() +{ + max_grid_dim_1D_ = handle_.get_device_properties().maxGridSize[0]; + max_block_dim_1D_ = handle_.get_device_properties().maxThreadsDim[0]; } -// We do not verifiy the graph structure as the new graph structure -// enforces CSR Format +template +void BC::compute() +{ + CUGRAPH_EXPECTS(configured_, "BC must be configured before computation"); + if (sources_) { + for (vertex_t source_idx = 0; source_idx < number_of_sources_; ++source_idx) { + vertex_t source_vertex = sources_[source_idx]; + compute_single_source(source_vertex); + } + } else { + for (vertex_t source_vertex = 0; source_vertex < 
number_of_vertices_; ++source_vertex) { + compute_single_source(source_vertex); + } + } + rescale(); +} -// FIXME: Having a system that relies on an class might make it harder to -// dispatch later -template -void BC::compute_single_source(VT source_vertex) +template +void BC::compute_single_source(vertex_t source_vertex) { // Step 1) Singe-source shortest-path problem - cugraph::bfs(graph, distances, predecessors, sp_counters, source_vertex, graph.prop.directed); + cugraph::bfs(handle_, + graph_, + distances_, + predecessors_, + sp_counters_, + source_vertex, + graph_.prop.directed, + true); // FIXME: Remove that with a BC specific class to gather // information during traversal @@ -168,166 +227,335 @@ void BC::compute_single_source(VT source_vertex) // the traversal, this value is avalaible within the bfs implementation and // there could be a way to access it directly and avoid both replace and the // max - thrust::replace(rmm::exec_policy(stream)->on(stream), - distances, - distances + number_of_vertices, - std::numeric_limits::max(), - static_cast(-1)); - auto current_max_depth = thrust::max_element( - rmm::exec_policy(stream)->on(stream), distances, distances + number_of_vertices); - VT max_depth = 0; - cudaMemcpy(&max_depth, current_max_depth, sizeof(VT), cudaMemcpyDeviceToHost); + thrust::replace(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + distances_, + distances_ + number_of_vertices_, + std::numeric_limits::max(), + static_cast(-1)); + auto current_max_depth = + thrust::max_element(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + distances_, + distances_ + number_of_vertices_); + vertex_t max_depth = 0; + CUDA_TRY(cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); // Step 2) Dependency accumulation - accumulate(betweenness, distances, sp_counters, deltas, source_vertex, max_depth); + accumulate(source_vertex, max_depth); +} + +template +void BC::accumulate(vertex_t source_vertex, + vertex_t max_depth) +{ + dim3 grid_configuration, block_configuration; + block_configuration.x = max_block_dim_1D_; + grid_configuration.x = min(max_grid_dim_1D_, (number_of_edges_ / block_configuration.x + 1)); + + initialize_dependencies(); + + if (is_edge_betweenness_) { + accumulate_edges(max_depth, grid_configuration, block_configuration); + } else if (endpoints_) { + accumulate_vertices_with_endpoints( + source_vertex, max_depth, grid_configuration, block_configuration); + } else { + accumulate_vertices(max_depth, grid_configuration, block_configuration); + } } -template -void BC::compute() +template +void BC::initialize_dependencies() { - CUGRAPH_EXPECTS(configured, "BC must be configured before computation"); - // If sources is defined we only process vertices contained in it - thrust::fill(rmm::exec_policy(stream)->on(stream), - betweenness, - betweenness + number_of_vertices, + thrust::fill(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + deltas_, + deltas_ + number_of_vertices_, static_cast(0)); - cudaStreamSynchronize(stream); - if (sources) { - for (VT source_idx = 0; source_idx < number_of_sources; ++source_idx) { - VT source_vertex = sources[source_idx]; - compute_single_source(source_vertex); - } - } else { // Otherwise process every vertices - // NOTE: Maybe we could still use number of sources and set it to number_of_vertices? 
- // It woudl imply having a host vector of size |V| - // But no need for the if/ else statement - for (VT source_vertex = 0; source_vertex < number_of_vertices; ++source_vertex) { - compute_single_source(source_vertex); - } +} +template +void BC::accumulate_edges(vertex_t max_depth, + dim3 grid_configuration, + dim3 block_configuration) +{ + for (vertex_t depth = max_depth; depth >= 0; --depth) { + edges_accumulation_kernel + <<>>(betweenness_, + number_of_vertices_, + graph_.indices, + graph_.offsets, + distances_, + sp_counters_, + deltas_, + depth); } - rescale(); } -template -void BC::rescale() +template +void BC::accumulate_vertices_with_endpoints( + vertex_t source_vertex, vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration) { - thrust::device_vector normalizer(number_of_vertices); - bool modified = false; - result_t rescale_factor = static_cast(1); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - result_t casted_number_of_sources = static_cast(number_of_sources); - if (normalized) { - if (number_of_vertices > 2) { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - modified = true; + for (vertex_t depth = max_depth; depth > 0; --depth) { + endpoints_accumulation_kernel + <<>>(betweenness_, + number_of_vertices_, + graph_.indices, + graph_.offsets, + distances_, + sp_counters_, + deltas_, + depth); + } + add_reached_endpoints_to_source_betweenness(source_vertex); + add_vertices_dependencies_to_betweenness(); +} + +// Distances should contain -1 for unreached nodes, + +// FIXME: There might be a cleaner way to add a value to a single +// score in the betweenness vector +template +void BC::add_reached_endpoints_to_source_betweenness( + vertex_t source_vertex) +{ + vertex_t number_of_unvisited_vertices = + thrust::count(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + distances_, + distances_ + number_of_vertices_, + -1); + vertex_t number_of_visited_vertices_except_source = + number_of_vertices_ - number_of_unvisited_vertices - 1; + rmm::device_vector buffer(1); + buffer[0] = number_of_visited_vertices_except_source; + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + buffer.begin(), + buffer.end(), + betweenness_ + source_vertex, + betweenness_ + source_vertex, + thrust::plus()); +} + +template +void BC::add_vertices_dependencies_to_betweenness() +{ + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + deltas_, + deltas_ + number_of_vertices_, + betweenness_, + betweenness_, + thrust::plus()); +} + +template +void BC::accumulate_vertices(vertex_t max_depth, + dim3 grid_configuration, + dim3 block_configuration) +{ + for (vertex_t depth = max_depth; depth > 0; --depth) { + accumulation_kernel + <<>>(betweenness_, + number_of_vertices_, + graph_.indices, + graph_.offsets, + distances_, + sp_counters_, + deltas_, + depth); + } + add_vertices_dependencies_to_betweenness(); +} + +template +void BC::rescale() +{ + bool modified = false; + result_t rescale_factor = static_cast(1); + if (normalized_) { + if (is_edge_betweenness_) { + std::tie(rescale_factor, modified) = + rescale_edges_betweenness_centrality(rescale_factor, modified); + } else { + std::tie(rescale_factor, modified) = + rescale_vertices_betweenness_centrality(rescale_factor, modified); } } else { - if (!graph.prop.directed) { + if (!graph_.prop.directed) { rescale_factor /= static_cast(2); modified = true; } } - if (modified) { - if (number_of_sources > 0) { 
- rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); + apply_rescale_factor_to_betweenness(rescale_factor); +} + +template +std::tuple +BC::rescale_edges_betweenness_centrality( + result_t rescale_factor, bool modified) +{ + result_t casted_number_of_vertices_ = static_cast(number_of_vertices_); + if (number_of_vertices_ > 1) { + rescale_factor /= ((casted_number_of_vertices_) * (casted_number_of_vertices_ - 1)); + modified = true; + } + return std::make_tuple(rescale_factor, modified); +} + +template +std::tuple +BC::rescale_vertices_betweenness_centrality( + result_t rescale_factor, bool modified) +{ + result_t casted_number_of_vertices = static_cast(number_of_vertices_); + if (number_of_vertices_ > 2) { + if (endpoints_) { + rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); + } else { + rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); } + modified = true; } - thrust::fill(normalizer.begin(), normalizer.end(), rescale_factor); - thrust::transform(rmm::exec_policy(stream)->on(stream), - betweenness, - betweenness + number_of_vertices, - normalizer.begin(), - betweenness, + return std::make_tuple(rescale_factor, modified); +} + +template +void BC::apply_rescale_factor_to_betweenness( + result_t rescale_factor) +{ + size_t result_size = number_of_vertices_; + if (is_edge_betweenness_) result_size = number_of_edges_; + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + betweenness_, + betweenness_ + result_size, + thrust::make_constant_iterator(rescale_factor), + betweenness_, thrust::multiplies()); } -template -void verify_input(result_t *result, - bool normalize, - bool endpoints, - WT const *weights, - VT const number_of_sources, - VT const *sources) +template +void BC::rescale_by_total_sources_used( + vertex_t total_number_of_sources_used) { - CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: output betwenness is nullptr"); - if (typeid(VT) != typeid(int)) { - CUGRAPH_FAIL("Unsupported vertex id data type, please use int"); - } - if (typeid(ET) != typeid(int)) { CUGRAPH_FAIL("Unsupported edge id data type, please use int"); } - if (typeid(WT) != typeid(float) && typeid(WT) != typeid(double)) { - CUGRAPH_FAIL("Unsupported weight data type, please use float or double"); - } - if (typeid(result_t) != typeid(float) && typeid(result_t) != typeid(double)) { - CUGRAPH_FAIL("Unsupported result data type, please use float or double"); - } - if (number_of_sources < 0) { - CUGRAPH_FAIL("Number of sources must be positive or equal to 0."); - } else if (number_of_sources != 0) { - CUGRAPH_EXPECTS(sources != nullptr, - "sources cannot be null if number_of_source is different from 0."); + result_t rescale_factor = static_cast(1); + result_t casted_total_number_of_sources_used = + static_cast(total_number_of_sources_used); + result_t casted_number_of_vertices = static_cast(number_of_vertices_); + + if (normalized_) { + if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { + rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); + } + } else if (!graph_.prop.directed) { + if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { + rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); + } } - if (endpoints) { CUGRAPH_FAIL("Endpoints option is currently not supported."); } + apply_rescale_factor_to_betweenness(rescale_factor); } -/** - * 
---------------------------------------------------------------------------* - * @brief Native betweenness centrality - * - * @file betweenness_centrality.cu - * --------------------------------------------------------------------------*/ -template -void betweenness_centrality(experimental::GraphCSRView const &graph, +} // namespace detail + +template +void betweenness_centrality(raft::handle_t const &handle, + GraphCSRView const &graph, result_t *result, bool normalize, bool endpoints, - WT const *weight, - VT const number_of_sources, - VT const *sources) + weight_t const *weight, + vertex_t k, + vertex_t const *vertices) { - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - verify_input( - result, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(graph); - bc.configure(result, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); + vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); + if (handle.comms_initialized()) { + rmm::device_vector betweenness(graph.number_of_vertices, 0); + detail::betweenness_centrality_impl(handle, + graph, + betweenness.data().get(), + normalize, + endpoints, + weight, + k, + vertices, + total_number_of_sources_used); + handle.get_comms().reduce(betweenness.data().get(), + result, + betweenness.size(), + raft::comms::op_t::SUM, + 0, + handle.get_stream()); + } else { + detail::betweenness_centrality_impl(handle, + graph, + result, + normalize, + endpoints, + weight, + k, + vertices, + total_number_of_sources_used); + } } -} // namespace detail -/** - * @param[out] result array(number_of_vertices) - * @param[in] normalize bool True -> Apply normalization - * @param[in] endpoints (NIY) bool Include endpoints - * @param[in] weights (NIY) array(number_of_edges) Weights to use - * @param[in] k Number of sources - * @param[in] vertices array(k) Sources for traversal - */ -template -void betweenness_centrality(experimental::GraphCSRView const &graph, - result_t *result, - bool normalize, - bool endpoints, - WT const *weight, - VT k, - VT const *vertices) +template void betweenness_centrality(const raft::handle_t &, + GraphCSRView const &, + float *, + bool, + bool, + float const *, + int, + int const *); +template void betweenness_centrality( + const raft::handle_t &, + GraphCSRView const &, + double *, + bool, + bool, + double const *, + int, + int const *); + +template +void edge_betweenness_centrality(raft::handle_t const &handle, + GraphCSRView const &graph, + result_t *result, + bool normalize, + weight_t const *weight, + vertex_t k, + vertex_t const *vertices) { - detail::betweenness_centrality(graph, result, normalize, endpoints, weight, k, vertices); + vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); + if (handle.comms_initialized()) { + rmm::device_vector betweenness(graph.number_of_edges, 0); + detail::edge_betweenness_centrality_impl(handle, + graph, + betweenness.data().get(), + normalize, + weight, + k, + vertices, + total_number_of_sources_used); + handle.get_comms().reduce(betweenness.data().get(), + result, + betweenness.size(), + raft::comms::op_t::SUM, + 0, + handle.get_stream()); + } else { + detail::edge_betweenness_centrality_impl( + handle, graph, result, normalize, weight, k, vertices, total_number_of_sources_used); + } } -template void betweenness_centrality( - experimental::GraphCSRView const &, +template void 
edge_betweenness_centrality( + const raft::handle_t &, + GraphCSRView const &, float *, bool, - bool, float const *, int, int const *); -template void betweenness_centrality( - experimental::GraphCSRView const &, + +template void edge_betweenness_centrality( + raft::handle_t const &handle, + GraphCSRView const &, double *, bool, - bool, double const *, int, int const *); - } // namespace cugraph diff --git a/cpp/src/centrality/betweenness_centrality.cuh b/cpp/src/centrality/betweenness_centrality.cuh index d4f448618e2..418ac06faa4 100644 --- a/cpp/src/centrality/betweenness_centrality.cuh +++ b/cpp/src/centrality/betweenness_centrality.cuh @@ -15,79 +15,134 @@ */ // Author: Xavier Cadet xcadet@nvidia.com + #pragma once #include namespace cugraph { namespace detail { -template +template +void betweenness_centrality(raft::handle_t const &handle, + GraphCSRView const &graph, + result_t *result, + bool normalize, + bool endpoints, + weight_t const *weight, + vertex_t const number_of_sources, + vertex_t const *sources); + +template +void edge_betweenness_centrality(GraphCSRView const &graph, + result_t *result, + bool normalize, + weight_t const *weight, + vertex_t const number_of_sources, + vertex_t const *sources); + +template +void verify_betweenness_centrality_input(result_t *result, + bool is_edge_betweenness, + bool normalize, + bool endpoints, + weight_t const *weights, + vertex_t const number_of_sources, + vertex_t const *sources); + +template class BC { + public: + virtual ~BC(void) {} + BC(raft::handle_t const &handle, + GraphCSRView const &graph, + cudaStream_t stream = 0) + : handle_(handle), graph_(graph) + { + setup(); + } + void configure(result_t *betweenness, + bool is_edge_betweenness, + bool normalize, + bool endpoints, + weight_t const *weight, + vertex_t const *sources, + vertex_t const number_of_sources); + + void configure_edge(result_t *betweenness, + bool normalize, + weight_t const *weight, + vertex_t const *sources, + vertex_t const number_of_sources); + void compute(); + void rescale_by_total_sources_used(vertex_t total_number_of_sources_used); + private: + // --- RAFT handle --- + raft::handle_t const &handle_; // --- Information concerning the graph --- - const experimental::GraphCSRView &graph; + const GraphCSRView &graph_; // --- These information are extracted on setup --- - VT number_of_vertices; // Number of vertices in the graph - VT number_of_edges; // Number of edges in the graph - ET const *offsets_ptr; // Pointer to the offsets - VT const *indices_ptr; // Pointers to the indices + vertex_t number_of_vertices_; // Number of vertices in the graph + vertex_t number_of_edges_; // Number of edges in the graph + edge_t const *offsets_ptr_; // Pointer to the offsets + vertex_t const *indices_ptr_; // Pointers to the indices // --- Information from configuration --- - bool configured = false; // Flag to ensure configuration was called - bool normalized = false; // If True normalize the betweenness + bool configured_ = false; // Flag to ensure configuration was called + bool normalized_ = false; // If True normalize the betweenness + bool is_edge_betweenness_ = false; // If True compute edge_betweeness + // FIXME: For weighted version - WT const *edge_weights_ptr = nullptr; // Pointer to the weights - bool endpoints = false; // If True normalize the betweenness - VT const *sources = nullptr; // Subset of vertices to gather information from - VT number_of_sources; // Number of vertices in sources + weight_t const *edge_weights_ptr_ = nullptr; // Pointer to the 
weights + bool endpoints_ = false; // If True normalize the betweenness + vertex_t const *sources_ = nullptr; // Subset of vertices to gather information from + vertex_t number_of_sources_; // Number of vertices in sources // --- Output ---- // betweenness is set/read by users - using Vectors - result_t *betweenness = nullptr; + result_t *betweenness_ = nullptr; // --- Data required to perform computation ---- - rmm::device_vector distances_vec; - rmm::device_vector predecessors_vec; - rmm::device_vector sp_counters_vec; - rmm::device_vector deltas_vec; - - VT *distances = nullptr; // array(|V|) stores the distances gathered by the latest SSSP - VT *predecessors = nullptr; // array(|V|) stores the predecessors of the latest SSSP - double *sp_counters = - nullptr; // array(|V|) stores the shortest path counter for the latest SSSP - double *deltas = nullptr; // array(|V|) stores the dependencies for the latest SSSP - - // FIXME: This should be replaced using RAFT handle - int device_id = 0; - int max_grid_dim_1D = 0; - int max_block_dim_1D = 0; - cudaStream_t stream; - - // ----------------------------------------------------------------------- - void setup(); // Saves information related to the graph itself - - void accumulate(result_t *betweenness, - VT *distances, - double *sp_counters, - double *deltas, - VT source, - VT max_depth); - void compute_single_source(VT source_vertex); - void rescale(); + rmm::device_vector distances_vec_; + rmm::device_vector predecessors_vec_; + rmm::device_vector sp_counters_vec_; + rmm::device_vector deltas_vec_; - public: - virtual ~BC(void) {} - BC(experimental::GraphCSRView const &_graph, cudaStream_t _stream = 0) - : graph(_graph), stream(_stream) - { - setup(); - } - void configure(result_t *betweenness, - bool normalize, - bool endpoints, - WT const *weigth, - VT const *sources, - VT const number_of_sources); - void compute(); + vertex_t *distances_ = + nullptr; // array(|V|) stores the distances gathered by the latest SSSP + vertex_t *predecessors_ = + nullptr; // array(|V|) stores the predecessors of the latest SSSP + double *sp_counters_ = + nullptr; // array(|V|) stores the shortest path counter for the latest SSSP + double *deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP + + int max_grid_dim_1D_ = 0; + int max_block_dim_1D_ = 0; + + void setup(); + + void initialize_work_vectors(); + void initialize_pointers_to_vectors(); + void initialize_device_information(); + + void compute_single_source(vertex_t source_vertex); + + void accumulate(vertex_t source_vertex, vertex_t max_depth); + void initialize_dependencies(); + void accumulate_edges(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); + void accumulate_vertices_with_endpoints(vertex_t source_vertex, + vertex_t max_depth, + dim3 grid_configuration, + dim3 block_configuration); + void accumulate_vertices(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); + void add_reached_endpoints_to_source_betweenness(vertex_t source_vertex); + void add_vertices_dependencies_to_betweenness(); + + void rescale(); + std::tuple rescale_vertices_betweenness_centrality(result_t rescale_factor, + bool modified); + std::tuple rescale_edges_betweenness_centrality(result_t rescale_factor, + bool modified); + void apply_rescale_factor_to_betweenness(result_t scaling_factor); }; } // namespace detail } // namespace cugraph diff --git a/cpp/src/centrality/betweenness_centrality_kernels.cuh b/cpp/src/centrality/betweenness_centrality_kernels.cuh new 
file mode 100644 index 00000000000..3cb5add8ad6
--- /dev/null
+++ b/cpp/src/centrality/betweenness_centrality_kernels.cuh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cugraph {
+namespace detail {
+// Dependency Accumulation: based on McLaughlin and Bader, 2018
+// FIXME: Accumulation kernel might not scale well, as each thread is handling
+//        all the edges for each node, an approach similar to the traversal
+//        bucket (i.e. BFS / SSSP) system might enable speed up.
+//        Should look into forAllEdge type primitive for different
+//        load balancing
+template <typename vertex_t, typename edge_t, typename result_t>
+__global__ void edges_accumulation_kernel(result_t *betweenness,
+                                          vertex_t number_vertices,
+                                          vertex_t const *indices,
+                                          edge_t const *offsets,
+                                          vertex_t *distances,
+                                          double *sp_counters,
+                                          double *deltas,
+                                          vertex_t depth)
+{
+  for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices;
+       thread_idx += gridDim.x * blockDim.x) {
+    vertex_t vertex     = thread_idx;
+    double vertex_delta = 0;
+    double vertex_sigma = sp_counters[vertex];
+    if (distances[vertex] == depth) {
+      edge_t first_edge_idx = offsets[vertex];
+      edge_t last_edge_idx  = offsets[vertex + 1];
+      for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) {
+        vertex_t successor = indices[edge_idx];
+        if (distances[successor] == distances[vertex] + 1) {
+          double factor = (static_cast<double>(1) + deltas[successor]) / sp_counters[successor];
+          double coefficient = vertex_sigma * factor;
+
+          vertex_delta += coefficient;
+          betweenness[edge_idx] += coefficient;
+        }
+      }
+      deltas[vertex] = vertex_delta;
+    }
+  }
+}
+
+template <typename vertex_t, typename edge_t, typename result_t>
+__global__ void endpoints_accumulation_kernel(result_t *betweenness,
+                                              vertex_t number_vertices,
+                                              vertex_t const *indices,
+                                              edge_t const *offsets,
+                                              vertex_t *distances,
+                                              double *sp_counters,
+                                              double *deltas,
+                                              vertex_t depth)
+{
+  for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices;
+       thread_idx += gridDim.x * blockDim.x) {
+    vertex_t vertex     = thread_idx;
+    double vertex_delta = 0;
+    double vertex_sigma = sp_counters[vertex];
+    if (distances[vertex] == depth) {
+      edge_t first_edge_idx = offsets[vertex];
+      edge_t last_edge_idx  = offsets[vertex + 1];
+      for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) {
+        vertex_t successor = indices[edge_idx];
+        if (distances[successor] == distances[vertex] + 1) {
+          double factor = (static_cast<double>(1) + deltas[successor]) / sp_counters[successor];
+          vertex_delta += vertex_sigma * factor;
+        }
+      }
+      betweenness[vertex] += 1;
+      deltas[vertex] = vertex_delta;
+    }
+  }
+}
+template <typename vertex_t, typename edge_t, typename result_t>
+__global__ void accumulation_kernel(result_t *betweenness,
+                                    vertex_t number_vertices,
+                                    vertex_t const *indices,
+                                    edge_t const *offsets,
+                                    vertex_t *distances,
+                                    double *sp_counters,
+                                    double *deltas,
+                                    vertex_t depth)
+{
+  for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices;
+       thread_idx += gridDim.x *
blockDim.x) { + vertex_t vertex = thread_idx; + double vertex_delta = 0; + double vertex_sigma = sp_counters[vertex]; + if (distances[vertex] == depth) { + edge_t first_edge_idx = offsets[vertex]; + edge_t last_edge_idx = offsets[vertex + 1]; + for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { + vertex_t successor = indices[edge_idx]; + if (distances[successor] == distances[vertex] + 1) { + double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; + vertex_delta += vertex_sigma * factor; + } + } + deltas[vertex] = vertex_delta; + } + } +} +} // namespace detail +} // namespace cugraph \ No newline at end of file diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index 2e24a3110c1..0119a388680 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -24,12 +24,12 @@ #include #include #include -#include "utilities/error_utils.h" +#include "utilities/error.hpp" namespace cugraph { template -void katz_centrality(experimental::GraphCSRView const &graph, +void katz_centrality(GraphCSRView const &graph, result_t *result, double alpha, int max_iter, @@ -52,6 +52,6 @@ void katz_centrality(experimental::GraphCSRView const &graph, } template void katz_centrality( - experimental::GraphCSRView const &, double *, double, int, double, bool, bool); + GraphCSRView const &, double *, double, int, double, bool, bool); } // namespace cugraph diff --git a/cpp/src/comms/mpi/comms_mpi.cpp b/cpp/src/comms/mpi/comms_mpi.cpp deleted file mode 100644 index f473c0a1939..00000000000 --- a/cpp/src/comms/mpi/comms_mpi.cpp +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include "utilities/error_utils.h" - -namespace cugraph { -namespace experimental { -#if ENABLE_OPG - -/**---------------------------------------------------------------------------* - * @brief Exception thrown when a NCCL error is encountered. - * - *---------------------------------------------------------------------------**/ -struct nccl_error : public std::runtime_error { - nccl_error(std::string const &message) : std::runtime_error(message) {} -}; - -inline void throw_nccl_error(ncclResult_t error, const char *file, unsigned int line) -{ - throw nccl_error(std::string{"NCCL error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + ncclGetErrorString(error)}); -} - -#define NCCL_TRY(call) \ - { \ - ncclResult_t nccl_status = (call); \ - if (nccl_status != ncclSuccess) { throw_nccl_error(nccl_status, __FILE__, __LINE__); } \ - } -// MPI errors are expected to be fatal before reaching this. 
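The hand-rolled MPI/NCCL Comm class being deleted here is superseded by raft's communicator, which the new centrality code reaches through raft::handle_t. The multi-GPU pattern used by the new betweenness_centrality() earlier in this diff is: each rank computes partial scores from its share of sources, then the partials are summed to rank 0. A minimal sketch of that call shape, assuming only the comms_initialized() and get_comms().reduce() calls that appear above:

// Minimal sketch: sum per-rank partial scores to rank 0 through the raft
// communicator, mirroring the reduce call in the new betweenness_centrality().
#include <raft/handle.hpp>

void reduce_partial_scores(raft::handle_t const &handle,
                           float const *d_partial,  // per-rank partial scores (device memory)
                           float *d_result,         // summed result, valid on rank 0
                           size_t count)
{
  if (handle.comms_initialized()) {
    handle.get_comms().reduce(
      d_partial, d_result, count, raft::comms::op_t::SUM, 0, handle.get_stream());
  }
}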
-// FIXME : improve when adding raft comms -#define MPI_TRY(cmd) \ - { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { CUGRAPH_FAIL("Failed: MPI error"); } \ - } - -template -constexpr MPI_Datatype get_mpi_type() -{ - if (std::is_integral::value) { - if (std::is_signed::value) { - if (sizeof(value_t) == 1) { - return MPI_INT8_T; - } else if (sizeof(value_t) == 2) { - return MPI_INT16_T; - } else if (sizeof(value_t) == 4) { - return MPI_INT32_T; - } else if (sizeof(value_t) == 8) { - return MPI_INT64_T; - } else { - CUGRAPH_FAIL("unsupported type"); - } - } else { - if (sizeof(value_t) == 1) { - return MPI_UINT8_T; - } else if (sizeof(value_t) == 2) { - return MPI_UINT16_T; - } else if (sizeof(value_t) == 4) { - return MPI_UINT32_T; - } else if (sizeof(value_t) == 8) { - return MPI_UINT64_T; - } else { - CUGRAPH_FAIL("unsupported type"); - } - } - } else if (std::is_same::value) { - return MPI_FLOAT; - } else if (std::is_same::value) { - return MPI_DOUBLE; - } else { - CUGRAPH_FAIL("unsupported type"); - } -} - -template -constexpr ncclDataType_t get_nccl_type() -{ - if (std::is_integral::value) { - if (std::is_signed::value) { - if (sizeof(value_t) == 1) { - return ncclInt8; - } else if (sizeof(value_t) == 4) { - return ncclInt32; - } else if (sizeof(value_t) == 8) { - return ncclInt64; - } else { - CUGRAPH_FAIL("unsupported type"); - } - } else { - if (sizeof(value_t) == 1) { - return ncclUint8; - } else if (sizeof(value_t) == 4) { - return ncclUint32; - } else if (sizeof(value_t) == 8) { - return ncclUint64; - } else { - CUGRAPH_FAIL("unsupported type"); - } - } - } else if (std::is_same::value) { - return ncclFloat32; - } else if (std::is_same::value) { - return ncclFloat64; - } else { - CUGRAPH_FAIL("unsupported type"); - } -} - -constexpr MPI_Op get_mpi_reduce_op(ReduceOp reduce_op) -{ - if (reduce_op == ReduceOp::SUM) { - return MPI_SUM; - } else if (reduce_op == ReduceOp::MAX) { - return MPI_MAX; - } else if (reduce_op == ReduceOp::MIN) { - return MPI_MIN; - } else { - CUGRAPH_FAIL("unsupported type"); - } -} - -constexpr ncclRedOp_t get_nccl_reduce_op(ReduceOp reduce_op) -{ - if (reduce_op == ReduceOp::SUM) { - return ncclSum; - } else if (reduce_op == ReduceOp::MAX) { - return ncclMax; - } else if (reduce_op == ReduceOp::MIN) { - return ncclMin; - } else { - CUGRAPH_FAIL("unsupported type"); - } -} -#endif - -Comm::Comm(int p) : _p{p} -{ -#if ENABLE_OPG - // MPI - int flag{}, mpi_world_size; - - MPI_TRY(MPI_Initialized(&flag)); - - if (flag == false) { - int provided{}; - MPI_TRY(MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided)); - if (provided != MPI_THREAD_MULTIPLE) { MPI_TRY(MPI_ERR_OTHER); } - _finalize_mpi = true; - } - - MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &_rank)); - MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size)); - CUGRAPH_EXPECTS((_p == mpi_world_size), - "Invalid input arguments: p should match the number of MPI processes."); - - _mpi_comm = MPI_COMM_WORLD; - - // CUDA - - CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = _rank % _device_count; // FIXME : assumes each node has the same number of GPUs - CUDA_TRY(cudaSetDevice(_device_id)); - - CUDA_TRY( - cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); - 
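The device attributes cached by this constructor also move to the handle: BC::initialize_device_information() above reads maxGridSize[0] and maxThreadsDim[0] from handle_.get_device_properties(), and accumulate() derives its 1-D launch configuration from them. A condensed sketch of that sizing logic (free-function form for illustration only):

// Sketch: 1-D launch configuration as used by BC::accumulate(), taking device
// limits from the raft handle instead of repeated cudaDeviceGetAttribute() calls.
#include <algorithm>
#include <cuda_runtime.h>
#include <raft/handle.hpp>

void make_1d_launch(raft::handle_t const &handle, int number_of_edges, dim3 &grid, dim3 &block)
{
  cudaDeviceProp const &prop = handle.get_device_properties();
  block.x = prop.maxThreadsDim[0];
  grid.x  = std::min(prop.maxGridSize[0], number_of_edges / static_cast<int>(block.x) + 1);
}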
CUDA_TRY(cudaDeviceGetAttribute( - &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); - - // NCCL - - ncclUniqueId nccl_unique_id_p{}; - if (get_rank() == 0) { NCCL_TRY(ncclGetUniqueId(&nccl_unique_id_p)); } - MPI_TRY(MPI_Bcast(&nccl_unique_id_p, sizeof(ncclUniqueId), MPI_BYTE, 0, _mpi_comm)); - NCCL_TRY(ncclCommInitRank(&_nccl_comm, get_p(), nccl_unique_id_p, get_rank())); - _finalize_nccl = true; -#endif -} - -#if ENABLE_OPG -Comm::Comm(ncclComm_t comm, int size, int rank) : _nccl_comm(comm), _p(size), _rank(rank) -{ - // CUDA - CUDA_TRY(cudaGetDeviceCount(&_device_count)); - _device_id = _rank % _device_count; // FIXME : assumes each node has the same number of GPUs - CUDA_TRY(cudaSetDevice(_device_id)); // FIXME : check if this is needed or if - // python takes care of this - - CUDA_TRY( - cudaDeviceGetAttribute(&_sm_count_per_device, cudaDevAttrMultiProcessorCount, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_grid_dim_1D, cudaDevAttrMaxGridDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_max_block_dim_1D, cudaDevAttrMaxBlockDimX, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute(&_l2_cache_size, cudaDevAttrL2CacheSize, _device_id)); - CUDA_TRY(cudaDeviceGetAttribute( - &_shared_memory_size_per_sm, cudaDevAttrMaxSharedMemoryPerMultiprocessor, _device_id)); -} -#endif - -Comm::~Comm() -{ -#if ENABLE_OPG - // NCCL - if (_finalize_nccl) ncclCommDestroy(_nccl_comm); - - if (_finalize_mpi) { MPI_Finalize(); } -#endif -} - -void Comm::barrier() -{ -#if ENABLE_OPG - MPI_Barrier(MPI_COMM_WORLD); -#endif -} - -template -void Comm::allgather(size_t size, value_t *sendbuff, value_t *recvbuff) const -{ -#if ENABLE_OPG - NCCL_TRY(ncclAllGather((const void *)sendbuff, - (void *)recvbuff, - size, - get_nccl_type(), - _nccl_comm, - cudaStreamDefault)); -#endif -} - -template -void Comm::allreduce(size_t size, value_t *sendbuff, value_t *recvbuff, ReduceOp reduce_op) const -{ -#if ENABLE_OPG - NCCL_TRY(ncclAllReduce((const void *)sendbuff, - (void *)recvbuff, - size, - get_nccl_type(), - get_nccl_reduce_op(reduce_op), - _nccl_comm, - cudaStreamDefault)); -#endif -} - -// explicit -template void Comm::allgather(size_t size, int *sendbuff, int *recvbuff) const; -template void Comm::allgather(size_t size, float *sendbuff, float *recvbuff) const; -template void Comm::allgather(size_t size, double *sendbuff, double *recvbuff) const; -template void Comm::allreduce(size_t size, - int *sendbuff, - int *recvbuff, - ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, - float *sendbuff, - float *recvbuff, - ReduceOp reduce_op) const; -template void Comm::allreduce(size_t size, - double *sendbuff, - double *recvbuff, - ReduceOp reduce_op) const; - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/src/community/ECG.cu b/cpp/src/community/ECG.cu index b746966627c..47a80fa48d6 100644 --- a/cpp/src/community/ECG.cu +++ b/cpp/src/community/ECG.cu @@ -16,12 +16,11 @@ #include -#include #include #include -#include #include #include +#include #include "utilities/graph_utils.cuh" namespace { @@ -108,43 +107,43 @@ void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) namespace cugraph { -template -void ecg(experimental::GraphCSRView const &graph, - WT min_weight, - VT ensemble_size, - VT *ecg_parts) +template +void ecg(GraphCSRView const &graph, + weight_t min_weight, + vertex_t ensemble_size, + vertex_t *ecg_parts) { CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph"); 
CUGRAPH_EXPECTS(ecg_parts != nullptr, "Invalid API parameter: ecg_parts is NULL"); cudaStream_t stream{0}; - rmm::device_vector ecg_weights_v(graph.edge_data, graph.edge_data + graph.number_of_edges); + rmm::device_vector ecg_weights_v(graph.edge_data, + graph.edge_data + graph.number_of_edges); - VT size{graph.number_of_vertices}; - VT seed{0}; - // VT seed{1}; // Note... this seed won't work for the unit tests... retest after fixing Louvain. + vertex_t size{graph.number_of_vertices}; + vertex_t seed{1}; - auto permuted_graph = std::make_unique>( + auto permuted_graph = std::make_unique>( size, graph.number_of_edges, graph.has_data()); // Iterate over each member of the ensemble - for (VT i = 0; i < ensemble_size; i++) { + for (vertex_t i = 0; i < ensemble_size; i++) { // Take random permutation of the graph - rmm::device_vector permutation_v(size); - VT *d_permutation = permutation_v.data().get(); + rmm::device_vector permutation_v(size); + vertex_t *d_permutation = permutation_v.data().get(); get_permutation_vector(size, seed, d_permutation, stream); seed += size; - detail::permute_graph(graph, d_permutation, permuted_graph->view()); + detail::permute_graph(graph, d_permutation, permuted_graph->view()); - // Run Louvain clustering on the random permutation - rmm::device_vector parts_v(size); - VT *d_parts = parts_v.data().get(); + // Run one level of Louvain clustering on the random permutation + rmm::device_vector parts_v(size); + vertex_t *d_parts = parts_v.data().get(); - WT final_modularity; - VT num_level; + weight_t final_modularity; + vertex_t num_level; cugraph::louvain(permuted_graph->view(), &final_modularity, &num_level, d_parts, 1); @@ -152,7 +151,7 @@ void ecg(experimental::GraphCSRView const &graph, // Keep a sum for each edge of the total number of times its endpoints are in the same partition dim3 grid, block; block.x = 512; - grid.x = min(VT{CUDA_MAX_BLOCKS}, (graph.number_of_edges / 512 + 1)); + grid.x = min(vertex_t{CUDA_MAX_BLOCKS}, (graph.number_of_edges / 512 + 1)); match_check_kernel<<>>(graph.number_of_edges, graph.number_of_vertices, graph.offsets, @@ -163,7 +162,7 @@ void ecg(experimental::GraphCSRView const &graph, } // Set weights = min_weight + (1 - min-weight)*sum/ensemble_size - update_functor uf(min_weight, ensemble_size); + update_functor uf(min_weight, ensemble_size); thrust::transform(rmm::exec_policy(stream)->on(stream), ecg_weights_v.data().get(), ecg_weights_v.data().get() + graph.number_of_edges, @@ -171,27 +170,26 @@ void ecg(experimental::GraphCSRView const &graph, uf); // Run Louvain on the original graph using the computed weights - experimental::GraphCSRView louvain_graph; + // (pass max_level = 100 for a "full run") + GraphCSRView louvain_graph; louvain_graph.indices = graph.indices; louvain_graph.offsets = graph.offsets; louvain_graph.edge_data = ecg_weights_v.data().get(); louvain_graph.number_of_vertices = graph.number_of_vertices; louvain_graph.number_of_edges = graph.number_of_edges; - WT final_modularity; - VT num_level; + weight_t final_modularity; + vertex_t num_level; cugraph::louvain(louvain_graph, &final_modularity, &num_level, ecg_parts, 100); } // Explicit template instantiations. 
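To recap the loop above before the instantiations: each ensemble member permutes the graph, runs a single level of Louvain on the permutation, and match_check_kernel bumps a per-edge tally whenever an edge's endpoints land in the same partition; the thrust::transform then rescales every edge to w' = min_weight + (1 - min_weight) * tally / ensemble_size before the final full Louvain run. The real update_functor is defined in this file's anonymous namespace and is not shown in the diff; a hypothetical sketch consistent with that documented formula:

// Hypothetical sketch of ECG's reweighting functor; the actual definition
// lives in ECG.cu's anonymous namespace and may differ in detail.
template <typename weight_t, typename count_t>
struct update_functor {
  weight_t min_weight;
  count_t ensemble_size;

  update_functor(weight_t mw, count_t es) : min_weight(mw), ensemble_size(es) {}

  __host__ __device__ weight_t operator()(count_t tally) const
  {
    // w' = min_weight + (1 - min_weight) * tally / ensemble_size
    return min_weight + (weight_t{1} - min_weight) * static_cast<weight_t>(tally) / ensemble_size;
  }
};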
-template void ecg( - experimental::GraphCSRView const &graph, - float min_weight, - int32_t ensemble_size, - int32_t *ecg_parts); -template void ecg( - experimental::GraphCSRView const &graph, - double min_weight, - int32_t ensemble_size, - int32_t *ecg_parts); +template void ecg(GraphCSRView const &graph, + float min_weight, + int32_t ensemble_size, + int32_t *ecg_parts); +template void ecg(GraphCSRView const &graph, + double min_weight, + int32_t ensemble_size, + int32_t *ecg_parts); } // namespace cugraph diff --git a/cpp/src/community/extract_subgraph_by_vertex.cu b/cpp/src/community/extract_subgraph_by_vertex.cu index 919f89545a0..c39b7f8ad0a 100644 --- a/cpp/src/community/extract_subgraph_by_vertex.cu +++ b/cpp/src/community/extract_subgraph_by_vertex.cu @@ -16,18 +16,16 @@ #include #include - -#include +#include #include -#include +#include namespace { template -std::unique_ptr> -extract_subgraph_by_vertices( - cugraph::experimental::GraphCOOView const &graph, +std::unique_ptr> extract_subgraph_by_vertices( + cugraph::GraphCOOView const &graph, vertex_t const *vertices, vertex_t num_vertices, cudaStream_t stream) @@ -49,7 +47,7 @@ extract_subgraph_by_vertices( if ((v >= 0) && (v < graph_num_verts)) { d_vertex_used[v] = idx; } else { - cugraph::atomicAdd(d_error_count, int64_t{1}); + atomicAdd(d_error_count, int64_t{1}); } }); @@ -72,7 +70,7 @@ extract_subgraph_by_vertices( }); if (count > 0) { - auto result = std::make_unique>( + auto result = std::make_unique>( num_vertices, count, has_weight); vertex_t *d_new_src = result->src_indices(); @@ -99,7 +97,7 @@ extract_subgraph_by_vertices( // require 2*|E| temporary memory. If this becomes important perhaps // we make 2 implementations and pick one based on the number of // vertices in the subgraph set. - auto pos = cugraph::atomicAdd(d_error_count, 1); + auto pos = atomicAdd(d_error_count, int64_t{1}); d_new_src[pos] = d_vertex_used[s]; d_new_dst[pos] = d_vertex_used[d]; if (has_weight) d_new_weight[pos] = graph_weight[e]; @@ -108,18 +106,18 @@ extract_subgraph_by_vertices( return result; } else { - return std::make_unique>( - 0, 0, has_weight); + return std::make_unique>(0, 0, has_weight); } } } // namespace namespace cugraph { -namespace nvgraph { +namespace subgraph { template -std::unique_ptr> extract_subgraph_vertex( - experimental::GraphCOOView const &graph, VT const *vertices, VT num_vertices) +std::unique_ptr> extract_subgraph_vertex(GraphCOOView const &graph, + VT const *vertices, + VT num_vertices) { CUGRAPH_EXPECTS(vertices != nullptr, "API error, vertices must be non null"); @@ -132,12 +130,14 @@ std::unique_ptr> extract_subgraph_vertex( } } -template std::unique_ptr> -extract_subgraph_vertex( - experimental::GraphCOOView const &, int32_t const *, int32_t); -template std::unique_ptr> -extract_subgraph_vertex( - experimental::GraphCOOView const &, int32_t const *, int32_t); +template std::unique_ptr> +extract_subgraph_vertex(GraphCOOView const &, + int32_t const *, + int32_t); +template std::unique_ptr> +extract_subgraph_vertex(GraphCOOView const &, + int32_t const *, + int32_t); -} // namespace nvgraph +} // namespace subgraph } // namespace cugraph diff --git a/cpp/src/community/ktruss.cu b/cpp/src/community/ktruss.cu index ea6d1091fab..11a8ed6fbae 100644 --- a/cpp/src/community/ktruss.cu +++ b/cpp/src/community/ktruss.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,7 @@ * @file ktruss.cu * --------------------------------------------------------------------------*/ -#include -#include +#include #include #include @@ -36,8 +35,9 @@ namespace cugraph { namespace detail { template -std::unique_ptr> ktruss_subgraph_impl( - experimental::GraphCOOView const &graph, int k, rmm::mr::device_memory_resource *mr) +std::unique_ptr> ktruss_subgraph_impl(GraphCOOView const &graph, + int k, + rmm::mr::device_memory_resource *mr) { using HornetGraph = hornet::gpu::Hornet; using UpdatePtr = hornet::BatchUpdatePtr; @@ -68,7 +68,7 @@ std::unique_ptr> ktruss_subgraph_impl( kt.runForK(k); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run"); - auto out_graph = std::make_unique>( + auto out_graph = std::make_unique>( graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices()); @@ -79,8 +79,8 @@ std::unique_ptr> ktruss_subgraph_impl( return out_graph; } template -std::unique_ptr> weighted_ktruss_subgraph_impl( - experimental::GraphCOOView const &graph, int k, rmm::mr::device_memory_resource *mr) +std::unique_ptr> weighted_ktruss_subgraph_impl( + GraphCOOView const &graph, int k, rmm::mr::device_memory_resource *mr) { using HornetGraph = hornet::gpu::Hornet>; using UpdatePtr = hornet::BatchUpdatePtr, hornet::DeviceType::DEVICE>; @@ -111,7 +111,7 @@ std::unique_ptr> weighted_ktruss_subgraph_imp kt.runForK(k); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run"); - auto out_graph = std::make_unique>( + auto out_graph = std::make_unique>( graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices(), out_graph->edge_data()); @@ -125,8 +125,9 @@ std::unique_ptr> weighted_ktruss_subgraph_imp } // namespace detail template -std::unique_ptr> k_truss_subgraph( - experimental::GraphCOOView const &graph, int k, rmm::mr::device_memory_resource *mr) +std::unique_ptr> k_truss_subgraph(GraphCOOView const &graph, + int k, + rmm::mr::device_memory_resource *mr) { CUGRAPH_EXPECTS(graph.src_indices != nullptr, "Graph source indices cannot be a nullptr"); CUGRAPH_EXPECTS(graph.dst_indices != nullptr, "Graph destination indices cannot be a nullptr"); @@ -138,14 +139,10 @@ std::unique_ptr> k_truss_subgraph( } } -template std::unique_ptr> -k_truss_subgraph(experimental::GraphCOOView const &, - int, - rmm::mr::device_memory_resource *); +template std::unique_ptr> k_truss_subgraph( + GraphCOOView const &, int, rmm::mr::device_memory_resource *); -template std::unique_ptr> -k_truss_subgraph(experimental::GraphCOOView const &, - int, - rmm::mr::device_memory_resource *); +template std::unique_ptr> k_truss_subgraph( + GraphCOOView const &, int, rmm::mr::device_memory_resource *); } // namespace cugraph diff --git a/cpp/src/community/leiden.cpp b/cpp/src/community/leiden.cpp new file mode 100644 index 00000000000..9e7a49db1f1 --- /dev/null +++ b/cpp/src/community/leiden.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include + +#include "utilities/error.hpp" + +namespace cugraph { + +template +void leiden(GraphCSRView const &graph, + weight_t &final_modularity, + int &num_level, + vertex_t *leiden_parts, + int max_level, + weight_t resolution) +{ + CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, leiden expects a weighted graph"); + CUGRAPH_EXPECTS(leiden_parts != nullptr, "API error, leiden_parts is null"); + + detail::leiden( + graph, final_modularity, num_level, leiden_parts, max_level, resolution); +} + +template void leiden( + GraphCSRView const &, float &, int &, int32_t *, int, float); +template void leiden( + GraphCSRView const &, double &, int &, int32_t *, int, double); + +} // namespace cugraph diff --git a/cpp/src/community/leiden_kernels.cu b/cpp/src/community/leiden_kernels.cu new file mode 100644 index 00000000000..5eb4219d1ac --- /dev/null +++ b/cpp/src/community/leiden_kernels.cu @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include + +#include +#include + +//#define TIMING + +#ifdef TIMING +#include +#endif + +#include + +namespace cugraph { +namespace detail { + +template +weight_t update_clustering_by_delta_modularity_constrained( + weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, + rmm::device_vector const &src_indices, + rmm::device_vector const &vertex_weights, + rmm::device_vector &cluster_weights, + rmm::device_vector &cluster, + rmm::device_vector &constraint, + cudaStream_t stream) +{ + rmm::device_vector next_cluster(cluster); + rmm::device_vector delta_Q(graph.number_of_edges); + rmm::device_vector cluster_hash(graph.number_of_edges); + rmm::device_vector old_cluster_sum(graph.number_of_vertices); + + weight_t *d_delta_Q = delta_Q.data().get(); + vertex_t *d_constraint = constraint.data().get(); + vertex_t const *d_src_indices = src_indices.data().get(); + vertex_t const *d_dst_indices = graph.indices; + + weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster.data().get(), stream); + + weight_t cur_Q = new_Q - 1; + + // To avoid the potential of having two vertices swap clusters + // we will only allow vertices to move up (true) or down (false) + // during each iteration of the loop + bool up_down = true; + + while (new_Q > (cur_Q + 0.0001)) { + cur_Q = new_Q; + + compute_delta_modularity(total_edge_weight, + resolution, + graph, + src_indices, + vertex_weights, + cluster_weights, + cluster, + cluster_hash, + delta_Q, + old_cluster_sum, + stream); + + // Filter out positive delta_Q values for nodes not in the same constraint group + thrust::for_each( + rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) { + vertex_t start_cluster = d_constraint[d_src_indices[i]]; + vertex_t end_cluster = d_constraint[d_dst_indices[i]]; + if (start_cluster != end_cluster) d_delta_Q[i] = weight_t{0.0}; + }); + + assign_nodes(graph, + delta_Q, + cluster_hash, + src_indices, + next_cluster, + vertex_weights, + cluster_weights, + up_down, + stream); + + up_down = !up_down; + + new_Q = modularity(total_edge_weight, resolution, graph, next_cluster.data().get(), stream); + + if (new_Q > cur_Q) { + thrust::copy(rmm::exec_policy(stream)->on(stream), + next_cluster.begin(), + next_cluster.end(), + cluster.begin()); + } + } + + return cur_Q; +} + +template float update_clustering_by_delta_modularity_constrained( + float, + float, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + +template double update_clustering_by_delta_modularity_constrained( + double, + double, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + +template +void leiden(GraphCSRView const &graph, + weight_t &final_modularity, + int &num_level, + vertex_t *cluster_vec, + int max_level, + weight_t resolution, + cudaStream_t stream) +{ +#ifdef TIMING + HighResTimer hr_timer; +#endif + + num_level = 0; + + // + // Vectors to create a copy of the graph + // + rmm::device_vector offsets_v(graph.offsets, graph.offsets + graph.number_of_vertices + 1); + rmm::device_vector indices_v(graph.indices, graph.indices + graph.number_of_edges); + rmm::device_vector weights_v(graph.edge_data, 
graph.edge_data + graph.number_of_edges); + rmm::device_vector src_indices_v(graph.number_of_edges); + + // + // Weights and clustering across iterations of algorithm + // + rmm::device_vector vertex_weights_v(graph.number_of_vertices); + rmm::device_vector cluster_weights_v(graph.number_of_vertices); + rmm::device_vector cluster_v(graph.number_of_vertices); + + // + // Temporaries used within kernels. Each iteration uses less + // of this memory + // + rmm::device_vector tmp_arr_v(graph.number_of_vertices); + rmm::device_vector cluster_inverse_v(graph.number_of_vertices); + + weight_t total_edge_weight = + thrust::reduce(rmm::exec_policy(stream)->on(stream), weights_v.begin(), weights_v.end()); + weight_t best_modularity = -1; + + // + // Initialize every cluster to reference each vertex to itself + // + thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); + thrust::copy( + rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), cluster_vec); + + // + // Our copy of the graph. Each iteration of the outer loop will + // shrink this copy of the graph. + // + GraphCSRView current_graph(offsets_v.data().get(), + indices_v.data().get(), + weights_v.data().get(), + graph.number_of_vertices, + graph.number_of_edges); + + current_graph.get_source_indices(src_indices_v.data().get()); + + while (num_level < max_level) { + // + // Sum the weights of all edges departing a vertex. This is + // loop invariant, so we'll compute it here. + // + // Cluster weights are equivalent to vertex weights with this initial + // graph + // +#ifdef TIMING + hr_timer.start("init"); +#endif + + cugraph::detail::compute_vertex_sums(current_graph, vertex_weights_v, stream); + thrust::copy(rmm::exec_policy(stream)->on(stream), + vertex_weights_v.begin(), + vertex_weights_v.end(), + cluster_weights_v.begin()); + +#ifdef TIMING + hr_timer.stop(); + + hr_timer.start("update_clustering"); +#endif + + weight_t new_Q = update_clustering_by_delta_modularity(total_edge_weight, + resolution, + current_graph, + src_indices_v, + vertex_weights_v, + cluster_weights_v, + cluster_v, + stream); + + // After finding the initial unconstrained partition we use that partitioning as the constraint + // for the second round. 
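This is the Leiden refinement step: the copy of cluster_v into constraint that follows freezes the partition found by the unconstrained pass, and the lambda in update_clustering_by_delta_modularity_constrained() zeroes delta_Q for any edge whose endpoints lie in different constraint communities, so vertices may only move within their coarse community. For callers, the new public entry point declared in leiden.cpp is used like this (graph construction and the exact cugraph include paths are assumed; arguments match the float instantiation):

// Usage sketch for the new cugraph::leiden() API; building `graph` and the
// cugraph graph/algorithms includes are assumed here.
#include <rmm/thrust_rmm_allocator.h>

void run_leiden(cugraph::GraphCSRView<int32_t, int32_t, float> const &graph)
{
  rmm::device_vector<int32_t> parts(graph.number_of_vertices);
  float final_modularity{0};
  int num_level{0};

  cugraph::leiden(graph,
                  final_modularity,
                  num_level,
                  parts.data().get(),
                  /*max_level=*/100,
                  /*resolution=*/1.0f);
}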
+ rmm::device_vector constraint(graph.number_of_vertices); + thrust::copy( + rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), constraint.begin()); + new_Q = update_clustering_by_delta_modularity_constrained(total_edge_weight, + resolution, + current_graph, + src_indices_v, + vertex_weights_v, + cluster_weights_v, + cluster_v, + constraint, + stream); + +#ifdef TIMING + hr_timer.stop(); +#endif + + if (new_Q <= best_modularity) { break; } + + best_modularity = new_Q; + +#ifdef TIMING + hr_timer.start("shrinking graph"); +#endif + + // renumber the clusters to the range 0..(num_clusters-1) + vertex_t num_clusters = renumber_clusters( + graph.number_of_vertices, cluster_v, tmp_arr_v, cluster_inverse_v, cluster_vec, stream); + cluster_weights_v.resize(num_clusters); + + // shrink our graph to represent the graph of supervertices + generate_superverticies_graph(current_graph, src_indices_v, num_clusters, cluster_v, stream); + + // assign each new vertex to its own cluster + thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); + +#ifdef TIMING + hr_timer.stop(); +#endif + + num_level++; + } + +#ifdef TIMING + hr_timer.display(std::cout); +#endif + + final_modularity = best_modularity; +} + +template void leiden(GraphCSRView const &, + float &, + int &, + int32_t *, + int, + float, + cudaStream_t); +template void leiden(GraphCSRView const &, + double &, + int &, + int32_t *, + int, + double, + cudaStream_t); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/community/leiden_kernels.hpp b/cpp/src/community/leiden_kernels.hpp new file mode 100644 index 00000000000..cbe93c04f52 --- /dev/null +++ b/cpp/src/community/leiden_kernels.hpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +namespace cugraph { +namespace detail { + +template +void leiden(GraphCSRView const& graph, + weight_t& final_modularity, + int& num_level, + vertex_t* cluster_vec, + int max_level, + weight_t resolution, + cudaStream_t stream = 0); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/community/louvain.cpp b/cpp/src/community/louvain.cpp index 94ed67a0fcc..0e3f6ac51fd 100644 --- a/cpp/src/community/louvain.cpp +++ b/cpp/src/community/louvain.cpp @@ -23,28 +23,30 @@ #include -#include "utilities/error_utils.h" +#include "utilities/error.hpp" namespace cugraph { -template -void louvain(experimental::GraphCSRView const &graph, - WT *final_modularity, +template +void louvain(GraphCSRView const &graph, + weight_t *final_modularity, int *num_level, - VT *louvain_parts, - int max_iter) + vertex_t *louvain_parts, + int max_level, + weight_t resolution) { CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph"); CUGRAPH_EXPECTS(final_modularity != nullptr, "API error, final_modularity is null"); CUGRAPH_EXPECTS(num_level != nullptr, "API error, num_level is null"); CUGRAPH_EXPECTS(louvain_parts != nullptr, "API error, louvain_parts is null"); - detail::louvain(graph, final_modularity, num_level, louvain_parts, max_iter); + detail::louvain( + graph, final_modularity, num_level, louvain_parts, max_level, resolution); } template void louvain( - experimental::GraphCSRView const &, float *, int *, int32_t *, int); + GraphCSRView const &, float *, int *, int32_t *, int, float); template void louvain( - experimental::GraphCSRView const &, double *, int *, int32_t *, int); + GraphCSRView const &, double *, int *, int32_t *, int, double); } // namespace cugraph diff --git a/cpp/src/community/louvain_kernels.cu b/cpp/src/community/louvain_kernels.cu index 757cf2fcde2..c93e2d82fdf 100644 --- a/cpp/src/community/louvain_kernels.cu +++ b/cpp/src/community/louvain_kernels.cu @@ -17,10 +17,10 @@ #include -#include -#include #include +//#define TIMING + #ifdef TIMING #include #endif @@ -30,8 +30,12 @@ namespace cugraph { namespace detail { +namespace { // anonym. +constexpr int BLOCK_SIZE_1D = 64; +} + template -__global__ // __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +__global__ // void compute_vertex_sums(vertex_t n_vertex, edge_t const *offsets, @@ -50,8 +54,9 @@ __global__ // __launch_bounds__(CUDA_MAX_KERNEL_THREADS) } template -weight_t modularity(weight_t m2, - experimental::GraphCSRView const &graph, +weight_t modularity(weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, vertex_t const *d_cluster, cudaStream_t stream) { @@ -66,6 +71,10 @@ weight_t modularity(weight_t m2, weight_t *d_inc = inc.data().get(); weight_t *d_deg = deg.data().get(); + // FIXME: Already have weighted degree computed in main loop, + // could pass that in rather than computing d_deg... 
which + // would save an atomicAdd (synchronization) + // thrust::for_each( rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), @@ -78,11 +87,10 @@ weight_t modularity(weight_t m2, for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { vertex_t neighbor = d_indices[loc]; degree += d_weights[loc]; - if (d_cluster[neighbor] == community) { increase += d_weights[loc] / 2; } + if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } } if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); - if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); }); @@ -90,29 +98,28 @@ weight_t modularity(weight_t m2, rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(graph.number_of_vertices), - [d_deg, d_inc, m2] __device__(vertex_t community) { -#ifdef DEBUG - printf(" d_inc[%d] = %g, d_deg = %g, return = %g\n", - community, - d_inc[community], - d_deg[community], - ((2 * d_inc[community] / m2) - pow(d_deg[community] / m2, 2))); -#endif - - return (2 * d_inc[community] / m2) - pow(d_deg[community] / m2, 2); + [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) { + return ((d_inc[community] / total_edge_weight) - resolution * + (d_deg[community] * d_deg[community]) / + (total_edge_weight * total_edge_weight)); }, weight_t{0.0}, thrust::plus()); return Q; } +template float modularity( + float, float, GraphCSRView const &, int32_t const *, cudaStream_t); + +template double modularity( + double, double, GraphCSRView const &, int32_t const *, cudaStream_t); + template -void generate_superverticies_graph( - cugraph::experimental::GraphCSRView ¤t_graph, - rmm::device_vector &src_indices_v, - vertex_t new_number_of_vertices, - rmm::device_vector &cluster_v, - cudaStream_t stream) +void generate_superverticies_graph(cugraph::GraphCSRView ¤t_graph, + rmm::device_vector &src_indices_v, + vertex_t new_number_of_vertices, + rmm::device_vector &cluster_v, + cudaStream_t stream) { rmm::device_vector new_src_v(current_graph.number_of_edges); rmm::device_vector new_dst_v(current_graph.number_of_edges); @@ -174,13 +181,25 @@ void generate_superverticies_graph( new_number_of_vertices, current_graph.number_of_edges, stream); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); src_indices_v.resize(current_graph.number_of_edges); } +template void generate_superverticies_graph(GraphCSRView &, + rmm::device_vector &, + int32_t, + rmm::device_vector &, + cudaStream_t); + +template void generate_superverticies_graph(GraphCSRView &, + rmm::device_vector &, + int32_t, + rmm::device_vector &, + cudaStream_t); + template -void compute_vertex_sums(experimental::GraphCSRView const &graph, +void compute_vertex_sums(GraphCSRView const &graph, rmm::device_vector &sums, cudaStream_t stream) { @@ -192,6 +211,14 @@ void compute_vertex_sums(experimental::GraphCSRView graph.number_of_vertices, graph.offsets, graph.edge_data, sums.data().get()); } +template void compute_vertex_sums(GraphCSRView const &, + rmm::device_vector &, + cudaStream_t); + +template void compute_vertex_sums(GraphCSRView const &, + rmm::device_vector &, + cudaStream_t); + template vertex_t renumber_clusters(vertex_t graph_num_vertices, rmm::device_vector &cluster, @@ -204,9 +231,11 @@ vertex_t renumber_clusters(vertex_t graph_num_vertices, // Now we're going to renumber the clusters from 0 to (k-1), where k is the number of // clusters in this level of the dendogram. 
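Written out, the resolution-parameterized modularity that the updated kernel above computes is, with m = total_edge_weight, inc_c the edge weight incident inside community c, and deg_c its total weighted degree,

Q = \sum_{c} \left[ \frac{\mathrm{inc}_c}{m} - \gamma \left( \frac{\mathrm{deg}_c}{m} \right)^{2} \right]

where \gamma is the new resolution argument. At \gamma = 1 this matches the previous computation: dropping the / 2 on increase doubles inc_c, which cancels the removed factor of 2 in the old 2 * d_inc / m2 term.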
// - thrust::copy(cluster.begin(), cluster.end(), temp_array.begin()); - thrust::sort(temp_array.begin(), temp_array.end()); - auto tmp_end = thrust::unique(temp_array.begin(), temp_array.end()); + thrust::copy( + rmm::exec_policy(stream)->on(stream), cluster.begin(), cluster.end(), temp_array.begin()); + thrust::sort(rmm::exec_policy(stream)->on(stream), temp_array.begin(), temp_array.end()); + auto tmp_end = + thrust::unique(rmm::exec_policy(stream)->on(stream), temp_array.begin(), temp_array.end()); vertex_t old_num_clusters = cluster.size(); vertex_t new_num_clusters = thrust::distance(temp_array.begin(), tmp_end); @@ -244,10 +273,243 @@ vertex_t renumber_clusters(vertex_t graph_num_vertices, return new_num_clusters; } +template int32_t renumber_clusters(int32_t, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector &, + int32_t *, + cudaStream_t); + +template +void compute_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, + rmm::device_vector const &src_indices_v, + rmm::device_vector const &vertex_weights_v, + rmm::device_vector const &cluster_weights_v, + rmm::device_vector const &cluster_v, + rmm::device_vector &cluster_hash_v, + rmm::device_vector &delta_Q_v, + rmm::device_vector &tmp_size_V_v, + cudaStream_t stream) +{ + vertex_t const *d_src_indices = src_indices_v.data().get(); + vertex_t const *d_dst_indices = graph.indices; + edge_t const *d_offsets = graph.offsets; + weight_t const *d_weights = graph.edge_data; + vertex_t const *d_cluster = cluster_v.data().get(); + weight_t const *d_vertex_weights = vertex_weights_v.data().get(); + weight_t const *d_cluster_weights = cluster_weights_v.data().get(); + + vertex_t *d_cluster_hash = cluster_hash_v.data().get(); + weight_t *d_delta_Q = delta_Q_v.data().get(); + weight_t *d_old_cluster_sum = tmp_size_V_v.data().get(); + weight_t *d_new_cluster_sum = d_delta_Q; + + thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); + thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); + thrust::fill(tmp_size_V_v.begin(), tmp_size_V_v.end(), weight_t{0.0}); + + // + // For each source vertex, we're going to build a hash + // table to the destination cluster ids. We can use + // the offsets ranges to define the bounds of the hash + // table. 
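// ----------------------------------------------------------------------------
// Illustration (not part of the patch): the per-source hash table described in
// the comment above, shown single-threaded on the host. Source vertex v owns
// the slot range [offsets[v], offsets[v+1]) of cluster_hash; the table always
// has room because v has at most deg(v) distinct neighbor clusters. On the
// device the empty-slot claim (-1 -> cluster) is an atomicCAS, and a thread
// that loses the race simply re-tests the slot on the next loop iteration.
#include <cstdint>

// Returns the slot claimed for new_cluster inside v's range; slots must be
// initialized to -1 and n_edges (= degree of v) must be > 0.
inline int64_t probe_insert(int32_t* cluster_hash,
                            int64_t hash_base,  // offsets[v]
                            int64_t n_edges,    // offsets[v + 1] - offsets[v]
                            int32_t new_cluster)
{
  int64_t h      = new_cluster % n_edges;
  int64_t offset = hash_base + h;
  while (cluster_hash[offset] != new_cluster) {
    if (cluster_hash[offset] == -1) {
      cluster_hash[offset] = new_cluster;  // device version: atomicCAS(...)
    } else {
      h      = (h + 1) % n_edges;  // linear probing
      offset = hash_base + h;
    }
  }
  return offset;  // caller accumulates the edge weight at this slot
}
// ----------------------------------------------------------------------------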
+ // + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [d_src_indices, + d_dst_indices, + d_cluster, + d_offsets, + d_cluster_hash, + d_new_cluster_sum, + d_weights, + d_old_cluster_sum] __device__(edge_t loc) { + vertex_t src = d_src_indices[loc]; + vertex_t dst = d_dst_indices[loc]; + + if (src != dst) { + vertex_t old_cluster = d_cluster[src]; + vertex_t new_cluster = d_cluster[dst]; + edge_t hash_base = d_offsets[src]; + edge_t n_edges = d_offsets[src + 1] - hash_base; + + int h = (new_cluster % n_edges); + edge_t offset = hash_base + h; + while (d_cluster_hash[offset] != new_cluster) { + if (d_cluster_hash[offset] == -1) { + atomicCAS(d_cluster_hash + offset, -1, new_cluster); + } else { + h = (h + 1) % n_edges; + offset = hash_base + h; + } + } + + atomicAdd(d_new_cluster_sum + offset, d_weights[loc]); + + if (old_cluster == new_cluster) + atomicAdd(d_old_cluster_sum + src, d_weights[loc]); + } + }); + + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [total_edge_weight, + resolution, + d_cluster_hash, + d_src_indices, + d_cluster, + d_vertex_weights, + d_delta_Q, + d_new_cluster_sum, + d_old_cluster_sum, + d_cluster_weights] __device__(edge_t loc) { + vertex_t new_cluster = d_cluster_hash[loc]; + if (new_cluster >= 0) { + vertex_t src = d_src_indices[loc]; + vertex_t old_cluster = d_cluster[src]; + weight_t k_k = d_vertex_weights[src]; + weight_t a_old = d_cluster_weights[old_cluster]; + weight_t a_new = d_cluster_weights[new_cluster]; + + // NOTE: d_delta_Q and d_new_cluster_sum are aliases + // for same device array to save memory + d_delta_Q[loc] = + 2 * + (((d_new_cluster_sum[loc] - d_old_cluster_sum[src]) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + } else { + d_delta_Q[loc] = weight_t{0.0}; + } + }); +} + +template void compute_delta_modularity(float, + float, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + +template void compute_delta_modularity(double, + double, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + +template +void assign_nodes(GraphCSRView const &graph, + rmm::device_vector &delta_Q, + rmm::device_vector &cluster_hash, + rmm::device_vector const &src_indices, + rmm::device_vector &next_cluster, + rmm::device_vector const &vertex_weights, + rmm::device_vector &cluster_weights, + bool up_down, + cudaStream_t stream) +{ + rmm::device_vector temp_vertices(graph.number_of_vertices); + rmm::device_vector temp_cluster(graph.number_of_vertices, vertex_t{-1}); + rmm::device_vector temp_delta_Q(graph.number_of_vertices, weight_t{0.0}); + + weight_t *d_delta_Q = delta_Q.data().get(); + vertex_t *d_next_cluster = next_cluster.data().get(); + vertex_t *d_cluster_hash = cluster_hash.data().get(); + weight_t const *d_vertex_weights = vertex_weights.data().get(); + weight_t *d_cluster_weights = cluster_weights.data().get(); + + auto cluster_reduce_iterator = + thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, 
d_delta_Q)); + + auto output_edge_iterator2 = thrust::make_zip_iterator( + thrust::make_tuple(temp_cluster.data().get(), temp_delta_Q.data().get())); + + auto cluster_reduce_end = + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + src_indices.begin(), + src_indices.end(), + cluster_reduce_iterator, + temp_vertices.data().get(), + output_edge_iterator2, + thrust::equal_to(), + [] __device__(auto pair1, auto pair2) { + if (thrust::get<1>(pair1) > thrust::get<1>(pair2)) + return pair1; + else + return pair2; + }); + + vertex_t final_size = thrust::distance(temp_vertices.data().get(), cluster_reduce_end.first); + + vertex_t *d_temp_vertices = temp_vertices.data().get(); + vertex_t *d_temp_clusters = temp_cluster.data().get(); + weight_t *d_temp_delta_Q = temp_delta_Q.data().get(); + + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(final_size), + [d_temp_delta_Q, + up_down, + d_next_cluster, + d_temp_vertices, + d_vertex_weights, + d_temp_clusters, + d_cluster_weights] __device__(vertex_t id) { + if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) { + vertex_t new_cluster = d_temp_clusters[id]; + vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; + + if ((new_cluster > old_cluster) == up_down) { + weight_t src_weight = d_vertex_weights[d_temp_vertices[id]]; + d_next_cluster[d_temp_vertices[id]] = d_temp_clusters[id]; + + atomicAdd(d_cluster_weights + new_cluster, src_weight); + atomicAdd(d_cluster_weights + old_cluster, -src_weight); + } + } + }); +} + +template void assign_nodes(GraphCSRView const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector const &, + rmm::device_vector &, + bool, + cudaStream_t); + +template void assign_nodes(GraphCSRView const &, + rmm::device_vector &, + rmm::device_vector &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector const &, + rmm::device_vector &, + bool, + cudaStream_t); + template weight_t update_clustering_by_delta_modularity( - weight_t m2, - experimental::GraphCSRView const &graph, + weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, rmm::device_vector const &src_indices, rmm::device_vector const &vertex_weights, rmm::device_vector &cluster_weights, @@ -255,24 +517,18 @@ weight_t update_clustering_by_delta_modularity( cudaStream_t stream) { rmm::device_vector next_cluster(cluster); - rmm::device_vector old_cluster_sum(graph.number_of_vertices); rmm::device_vector delta_Q(graph.number_of_edges); rmm::device_vector cluster_hash(graph.number_of_edges); - rmm::device_vector cluster_hash_sum(graph.number_of_edges, weight_t{0.0}); + rmm::device_vector old_cluster_sum(graph.number_of_vertices); vertex_t *d_cluster_hash = cluster_hash.data().get(); - weight_t *d_cluster_hash_sum = cluster_hash_sum.data().get(); vertex_t *d_cluster = cluster.data().get(); - vertex_t const *d_src_indices = src_indices.data().get(); - vertex_t *d_dst_indices = graph.indices; - edge_t *d_offsets = graph.offsets; - weight_t *d_weights = graph.edge_data; weight_t const *d_vertex_weights = vertex_weights.data().get(); weight_t *d_cluster_weights = cluster_weights.data().get(); weight_t *d_delta_Q = delta_Q.data().get(); - weight_t *d_old_cluster_sum = old_cluster_sum.data().get(); - weight_t new_Q = modularity(m2, graph, cluster.data().get(), stream); + weight_t new_Q = modularity( + total_edge_weight, resolution, graph, 
cluster.data().get(), stream); weight_t cur_Q = new_Q - 1; @@ -284,171 +540,70 @@ weight_t update_clustering_by_delta_modularity( while (new_Q > (cur_Q + 0.0001)) { cur_Q = new_Q; - thrust::fill(cluster_hash.begin(), cluster_hash.end(), vertex_t{-1}); - thrust::fill(cluster_hash_sum.begin(), cluster_hash_sum.end(), weight_t{0.0}); - thrust::fill(old_cluster_sum.begin(), old_cluster_sum.end(), weight_t{0.0}); - - // - // For each source vertex, we're going to build a hash - // table to the destination cluster ids. We can use - // the offsets ranges to define the bounds of the hash - // table. - // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_edges), - [d_src_indices, - d_dst_indices, - d_cluster, - d_offsets, - d_cluster_hash, - d_cluster_hash_sum, - d_weights, - d_old_cluster_sum] __device__(edge_t loc) { - vertex_t src = d_src_indices[loc]; - vertex_t dst = d_dst_indices[loc]; - - if (src != dst) { - vertex_t old_cluster = d_cluster[src]; - vertex_t new_cluster = d_cluster[dst]; - edge_t hash_base = d_offsets[src]; - edge_t n_edges = d_offsets[src + 1] - hash_base; - - int h = (new_cluster % n_edges); - edge_t offset = hash_base + h; - while (d_cluster_hash[offset] != new_cluster) { - if (d_cluster_hash[offset] == -1) { - atomicCAS(d_cluster_hash + offset, -1, new_cluster); - } else { - h = (h + 1) % n_edges; - offset = hash_base + h; - } - } - - atomicAdd(d_cluster_hash_sum + offset, d_weights[loc]); - - if (old_cluster == new_cluster) - atomicAdd(d_old_cluster_sum + src, d_weights[loc]); - } - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_edges), - [m2, - d_cluster_hash, - d_src_indices, - d_cluster, - d_vertex_weights, - d_delta_Q, - d_cluster_hash_sum, - d_old_cluster_sum, - d_cluster_weights] __device__(edge_t loc) { - vertex_t new_cluster = d_cluster_hash[loc]; - if (new_cluster >= 0) { - vertex_t src = d_src_indices[loc]; - vertex_t old_cluster = d_cluster[src]; - weight_t degc_totw = d_vertex_weights[src] / m2; - - d_delta_Q[loc] = - d_cluster_hash_sum[loc] - degc_totw * d_cluster_weights[new_cluster] - - (d_old_cluster_sum[src] - - (degc_totw * (d_cluster_weights[old_cluster] - d_vertex_weights[src]))); - -#ifdef DEBUG - printf("src = %d, new cluster = %d, d_delta_Q[%d] = %g\n", - src, - new_cluster, - loc, - d_delta_Q[loc]); -#endif - } else { - d_delta_Q[loc] = weight_t{0.0}; - } - }); - - auto cluster_reduce_iterator = - thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q)); - - rmm::device_vector temp_vertices(graph.number_of_vertices); - rmm::device_vector temp_cluster(graph.number_of_vertices, vertex_t{-1}); - rmm::device_vector temp_delta_Q(graph.number_of_vertices, weight_t{0.0}); - - auto output_edge_iterator2 = thrust::make_zip_iterator( - thrust::make_tuple(temp_cluster.data().get(), temp_delta_Q.data().get())); - - auto cluster_reduce_end = - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - d_src_indices, - d_src_indices + graph.number_of_edges, - cluster_reduce_iterator, - temp_vertices.data().get(), - output_edge_iterator2, - thrust::equal_to(), - [] __device__(auto pair1, auto pair2) { - if (thrust::get<1>(pair1) > thrust::get<1>(pair2)) - return pair1; - else - return pair2; - }); - - vertex_t final_size = thrust::distance(temp_vertices.data().get(), cluster_reduce_end.first); - - vertex_t *d_temp_vertices = temp_vertices.data().get(); 
- vertex_t *d_temp_clusters = temp_cluster.data().get(); - vertex_t *d_next_cluster = next_cluster.data().get(); - weight_t *d_temp_delta_Q = temp_delta_Q.data().get(); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_size), - [d_temp_delta_Q, - up_down, - d_next_cluster, - d_temp_vertices, - d_vertex_weights, - d_temp_clusters, - d_cluster_weights] __device__(vertex_t id) { - if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) { - vertex_t new_cluster = d_temp_clusters[id]; - vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; - - if ((new_cluster > old_cluster) == up_down) { -#ifdef DEBUG - printf( - "%s moving vertex %d from cluster %d to cluster %d - deltaQ = %g\n", - (up_down ? "up" : "down"), - d_temp_vertices[id], - d_next_cluster[d_temp_vertices[id]], - d_temp_clusters[id], - d_temp_delta_Q[id]); -#endif - - weight_t src_weight = d_vertex_weights[d_temp_vertices[id]]; - d_next_cluster[d_temp_vertices[id]] = d_temp_clusters[id]; - - atomicAdd(d_cluster_weights + new_cluster, src_weight); - atomicAdd(d_cluster_weights + old_cluster, -src_weight); - } - } - }); + compute_delta_modularity(total_edge_weight, + resolution, + graph, + src_indices, + vertex_weights, + cluster_weights, + cluster, + cluster_hash, + delta_Q, + old_cluster_sum, + stream); + + assign_nodes(graph, + delta_Q, + cluster_hash, + src_indices, + next_cluster, + vertex_weights, + cluster_weights, + up_down, + stream); up_down = !up_down; - new_Q = modularity(m2, graph, next_cluster.data().get(), stream); + new_Q = modularity( + total_edge_weight, resolution, graph, next_cluster.data().get(), stream); - if (new_Q > cur_Q) { thrust::copy(next_cluster.begin(), next_cluster.end(), cluster.begin()); } + if (new_Q > cur_Q) { + thrust::copy(rmm::exec_policy(stream)->on(stream), + next_cluster.begin(), + next_cluster.end(), + cluster.begin()); + } } return cur_Q; } +template float update_clustering_by_delta_modularity(float, + float, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + +template double update_clustering_by_delta_modularity( + double, + double, + GraphCSRView const &, + rmm::device_vector const &, + rmm::device_vector const &, + rmm::device_vector &, + rmm::device_vector &, + cudaStream_t); + template -void louvain(experimental::GraphCSRView const &graph, +void louvain(GraphCSRView const &graph, weight_t *final_modularity, int *num_level, vertex_t *cluster_vec, - int max_iter, + int max_level, + weight_t resolution, cudaStream_t stream) { #ifdef TIMING @@ -479,7 +634,7 @@ void louvain(experimental::GraphCSRView const &graph rmm::device_vector tmp_arr_v(graph.number_of_vertices); rmm::device_vector cluster_inverse_v(graph.number_of_vertices); - weight_t m2 = + weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(stream)->on(stream), weights_v.begin(), weights_v.end()); weight_t best_modularity = -1; @@ -487,22 +642,22 @@ void louvain(experimental::GraphCSRView const &graph // Initialize every cluster to reference each vertex to itself // thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); - thrust::copy(cluster_v.begin(), cluster_v.end(), cluster_vec); + thrust::copy( + rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), cluster_vec); // // Our copy of the graph. Each iteration of the outer loop will // shrink this copy of the graph. 
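// ----------------------------------------------------------------------------
// Illustration (not part of the patch): the shape of the greedy swap loop that
// update_clustering_by_delta_modularity() above now delegates to the two
// helpers. The std::function callbacks are stand-ins for
// compute_delta_modularity()/assign_nodes() and modularity(), not cuGraph API;
// call as greedy_swap_loop<float>(...) since weight_t is not deduced here.
#include <functional>

template <typename weight_t>
weight_t greedy_swap_loop(std::function<void(bool up_down)> move_vertices,
                          std::function<weight_t()> score,
                          weight_t min_gain = weight_t{0.0001})
{
  bool up_down   = false;
  weight_t new_Q = score();
  weight_t cur_Q = new_Q - 1;

  while (new_Q > cur_Q + min_gain) {  // stop once the gain falls below min_gain
    cur_Q = new_Q;
    move_vertices(up_down);  // one delta-Q pass plus vertex reassignment
    up_down = !up_down;      // alternating direction keeps pairs of vertices
                             // from endlessly swapping into each other's cluster
    new_Q = score();
  }
  return cur_Q;
}
// ----------------------------------------------------------------------------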
// - cugraph::experimental::GraphCSRView current_graph( - offsets_v.data().get(), - indices_v.data().get(), - weights_v.data().get(), - graph.number_of_vertices, - graph.number_of_edges); + GraphCSRView current_graph(offsets_v.data().get(), + indices_v.data().get(), + weights_v.data().get(), + graph.number_of_vertices, + graph.number_of_edges); current_graph.get_source_indices(src_indices_v.data().get()); - while (true) { + while (*num_level < max_level) { // // Sum the weights of all edges departing a vertex. This is // loop invariant, so we'll compute it here. @@ -515,7 +670,10 @@ void louvain(experimental::GraphCSRView const &graph #endif cugraph::detail::compute_vertex_sums(current_graph, vertex_weights_v, stream); - thrust::copy(vertex_weights_v.begin(), vertex_weights_v.end(), cluster_weights_v.begin()); + thrust::copy(rmm::exec_policy(stream)->on(stream), + vertex_weights_v.begin(), + vertex_weights_v.end(), + cluster_weights_v.begin()); #ifdef TIMING hr_timer.stop(); @@ -523,8 +681,14 @@ void louvain(experimental::GraphCSRView const &graph hr_timer.start("update_clustering"); #endif - weight_t new_Q = update_clustering_by_delta_modularity( - m2, current_graph, src_indices_v, vertex_weights_v, cluster_weights_v, cluster_v, stream); + weight_t new_Q = update_clustering_by_delta_modularity(total_edge_weight, + resolution, + current_graph, + src_indices_v, + vertex_weights_v, + cluster_weights_v, + cluster_v, + stream); #ifdef TIMING hr_timer.stop(); @@ -552,6 +716,8 @@ void louvain(experimental::GraphCSRView const &graph #ifdef TIMING hr_timer.stop(); #endif + + (*num_level)++; } #ifdef TIMING @@ -561,17 +727,19 @@ void louvain(experimental::GraphCSRView const &graph *final_modularity = best_modularity; } -template void louvain(experimental::GraphCSRView const &, +template void louvain(GraphCSRView const &, float *, int *, int32_t *, int, + float, cudaStream_t); -template void louvain(experimental::GraphCSRView const &, +template void louvain(GraphCSRView const &, double *, int *, int32_t *, int, + double, cudaStream_t); } // namespace detail diff --git a/cpp/src/community/louvain_kernels.hpp b/cpp/src/community/louvain_kernels.hpp index dd400f97f9e..eabd562315a 100644 --- a/cpp/src/community/louvain_kernels.hpp +++ b/cpp/src/community/louvain_kernels.hpp @@ -15,17 +15,82 @@ */ #pragma once +#include + #include namespace cugraph { namespace detail { template -void louvain(experimental::GraphCSRView const &graph, +weight_t modularity(weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, + vertex_t const *d_cluster, + cudaStream_t stream = 0); + +template +void generate_superverticies_graph(cugraph::GraphCSRView ¤t_graph, + rmm::device_vector &src_indices_v, + vertex_t new_number_of_vertices, + rmm::device_vector &cluster_v, + cudaStream_t stream); + +template +void compute_vertex_sums(GraphCSRView const &graph, + rmm::device_vector &sums, + cudaStream_t stream); + +template +vertex_t renumber_clusters(vertex_t graph_num_vertices, + rmm::device_vector &cluster, + rmm::device_vector &temp_array, + rmm::device_vector &cluster_inverse, + vertex_t *cluster_vec, + cudaStream_t stream); + +template +void compute_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, + rmm::device_vector const &src_indices_v, + rmm::device_vector const &vertex_weights_v, + rmm::device_vector const &cluster_weights_v, + rmm::device_vector const &cluster_v, + rmm::device_vector &cluster_hash_v, + rmm::device_vector &delta_Q_v, + 
rmm::device_vector &tmp_size_V_v, + cudaStream_t stream = 0); + +template +void assign_nodes(GraphCSRView const &graph, + rmm::device_vector &delta_Q, + rmm::device_vector &cluster_hash, + rmm::device_vector const &src_indices, + rmm::device_vector &next_cluster, + rmm::device_vector const &vertex_weights, + rmm::device_vector &cluster_weights, + bool up_down, + cudaStream_t stream); + +template +weight_t update_clustering_by_delta_modularity( + weight_t total_edge_weight, + weight_t resolution, + GraphCSRView const &graph, + rmm::device_vector const &src_indices, + rmm::device_vector const &vertex_weights, + rmm::device_vector &cluster_weights, + rmm::device_vector &cluster, + cudaStream_t stream); + +template +void louvain(GraphCSRView const &graph, weight_t *final_modularity, int *num_level, vertex_t *cluster_vec, - int max_iter, + int max_level, + weight_t resolution, cudaStream_t stream = 0); } // namespace detail diff --git a/cpp/src/community/spectral_clustering.cu b/cpp/src/community/spectral_clustering.cu index 908ef61a7a4..f32739ddf29 100644 --- a/cpp/src/community/spectral_clustering.cu +++ b/cpp/src/community/spectral_clustering.cu @@ -15,35 +15,31 @@ */ /** ---------------------------------------------------------------------------* - * @brief Wrapper functions for Nvgraph + * @brief Wrapper functions for Spectral Clustering * - * @file nvgraph_wrapper.cpp + * @file spectral_clustering.cu * ---------------------------------------------------------------------------**/ #include -#include -#include #include #include -#include #include -#include -#include -#include -#include -#include +#include +#include -#include +#include +#include namespace cugraph { -namespace nvgraph { + +namespace ext_raft { namespace detail { template -void balancedCutClustering_impl(experimental::GraphCSRView const &graph, +void balancedCutClustering_impl(GraphCSRView const &graph, vertex_t n_clusters, vertex_t n_eig_vects, weight_t evs_tolerance, @@ -54,23 +50,28 @@ void balancedCutClustering_impl(experimental::GraphCSRView= weight_t{0.0}, - "API error, evs_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(evs_tolerance < weight_t{1.0}, - "API error, evs_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(kmean_tolerance >= weight_t{0.0}, - "API error, kmean_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(kmean_tolerance < weight_t{1.0}, - "API error, kmean_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(n_clusters > 1, "API error, must specify more than 1 cluster"); - CUGRAPH_EXPECTS(n_clusters < graph.number_of_vertices, - "API error, number of clusters must be smaller than number of vertices"); - CUGRAPH_EXPECTS(n_eig_vects <= n_clusters, - "API error, cannot specify more eigenvectors than clusters"); - CUGRAPH_EXPECTS(clustering != nullptr, "API error, must specify valid clustering"); - CUGRAPH_EXPECTS(eig_vals != nullptr, "API error, must specify valid eigenvalues"); - CUGRAPH_EXPECTS(eig_vects != nullptr, "API error, must specify valid eigenvectors"); + RAFT_EXPECTS(graph.edge_data != nullptr, "API error, graph must have weights"); + RAFT_EXPECTS(evs_tolerance >= weight_t{0.0}, + "API error, evs_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(evs_tolerance < weight_t{1.0}, + "API error, evs_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(kmean_tolerance >= weight_t{0.0}, + "API error, kmean_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(kmean_tolerance < weight_t{1.0}, + "API error, kmean_tolerance must be between 0.0 and 1.0"); + 
RAFT_EXPECTS(n_clusters > 1, "API error, must specify more than 1 cluster"); + RAFT_EXPECTS(n_clusters < graph.number_of_vertices, + "API error, number of clusters must be smaller than number of vertices"); + RAFT_EXPECTS(n_eig_vects <= n_clusters, + "API error, cannot specify more eigenvectors than clusters"); + RAFT_EXPECTS(clustering != nullptr, "API error, must specify valid clustering"); + RAFT_EXPECTS(eig_vals != nullptr, "API error, must specify valid eigenvalues"); + RAFT_EXPECTS(eig_vects != nullptr, "API error, must specify valid eigenvectors"); + + raft::handle_t handle; + auto stream = handle.get_stream(); + auto exec = rmm::exec_policy(stream); + auto t_exe_p = exec->on(stream); int evs_max_it{4000}; int kmean_max_it{200}; @@ -87,57 +88,66 @@ void balancedCutClustering_impl(experimental::GraphCSRView(graph, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clustering, - eig_vals, - eig_vects); + unsigned long long seed{1234567}; + bool reorthog{false}; + + using index_type = vertex_t; + using value_type = weight_t; + + raft::matrix::sparse_matrix_t const r_csr_m{handle, graph}; + + raft::eigen_solver_config_t eig_cfg{ + n_eig_vects, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed}; + raft::lanczos_solver_t eig_solver{eig_cfg}; + + raft::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed}; + raft::kmeans_solver_t cluster_solver{clust_cfg}; + + raft::spectral::partition( + handle, t_exe_p, r_csr_m, eig_solver, cluster_solver, clustering, eig_vals, eig_vects); } template -void spectralModularityMaximization_impl( - experimental::GraphCSRView const &graph, - vertex_t n_clusters, - vertex_t n_eig_vects, - weight_t evs_tolerance, - int evs_max_iter, - weight_t kmean_tolerance, - int kmean_max_iter, - vertex_t *clustering, - weight_t *eig_vals, - weight_t *eig_vects) +void spectralModularityMaximization_impl(GraphCSRView const &graph, + vertex_t n_clusters, + vertex_t n_eig_vects, + weight_t evs_tolerance, + int evs_max_iter, + weight_t kmean_tolerance, + int kmean_max_iter, + vertex_t *clustering, + weight_t *eig_vals, + weight_t *eig_vects) { - CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, graph must have weights"); - CUGRAPH_EXPECTS(evs_tolerance >= weight_t{0.0}, - "API error, evs_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(evs_tolerance < weight_t{1.0}, - "API error, evs_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(kmean_tolerance >= weight_t{0.0}, - "API error, kmean_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(kmean_tolerance < weight_t{1.0}, - "API error, kmean_tolerance must be between 0.0 and 1.0"); - CUGRAPH_EXPECTS(n_clusters > 1, "API error, must specify more than 1 cluster"); - CUGRAPH_EXPECTS(n_clusters < graph.number_of_vertices, - "API error, number of clusters must be smaller than number of vertices"); - CUGRAPH_EXPECTS(n_eig_vects <= n_clusters, - "API error, cannot specify more eigenvectors than clusters"); - CUGRAPH_EXPECTS(clustering != nullptr, "API error, must specify valid clustering"); - CUGRAPH_EXPECTS(eig_vals != nullptr, "API error, must specify valid eigenvalues"); - CUGRAPH_EXPECTS(eig_vects != nullptr, "API error, must specify valid eigenvectors"); + RAFT_EXPECTS(graph.edge_data != nullptr, "API error, graph must have weights"); + RAFT_EXPECTS(evs_tolerance >= weight_t{0.0}, + "API error, evs_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(evs_tolerance < weight_t{1.0}, + "API error, evs_tolerance 
must be between 0.0 and 1.0"); + RAFT_EXPECTS(kmean_tolerance >= weight_t{0.0}, + "API error, kmean_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(kmean_tolerance < weight_t{1.0}, + "API error, kmean_tolerance must be between 0.0 and 1.0"); + RAFT_EXPECTS(n_clusters > 1, "API error, must specify more than 1 cluster"); + RAFT_EXPECTS(n_clusters < graph.number_of_vertices, + "API error, number of clusters must be smaller than number of vertices"); + RAFT_EXPECTS(n_eig_vects <= n_clusters, + "API error, cannot specify more eigenvectors than clusters"); + RAFT_EXPECTS(clustering != nullptr, "API error, must specify valid clustering"); + RAFT_EXPECTS(eig_vals != nullptr, "API error, must specify valid eigenvalues"); + RAFT_EXPECTS(eig_vects != nullptr, "API error, must specify valid eigenvectors"); + + raft::handle_t handle; + auto stream = handle.get_stream(); + auto exec = rmm::exec_policy(stream); + auto t_exe_p = exec->on(stream); int evs_max_it{4000}; int kmean_max_it{200}; weight_t evs_tol{1.0E-3}; weight_t kmean_tol{1.0E-2}; - int iters_lanczos, iters_kmeans; - if (evs_max_iter > 0) evs_max_it = evs_max_iter; if (evs_tolerance > weight_t{0.0}) evs_tol = evs_tolerance; @@ -147,56 +157,90 @@ void spectralModularityMaximization_impl( if (kmean_tolerance > weight_t{0.0}) kmean_tol = kmean_tolerance; int restartIter_lanczos = 15 + n_eig_vects; - ::nvgraph::modularity_maximization(graph, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clustering, - eig_vals, - eig_vects, - iters_lanczos, - iters_kmeans); + + unsigned long long seed{123456}; + bool reorthog{false}; + + using index_type = vertex_t; + using value_type = weight_t; + + raft::matrix::sparse_matrix_t const r_csr_m{handle, graph}; + + raft::eigen_solver_config_t eig_cfg{ + n_eig_vects, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed}; + raft::lanczos_solver_t eig_solver{eig_cfg}; + + raft::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed}; + raft::kmeans_solver_t cluster_solver{clust_cfg}; + + // not returned... + // auto result = + raft::spectral::modularity_maximization( + handle, t_exe_p, r_csr_m, eig_solver, cluster_solver, clustering, eig_vals, eig_vects); + + // not returned... 
+ // int iters_lanczos, iters_kmeans; + // iters_lanczos = std::get<0>(result); + // iters_kmeans = std::get<2>(result); } template -void analyzeModularityClustering_impl( - experimental::GraphCSRView const &graph, - int n_clusters, - vertex_t const *clustering, - weight_t *modularity) +void analyzeModularityClustering_impl(GraphCSRView const &graph, + int n_clusters, + vertex_t const *clustering, + weight_t *modularity) { + raft::handle_t handle; + auto stream = handle.get_stream(); + auto exec = rmm::exec_policy(stream); + auto t_exe_p = exec->on(stream); + + using index_type = vertex_t; + using value_type = weight_t; + + raft::matrix::sparse_matrix_t const r_csr_m{handle, graph}; + weight_t mod; - ::nvgraph::analyzeModularity(graph, n_clusters, clustering, mod); + raft::spectral::analyzeModularity(handle, t_exe_p, r_csr_m, n_clusters, clustering, mod); *modularity = mod; } template -void analyzeBalancedCut_impl(experimental::GraphCSRView const &graph, +void analyzeBalancedCut_impl(GraphCSRView const &graph, vertex_t n_clusters, vertex_t const *clustering, weight_t *edgeCut, weight_t *ratioCut) { - CUGRAPH_EXPECTS(n_clusters <= graph.number_of_vertices, - "API error: number of clusters must be <= number of vertices"); - CUGRAPH_EXPECTS(n_clusters > 0, "API error: number of clusters must be > 0)"); + raft::handle_t handle; + auto stream = handle.get_stream(); + auto exec = rmm::exec_policy(stream); + auto t_exe_p = exec->on(stream); + + RAFT_EXPECTS(n_clusters <= graph.number_of_vertices, + "API error: number of clusters must be <= number of vertices"); + RAFT_EXPECTS(n_clusters > 0, "API error: number of clusters must be > 0)"); + + weight_t edge_cut; + weight_t cost{0}; + + using index_type = vertex_t; + using value_type = weight_t; - weight_t edge_cut, ratio_cut; + raft::matrix::sparse_matrix_t const r_csr_m{handle, graph}; - ::nvgraph::analyzePartition(graph, n_clusters, clustering, edge_cut, ratio_cut); + raft::spectral::analyzePartition( + handle, t_exe_p, r_csr_m, n_clusters, clustering, edge_cut, cost); *edgeCut = edge_cut; - *ratioCut = ratio_cut; + *ratioCut = cost; } } // namespace detail template -void balancedCutClustering(experimental::GraphCSRView const &graph, +void balancedCutClustering(GraphCSRView const &graph, VT num_clusters, VT num_eigen_vects, WT evs_tolerance, @@ -221,7 +265,7 @@ void balancedCutClustering(experimental::GraphCSRView const &graph, } template -void spectralModularityMaximization(experimental::GraphCSRView const &graph, +void spectralModularityMaximization(GraphCSRView const &graph, VT n_clusters, VT n_eigen_vects, WT evs_tolerance, @@ -246,7 +290,7 @@ void spectralModularityMaximization(experimental::GraphCSRView const } template -void analyzeClustering_modularity(experimental::GraphCSRView const &graph, +void analyzeClustering_modularity(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score) @@ -255,7 +299,7 @@ void analyzeClustering_modularity(experimental::GraphCSRView const & } template -void analyzeClustering_edge_cut(experimental::GraphCSRView const &graph, +void analyzeClustering_edge_cut(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score) @@ -265,7 +309,7 @@ void analyzeClustering_edge_cut(experimental::GraphCSRView const &gr } template -void analyzeClustering_ratio_cut(experimental::GraphCSRView const &graph, +void analyzeClustering_ratio_cut(GraphCSRView const &graph, int n_clusters, VT const *clustering, WT *score) @@ -275,25 +319,37 @@ void analyzeClustering_ratio_cut(experimental::GraphCSRView 
const &g } template void balancedCutClustering( - experimental::GraphCSRView const &, int, int, float, int, float, int, int *); + GraphCSRView const &, int, int, float, int, float, int, int *); template void balancedCutClustering( - experimental::GraphCSRView const &, int, int, double, int, double, int, int *); + GraphCSRView const &, int, int, double, int, double, int, int *); template void spectralModularityMaximization( - experimental::GraphCSRView const &, int, int, float, int, float, int, int *); + GraphCSRView const &, int, int, float, int, float, int, int *); template void spectralModularityMaximization( - experimental::GraphCSRView const &, int, int, double, int, double, int, int *); -template void analyzeClustering_modularity( - experimental::GraphCSRView const &, int, int const *, float *); -template void analyzeClustering_modularity( - experimental::GraphCSRView const &, int, int const *, double *); -template void analyzeClustering_edge_cut( - experimental::GraphCSRView const &, int, int const *, float *); -template void analyzeClustering_edge_cut( - experimental::GraphCSRView const &, int, int const *, double *); -template void analyzeClustering_ratio_cut( - experimental::GraphCSRView const &, int, int const *, float *); -template void analyzeClustering_ratio_cut( - experimental::GraphCSRView const &, int, int const *, double *); - -} // namespace nvgraph + GraphCSRView const &, int, int, double, int, double, int, int *); +template void analyzeClustering_modularity(GraphCSRView const &, + int, + int const *, + float *); +template void analyzeClustering_modularity(GraphCSRView const &, + int, + int const *, + double *); +template void analyzeClustering_edge_cut(GraphCSRView const &, + int, + int const *, + float *); +template void analyzeClustering_edge_cut(GraphCSRView const &, + int, + int const *, + double *); +template void analyzeClustering_ratio_cut(GraphCSRView const &, + int, + int const *, + float *); +template void analyzeClustering_ratio_cut(GraphCSRView const &, + int, + int const *, + double *); + +} // namespace ext_raft } // namespace cugraph diff --git a/cpp/src/community/triangles_counting.cu b/cpp/src/community/triangles_counting.cu index 27b19e2e2a8..f6670365652 100644 --- a/cpp/src/community/triangles_counting.cu +++ b/cpp/src/community/triangles_counting.cu @@ -16,17 +16,18 @@ #include +#include #include #include -#include -#include +#include #include #include #include +#include #include "cub/cub.cuh" #define TH_CENT_K_LOCLEN (34) @@ -49,7 +50,10 @@ #define DEG_THR1 (3.5) #define DEG_THR2 (38.0) -namespace nvgraph { +namespace cugraph { +namespace triangle { + +namespace { // anonym. 
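// ----------------------------------------------------------------------------
// Illustration (not part of the patch): the hunks below replace nvgraph's
// utils::shfl/shfl_down/shfl_up wrappers with the CUDA 9+ *_sync shuffle
// intrinsics. A minimal warp-wide sum in that style, assuming all 32 lanes
// participate (which is what raft::warp_full_mask() encodes in the patch):
__device__ inline int warp_sum(int v)
{
  const unsigned full_mask = 0xffffffffu;  // equivalent of raft::warp_full_mask()
  for (int i = 16; i > 0; i >>= 1) {
    v += __shfl_down_sync(full_mask, v, i);  // lane k adds lane k + i
  }
  return v;  // lane 0 now holds the sum over the warp
}
// ----------------------------------------------------------------------------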
template struct type_utils; @@ -95,13 +99,13 @@ static inline void cubSum(InputIteratorT d_in, cub::DeviceReduce::Sum( nullptr, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - cudaCheckError(); + CHECK_CUDA(stream); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Sum( d_temp_storage.data(), temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -129,7 +133,7 @@ static inline void cubIf(InputIteratorT d_in, select_op, stream, debug_synchronous); - cudaCheckError(); + CHECK_CUDA(stream); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); @@ -142,7 +146,7 @@ static inline void cubIf(InputIteratorT d_in, select_op, stream, debug_synchronous); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -169,7 +173,7 @@ __device__ __forceinline__ T block_sum(T v) const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? threadIdx.y * (BDIM_X / 32) : 0); #pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { v += utils::shfl_down(v, i); } + for (int i = WSIZE / 2; i; i >>= 1) { v += __shfl_down_sync(raft::warp_full_mask(), v, i); } if (lid == 0) sh[wid] = v; __syncthreads(); @@ -177,7 +181,9 @@ __device__ __forceinline__ T block_sum(T v) v = (lid < (BDIM_X * BDIM_Y / WSIZE)) ? sh[lid] : 0; #pragma unroll - for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { v += utils::shfl_down(v, i); } + for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { + v += __shfl_down_sync(raft::warp_full_mask(), v, i); + } } return v; } @@ -282,7 +288,7 @@ void tricnt_b2b(T nblock, // still best overall (with no psum) tricnt_b2b_k<<>>( m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmapL0_d, bmldL0, bmapL1_d, bmldL1); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -294,7 +300,7 @@ __device__ __forceinline__ T block_sum_sh(T v, T *sh) const int wid = threadIdx.x / 32 + ((BDIM_Y > 1) ? threadIdx.y * (BDIM_X / 32) : 0); #pragma unroll - for (int i = WSIZE / 2; i; i >>= 1) { v += utils::shfl_down(v, i); } + for (int i = WSIZE / 2; i; i >>= 1) { v += __shfl_down_sync(raft::warp_full_mask(), v, i); } if (lid == 0) sh[wid] = v; __syncthreads(); @@ -302,7 +308,9 @@ __device__ __forceinline__ T block_sum_sh(T v, T *sh) v = (lid < (BDIM_X * BDIM_Y / WSIZE)) ? 
sh[lid] : 0; #pragma unroll - for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { v += utils::shfl_down(v, i); } + for (int i = (BDIM_X * BDIM_Y / WSIZE) / 2; i; i >>= 1) { + v += __shfl_down_sync(raft::warp_full_mask(), v, i); + } } return v; } @@ -386,7 +394,7 @@ void tricnt_bsh(T nblock, spmat_t *m, uint64_t *ocnt_d, size_t bmld, cudaStre { tricnt_bsh_k<<>>( m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmld); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -438,8 +446,8 @@ __global__ void tricnt_wrp_ps_k(const ROW_T ner, for (int i = 1; i < RLEN_THR1; i++) { if (i == nloc) break; - const OFF_T csoff = utils::shfl(soff, i); - const OFF_T ceoff = utils::shfl(eoff, i); + const OFF_T csoff = __shfl_sync(raft::warp_full_mask(), soff, i); + const OFF_T ceoff = __shfl_sync(raft::warp_full_mask(), eoff, i); if (ceoff - csoff < RLEN_THR2) { if (threadIdx.x == i) mysm = i; @@ -483,11 +491,11 @@ __global__ void tricnt_wrp_ps_k(const ROW_T ner, #pragma unroll for (int j = 1; j < 32; j <<= 1) { - lensum += (threadIdx.x >= j) * (utils::shfl_up(lensum, j)); + lensum += (threadIdx.x >= j) * (__shfl_up_sync(raft::warp_full_mask(), lensum, j)); } shs[threadIdx.y][threadIdx.x] = lensum - len; - lensum = utils::shfl(lensum, 31); + lensum = __shfl_sync(raft::warp_full_mask(), lensum, 31); int k = WSIZE - 1; for (int j = lensum - 1; j >= 0; j -= WSIZE) { @@ -534,7 +542,7 @@ void tricnt_wrp( dim3 block(32, THREADS / 32); tricnt_wrp_ps_k<32, THREADS / 32, WP_LEN_TH1, WP_LEN_TH2> <<>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d, bmap_d, bmld); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -622,7 +630,7 @@ void tricnt_thr(T nblock, spmat_t *m, uint64_t *ocnt_d, cudaStream_t stream) tricnt_thr_k <<>>(m->nrows, m->rows_d, m->roff_d, m->cols_d, ocnt_d); - cudaCheckError(); + CHECK_CUDA(stream); return; } @@ -648,7 +656,7 @@ void create_nondangling_vector( cubIf(it, p_nonempty, out_num.data().get(), n, temp_func, stream); cudaMemcpy(n_nonempty, out_num.data().get(), sizeof(*n_nonempty), cudaMemcpyDeviceToHost); - cudaCheckError(); + CHECK_CUDA(stream); } template @@ -657,7 +665,7 @@ uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream) rmm::device_vector tmp(1); cubSum(v_d, tmp.data().get(), n, stream); - cudaCheckError(); + CHECK_CUDA(stream); return tmp[0]; } @@ -700,27 +708,20 @@ TrianglesCount::TrianglesCount(IndexType num_vertices, IndexType const *row_offsets, IndexType const *col_indices, cudaStream_t stream) + : m_mat{num_vertices, num_edges, num_vertices, row_offsets, nullptr, col_indices}, + m_stream{stream}, + m_done{true} { - m_stream = stream; - m_done = true; - int device_id; cudaGetDevice(&device_id); cudaDeviceGetAttribute(&m_shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id); - cudaCheckError(); + CHECK_CUDA(m_stream); cudaDeviceGetAttribute(&m_multi_processor_count, cudaDevAttrMultiProcessorCount, device_id); - cudaCheckError(); + CHECK_CUDA(m_stream); cudaDeviceGetAttribute( &m_max_threads_per_multi_processor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id); - cudaCheckError(); - - // fill spmat struct; - m_mat.nnz = num_edges; - m_mat.N = num_vertices; - m_mat.nrows = num_vertices; - m_mat.roff_d = row_offsets; - m_mat.cols_d = col_indices; + CHECK_CUDA(m_stream); m_seq.resize(m_mat.N, IndexType{0}); create_nondangling_vector(m_mat.roff_d, m_seq.data().get(), &(m_mat.nrows), m_mat.N, m_stream); @@ -730,9 +731,11 @@ TrianglesCount::TrianglesCount(IndexType num_vertices, template void TrianglesCount::tcount_bsh() { - if 
(m_shared_mem_per_block * 8 < (size_t)m_mat.nrows) {
-    FatalError("Number of vertices too high to use this kernel!", NVGRAPH_ERR_BAD_PARAMETERS);
-  }
+  CUGRAPH_EXPECTS(not(m_shared_mem_per_block * 8 < m_mat.nrows),
+                  "Number of vertices too high for TrianglesCount.");
+  /// if (m_shared_mem_per_block * 8 < (size_t)m_mat.nrows) {
+  ///   FatalError("Number of vertices too high to use this kernel!", NVGRAPH_ERR_BAD_PARAMETERS);
+  ///}
 
   size_t bmld = bitmap_roundup(m_mat.N);
   int nblock = m_mat.nrows;
@@ -754,7 +757,7 @@ void TrianglesCount::tcount_b2b()
 
   size_t free_bytes, total_bytes;
   cudaMemGetInfo(&free_bytes, &total_bytes);
-  cudaCheckError();
+  CHECK_CUDA(m_stream);
 
   int nblock = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmldL1);  //@TODO: what?
   nblock = MIN(nblock, m_mat.nrows);
@@ -788,7 +791,7 @@ void TrianglesCount::tcount_wrp()
   // number of blocks limited by birmap size
   size_t free_bytes, total_bytes;
   cudaMemGetInfo(&free_bytes, &total_bytes);
-  cudaCheckError();
+  CHECK_CUDA(m_stream);
 
   int nblock = (free_bytes * 95 / 100) / (sizeof(uint32_t) * bmld * (THREADS / 32));
   nblock = MIN(nblock, DIV_UP(m_mat.nrows, (THREADS / 32)));
@@ -831,15 +834,12 @@ void TrianglesCount::count()
   }
 }
 
-}  // namespace nvgraph
-
-namespace cugraph {
-namespace nvgraph {
+}  // namespace
 
 template
-uint64_t triangle_count(experimental::GraphCSRView const &graph)
+uint64_t triangle_count(GraphCSRView const &graph)
 {
-  ::nvgraph::TrianglesCount counter(
+  TrianglesCount counter(
     graph.number_of_vertices, graph.number_of_edges, graph.offsets, graph.indices);
 
   counter.count();
@@ -847,7 +847,7 @@ uint64_t triangle_count(experimental::GraphCSRView const &graph)
 }
 
 template uint64_t triangle_count(
-  experimental::GraphCSRView const &);
+  GraphCSRView const &);
 
-}  // namespace nvgraph
+}  // namespace triangle
 }  // namespace cugraph
diff --git a/cpp/src/components/connectivity.cu b/cpp/src/components/connectivity.cu
index 5dcbfcfadc2..2cc1da017a9 100644
--- a/cpp/src/components/connectivity.cu
+++ b/cpp/src/components/connectivity.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + #include "scc_matrix.cuh" #include "weak_cc.cuh" @@ -8,7 +24,7 @@ #include #include #include -#include "utilities/error_utils.h" +#include "utilities/error.hpp" #include "utilities/graph_utils.cuh" #include "topology/topology.cuh" @@ -41,7 +57,7 @@ namespace detail { */ template std::enable_if_t::value> connected_components_impl( - experimental::GraphCSRView const &graph, + GraphCSRView const &graph, cugraph_cc_t connectivity_type, VT *labels, cudaStream_t stream) @@ -68,7 +84,7 @@ std::enable_if_t::value> connected_components_impl( } // namespace detail template -void connected_components(experimental::GraphCSRView const &graph, +void connected_components(GraphCSRView const &graph, cugraph_cc_t connectivity_type, VT *labels) { @@ -80,8 +96,8 @@ void connected_components(experimental::GraphCSRView const &graph, } template void connected_components( - experimental::GraphCSRView const &, cugraph_cc_t, int32_t *); + GraphCSRView const &, cugraph_cc_t, int32_t *); template void connected_components( - experimental::GraphCSRView const &, cugraph_cc_t, int64_t *); + GraphCSRView const &, cugraph_cc_t, int64_t *); } // namespace cugraph diff --git a/cpp/src/components/scc_matrix.cuh b/cpp/src/components/scc_matrix.cuh index ce15e8d3c98..801f1fe0fad 100644 --- a/cpp/src/components/scc_matrix.cuh +++ b/cpp/src/components/scc_matrix.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/components/utils.h b/cpp/src/components/utils.h index dfc56434357..c9ebb6ac4d1 100644 --- a/cpp/src/components/utils.h +++ b/cpp/src/components/utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,9 @@ #include #include -#include +#include + +#include namespace MLCommon { @@ -77,35 +79,6 @@ class Exception : public std::exception { } }; -/** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; \ - std::sprintf(errMsg, "Exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::sprintf(errMsg, fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw MLCommon::Exception(msg); \ - } while (0) - -/** macro to check for a conditional and assert on failure */ -#define ASSERT(check, fmt, ...) \ - do { \ - if (!(check)) THROW(fmt, ##__VA_ARGS__); \ - } while (0) - -/** check for cuda runtime API errors and assert accordingly */ -#define CUDA_CHECK(call) \ - do { \ - cudaError_t status = call; \ - ASSERT( \ - status == cudaSuccess, "FAIL: call='%s'. 
Reason:%s\n", #call, cudaGetErrorString(status)); \ - } while (0) - -///@todo: add a similar CUDA_CHECK_NO_THROW -/// (Ref: https://github.com/rapidsai/cuml/issues/229) - /** * @brief Generic copy method for all kinds of transfers * @tparam Type data type @@ -117,7 +90,7 @@ class Exception : public std::exception { template void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); + CUDA_TRY(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -143,7 +116,7 @@ void updateHost(Type* hPtr, const Type* dPtr, size_t len, cudaStream_t stream) template void copyAsync(Type* dPtr1, const Type* dPtr2, size_t len, cudaStream_t stream) { - CUDA_CHECK(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -214,8 +187,7 @@ void myPrintDevVector(const char* variableName, OutStream& out) { std::vector hostMem(componentsCount); - CUDA_CHECK( - cudaMemcpy(hostMem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(hostMem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); myPrintHostVector(variableName, hostMem.data(), componentsCount, out); } diff --git a/cpp/src/components/weak_cc.cuh b/cpp/src/components/weak_cc.cuh index 291831d2c37..d644a988117 100644 --- a/cpp/src/components/weak_cc.cuh +++ b/cpp/src/components/weak_cc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
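// ----------------------------------------------------------------------------
// Illustration (not part of the patch): a host-side sketch of the min-label
// propagation that weak_cc_label_device (next hunk) performs with atomicMin.
// Labels start as vertex ids; every sweep pushes the smaller label across each
// edge, and the loop ends when a sweep changes nothing -- the role played on
// the device by the m flag and the fa/xa frontier arrays.
#include <algorithm>
#include <vector>

template <typename vertex_t, typename edge_t>
void weak_cc_host(std::vector<edge_t> const& offsets,
                  std::vector<vertex_t> const& indices,
                  std::vector<vertex_t>& labels)  // sized to number of vertices
{
  vertex_t n = static_cast<vertex_t>(labels.size());
  for (vertex_t v = 0; v < n; ++v) labels[v] = v;  // each vertex starts alone

  bool changed = true;
  while (changed) {
    changed = false;
    for (vertex_t v = 0; v < n; ++v) {
      for (edge_t e = offsets[v]; e < offsets[v + 1]; ++e) {
        vertex_t u  = indices[e];
        vertex_t lo = std::min(labels[v], labels[u]);
        if (labels[v] != lo || labels[u] != lo) {
          labels[v] = labels[u] = lo;  // device version: atomicMin + flags
          changed = true;
        }
      }
    }
  }
}
// ----------------------------------------------------------------------------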
@@ -25,8 +25,10 @@ #include #include +#include +#include + #include -#include "utilities/cuda_utils.cuh" #include "utils.h" namespace MLCommon { @@ -94,7 +96,7 @@ __global__ void weak_cc_label_device(vertex_t *labels, vertex_t j_ind = indices[j]; cj = labels[j_ind]; if (ci < cj) { - cugraph::atomicMin(labels + j_ind, ci); + atomicMin(labels + j_ind, ci); xa[j_ind] = true; m[0] = true; } else if (ci > cj) { @@ -104,7 +106,7 @@ __global__ void weak_cc_label_device(vertex_t *labels, } if (ci_mod) { - cugraph::atomicMin(labels + startVertexId + tid, ci); + atomicMin(labels + startVertexId + tid, ci); xa[startVertexId + tid] = true; m[0] = true; } @@ -163,22 +165,22 @@ void weak_cc_label_batched(vertex_t *labels, weak_cc_init_label_kernel <<>>(labels, startVertexId, batchSize, MAX_LABEL, filter_op); - CUDA_CHECK(cudaPeekAtLastError()); + CUDA_TRY(cudaPeekAtLastError()); int n_iters = 0; do { - CUDA_CHECK(cudaMemsetAsync(state.m, false, sizeof(bool), stream)); + CUDA_TRY(cudaMemsetAsync(state.m, false, sizeof(bool), stream)); weak_cc_label_device<<>>( labels, offsets, indices, nnz, state.fa, state.xa, state.m, startVertexId, batchSize); - CUDA_CHECK(cudaPeekAtLastError()); - CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaPeekAtLastError()); + CUDA_TRY(cudaStreamSynchronize(stream)); thrust::swap(state.fa, state.xa); //** Updating m * MLCommon::updateHost(&host_m, state.m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaStreamSynchronize(stream)); n_iters++; } while (host_m); @@ -233,7 +235,7 @@ void weak_cc_batched(vertex_t *labels, if (startVertexId == 0) { weak_cc_init_all_kernel <<>>(labels, state.fa, state.xa, N, MAX_LABEL); - CUDA_CHECK(cudaPeekAtLastError()); + CUDA_TRY(cudaPeekAtLastError()); } weak_cc_label_batched( diff --git a/cpp/src/converters/COOtoCSR.cu b/cpp/src/converters/COOtoCSR.cu index 96143e6ba24..f52be206015 100644 --- a/cpp/src/converters/COOtoCSR.cu +++ b/cpp/src/converters/COOtoCSR.cu @@ -19,13 +19,29 @@ namespace cugraph { -template std::unique_ptr> -coo_to_csr( - experimental::GraphCOOView const &graph, - rmm::mr::device_memory_resource *); -template std::unique_ptr> -coo_to_csr( - experimental::GraphCOOView const &graph, - rmm::mr::device_memory_resource *); +// Explicit instantiation for uint32_t + float +template std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *); + +// Explicit instantiation for uint32_t + double +template std::unique_ptr> +coo_to_csr(GraphCOOView const &graph, + rmm::mr::device_memory_resource *); + +// Explicit instantiation for int + float +template std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *); + +// Explicit instantiation for int + double +template std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *); + +// Explicit instantiation for int64_t + float +template std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *); + +// Explicit instantiation for int64_t + double +template std::unique_ptr> coo_to_csr( + GraphCOOView const &graph, rmm::mr::device_memory_resource *); } // namespace cugraph diff --git a/cpp/src/converters/COOtoCSR.cuh b/cpp/src/converters/COOtoCSR.cuh index 5ba884f4a74..f636e387aa1 100644 --- a/cpp/src/converters/COOtoCSR.cuh +++ b/cpp/src/converters/COOtoCSR.cuh @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include @@ -60,7 +60,7 @@ namespace detail { * @param[out] result Total number of vertices */ template 
-VT sort(experimental::GraphCOOView& graph, cudaStream_t stream) +VT sort(GraphCOOView& graph, cudaStream_t stream) { VT max_src_id; VT max_dst_id; @@ -111,8 +111,10 @@ void fill_offset( VT id = source[index]; if (id != source[index - 1]) { offsets[id] = index; } }); - ET zero = 0; - CUDA_TRY(cudaMemcpy(offsets, &zero, sizeof(ET), cudaMemcpyDefault)); + thrust::device_ptr src = thrust::device_pointer_cast(source); + thrust::device_ptr off = thrust::device_pointer_cast(offsets); + off[src[0]] = ET{0}; + auto iter = thrust::make_reverse_iterator(offsets + number_of_vertices + 1); thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), iter, @@ -141,13 +143,10 @@ rmm::device_buffer create_offset(VT* source, } // namespace detail template -std::unique_ptr> coo_to_csr( - experimental::GraphCOOView const& graph, rmm::mr::device_memory_resource* mr) +std::unique_ptr> coo_to_csr(GraphCOOView const& graph, + rmm::mr::device_memory_resource* mr) { cudaStream_t stream{nullptr}; - using experimental::GraphCOO; - using experimental::GraphCOOView; - using experimental::GraphSparseContents; GraphCOO temp_graph(graph, stream, mr); GraphCOOView temp_graph_view = temp_graph.view(); @@ -162,12 +161,11 @@ std::unique_ptr> coo_to_csr( std::move(coo_contents.dst_indices), std::move(coo_contents.edge_data)}; - return std::make_unique>(std::move(csr_contents)); + return std::make_unique>(std::move(csr_contents)); } template -void coo_to_csr_inplace(experimental::GraphCOOView& graph, - experimental::GraphCSRView& result) +void coo_to_csr_inplace(GraphCOOView& graph, GraphCSRView& result) { cudaStream_t stream{nullptr}; diff --git a/cpp/src/converters/permute_graph.cuh b/cpp/src/converters/permute_graph.cuh index edf97ddc212..b5b2de83e9b 100644 --- a/cpp/src/converters/permute_graph.cuh +++ b/cpp/src/converters/permute_graph.cuh @@ -14,8 +14,8 @@ * limitations under the License. */ #include -#include #include +#include #include "converters/COOtoCSR.cuh" #include "utilities/graph_utils.cuh" @@ -42,9 +42,9 @@ struct permutation_functor { * @return The permuted graph. */ template -void permute_graph(experimental::GraphCSRView const &graph, +void permute_graph(GraphCSRView const &graph, vertex_t const *permutation, - experimental::GraphCSRView result, + GraphCSRView result, cudaStream_t stream = 0) { // Create a COO out of the CSR @@ -76,7 +76,7 @@ void permute_graph(experimental::GraphCSRView const d_dst, pf); - cugraph::experimental::GraphCOOView graph_coo; + GraphCOOView graph_coo; graph_coo.number_of_vertices = graph.number_of_vertices; graph_coo.number_of_edges = graph.number_of_edges; diff --git a/cpp/src/converters/renumber.cuh b/cpp/src/converters/renumber.cuh index 02ce10a1f20..263d7199c10 100644 --- a/cpp/src/converters/renumber.cuh +++ b/cpp/src/converters/renumber.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,10 +27,11 @@ #include #include +#include #include +#include #include "sort/bitonic.cuh" -#include "utilities/error_utils.h" #include "utilities/graph_utils.cuh" namespace cugraph { diff --git a/cpp/src/cores/core_number.cu b/cpp/src/cores/core_number.cu index f3770147db8..40b1b7bf943 100644 --- a/cpp/src/cores/core_number.cu +++ b/cpp/src/cores/core_number.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,18 @@ * limitations under the License. */ -#include #include -#include #include #include #include +#include //#include namespace cugraph { namespace detail { template -void core_number(experimental::GraphCSRView const &graph, int *core_number) +void core_number(GraphCSRView const &graph, int *core_number) { using HornetGraph = hornet::gpu::HornetStatic; using HornetInit = hornet::HornetInit; @@ -53,8 +52,8 @@ struct FilterEdges { }; template -void extract_edges(experimental::GraphCOOView const &i_graph, - experimental::GraphCOOView &o_graph, +void extract_edges(GraphCOOView const &i_graph, + GraphCOOView &o_graph, VT *d_core, int k) { @@ -97,8 +96,8 @@ void extract_edges(experimental::GraphCOOView const &i_graph, // i.e. All edges (s,d,w) in in_graph are copied over to out_graph // if core_num[s] and core_num[d] are greater than or equal to k. template -std::unique_ptr> extract_subgraph( - experimental::GraphCOOView const &in_graph, +std::unique_ptr> extract_subgraph( + GraphCOOView const &in_graph, int const *vid, int const *core_num, int k, @@ -120,7 +119,7 @@ std::unique_ptr> extract_subgraph( auto edge = thrust::make_zip_iterator(thrust::make_tuple(in_graph.src_indices, in_graph.dst_indices)); - auto out_graph = std::make_unique>( + auto out_graph = std::make_unique>( in_graph.number_of_vertices, thrust::count_if(rmm::exec_policy(stream)->on(stream), edge, @@ -130,7 +129,7 @@ std::unique_ptr> extract_subgraph( stream, mr); - experimental::GraphCOOView out_graph_view = out_graph->view(); + GraphCOOView out_graph_view = out_graph->view(); extract_edges(in_graph, out_graph_view, d_sorted_core_num, k); return out_graph; @@ -139,19 +138,18 @@ std::unique_ptr> extract_subgraph( } // namespace detail template -void core_number(experimental::GraphCSRView const &graph, VT *core_number) +void core_number(GraphCSRView const &graph, VT *core_number) { return detail::core_number(graph, core_number); } template -std::unique_ptr> k_core( - experimental::GraphCOOView const &in_graph, - int k, - VT const *vertex_id, - VT const *core_number, - VT num_vertex_ids, - rmm::mr::device_memory_resource *mr) +std::unique_ptr> k_core(GraphCOOView const &in_graph, + int k, + VT const *vertex_id, + VT const *core_number, + VT num_vertex_ids, + rmm::mr::device_memory_resource *mr) { CUGRAPH_EXPECTS(vertex_id != nullptr, "Invalid API parameter: vertex_id is NULL"); CUGRAPH_EXPECTS(core_number != nullptr, "Invalid API parameter: core_number is NULL"); @@ -160,21 +158,21 @@ std::unique_ptr> k_core( return detail::extract_subgraph(in_graph, vertex_id, core_number, k, num_vertex_ids, mr); } -template void core_number( - experimental::GraphCSRView const &, int32_t *core_number); -template std::unique_ptr> -k_core(experimental::GraphCOOView const &, - int, - int32_t const *, - int32_t const *, - int32_t, - rmm::mr::device_memory_resource *); -template std::unique_ptr> -k_core(experimental::GraphCOOView const &, - int, - int32_t const *, - int32_t const *, - int32_t, - rmm::mr::device_memory_resource *); +template void core_number(GraphCSRView const &, + int32_t *core_number); +template std::unique_ptr> k_core( + GraphCOOView const &, + int, + int32_t const *, + int32_t const *, + int32_t, + rmm::mr::device_memory_resource *); +template std::unique_ptr> k_core( + GraphCOOView const &, + int, + int32_t const *, + int32_t const *, + int32_t, + rmm::mr::device_memory_resource *); 
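// ----------------------------------------------------------------------------
// Illustration (not part of the patch): host form of the k-core edge filter
// that detail::extract_edges/extract_subgraph above apply with thrust --
// an edge (s, d) survives only when both endpoints have core number >= k.
#include <cstddef>
#include <vector>

template <typename vertex_t>
void filter_k_core_edges(std::vector<vertex_t> const& src,
                         std::vector<vertex_t> const& dst,
                         std::vector<vertex_t> const& core_num,  // per vertex
                         int k,
                         std::vector<vertex_t>& out_src,
                         std::vector<vertex_t>& out_dst)
{
  for (std::size_t i = 0; i < src.size(); ++i) {
    if (core_num[src[i]] >= k && core_num[dst[i]] >= k) {
      out_src.push_back(src[i]);  // thrust::copy_if keeps the same edges,
      out_dst.push_back(dst[i]);  // after a count_if sizes the output graph
    }
  }
}
// ----------------------------------------------------------------------------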
} // namespace cugraph diff --git a/cpp/src/db/db_object.cu b/cpp/src/db/db_object.cu index 391df5e6dbd..31c149f3503 100644 --- a/cpp/src/db/db_object.cu +++ b/cpp/src/db/db_object.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,16 @@ * limitations under the License. */ -#include +#include + +#include + +#include #include + #include -#include #include -#include + #include namespace cugraph { diff --git a/cpp/src/db/db_object.cuh b/cpp/src/db/db_object.cuh index fe007a69020..a9b1f461f85 100644 --- a/cpp/src/db/db_object.cuh +++ b/cpp/src/db/db_object.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/db/db_operators.cu b/cpp/src/db/db_operators.cu index c6d7163a47f..d67f7ef9140 100644 --- a/cpp/src/db/db_operators.cu +++ b/cpp/src/db/db_operators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,14 @@ * limitations under the License. */ -#include -#include -#include #include +#include + +#include + +#include + namespace cugraph { namespace db { template diff --git a/cpp/src/db/db_operators.cuh b/cpp/src/db/db_operators.cuh index f960a465099..6a2e8322069 100644 --- a/cpp/src/db/db_operators.cuh +++ b/cpp/src/db/db_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/db/db_parser_integration_test.cu b/cpp/src/db/db_parser_integration_test.cu index e1539910bc5..aa395bf8a4c 100644 --- a/cpp/src/db/db_parser_integration_test.cu +++ b/cpp/src/db/db_parser_integration_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/db/db_parser_integration_test.cuh b/cpp/src/db/db_parser_integration_test.cuh index 517c79dd5f4..63da8805164 100644 --- a/cpp/src/db/db_parser_integration_test.cuh +++ b/cpp/src/db/db_parser_integration_test.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
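// Note on the layout changes that follow: the hunks below consistently swap
// CUDA_CHECK_LAST() for CHECK_CUDA(stream), with FIXMEs marking kernels that
// do not yet actually launch on that stream. A minimal sketch of what such a
// stream-aware check can look like -- an assumed shape for illustration, not
// the macro actually shipped in the error-handling headers:
//
//   #define CHECK_CUDA(stream)                          \
//     do {                                              \
//       cudaError_t const status = cudaGetLastError();  \
//       if (status != cudaSuccess) {                    \
//         CUGRAPH_FAIL(cudaGetErrorString(status));     \
//       }                                               \
//     } while (0)
//
// Threading the stream through every call site now keeps the code ready for a
// stream-synchronizing (debug) variant without another round of signature churn.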
diff --git a/cpp/src/layout/barnes_hut.hpp b/cpp/src/layout/barnes_hut.hpp index dab98642c91..f8c200648e1 100644 --- a/cpp/src/layout/barnes_hut.hpp +++ b/cpp/src/layout/barnes_hut.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -33,7 +33,7 @@ namespace cugraph { namespace detail { template -void barnes_hut(experimental::GraphCOOView &graph, +void barnes_hut(GraphCOOView &graph, float *pos, const int max_iter = 1000, float *x_start = nullptr, @@ -74,8 +74,11 @@ void barnes_hut(experimental::GraphCOOView &graph, int *bottomd = d_bottomd.data().get(); float *radiusd = d_radiusd.data().get(); + cudaStream_t stream = {nullptr}; + + // FIXME: this should work on "stream" InitializationKernel<<<1, 1>>>(limiter, maxdepthd, radiusd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); const int FOUR_NNODES = 4 * nnodes; const int FOUR_N = 4 * n; @@ -147,11 +150,11 @@ void barnes_hut(experimental::GraphCOOView &graph, traction = d_traction.data().get(); // Sort COO for coalesced memory access. - cudaStream_t stream = {nullptr}; sort(graph, stream); - CUDA_CHECK_LAST(); - graph.degree(massl, cugraph::experimental::DegreeDirection::OUT); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" + graph.degree(massl, cugraph::DegreeDirection::OUT); + CHECK_CUDA(stream); const vertex_t *row = graph.src_indices; const vertex_t *col = graph.dst_indices; @@ -194,9 +197,11 @@ void barnes_hut(experimental::GraphCOOView &graph, fill(n, swinging, 0.f); fill(n, traction, 0.f); + // FIXME: this should work on "stream" ResetKernel<<<1, 1>>>(radiusd_squared, bottomd, NNODES, radiusd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" // Compute bounding box arround all bodies BoundingBoxKernel<<>>(startl, childl, @@ -212,28 +217,34 @@ void barnes_hut(experimental::GraphCOOView &graph, n, limiter, radiusd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" ClearKernel1<<>>(childl, FOUR_NNODES, FOUR_N); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" // Build quadtree TreeBuildingKernel<<>>( childl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, maxdepthd, bottomd, radiusd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" ClearKernel2<<>>(startl, massl, NNODES, bottomd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" // Summarizes mass and position for each cell, bottom up approach SummarizationKernel<<>>( countl, childl, massl, nodes_pos, nodes_pos + nnodes + 1, NNODES, n, bottomd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" // Group closed bodies together, used to speed up Repulsion kernel SortKernel<<>>(sortl, countl, startl, childl, NNODES, n, bottomd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this should work on "stream" // Force computation O(n . 
log(n)) RepulsionKernel<<>>(scaling_ratio, theta, @@ -251,7 +262,7 @@ void barnes_hut(experimental::GraphCOOView &graph, n, radiusd_squared, maxdepthd); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); apply_gravity(nodes_pos, nodes_pos + nnodes + 1, @@ -324,7 +335,7 @@ void barnes_hut(experimental::GraphCOOView &graph, copy(n, nodes_pos, pos); copy(n, nodes_pos + nnodes + 1, pos + n); - if (callback) callback->on_epoch_end(nodes_pos); + if (callback) callback->on_train_end(nodes_pos); } } // namespace detail diff --git a/cpp/src/layout/exact_fa2.hpp b/cpp/src/layout/exact_fa2.hpp index d138b5dd57c..e9f73e04cd5 100644 --- a/cpp/src/layout/exact_fa2.hpp +++ b/cpp/src/layout/exact_fa2.hpp @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -32,7 +32,7 @@ namespace cugraph { namespace detail { template -void exact_fa2(experimental::GraphCOOView &graph, +void exact_fa2(GraphCOOView &graph, float *pos, const int max_iter = 500, float *x_start = nullptr, @@ -84,9 +84,10 @@ void exact_fa2(experimental::GraphCOOView &graph, // Sort COO for coalesced memory access. cudaStream_t stream = {nullptr}; sort(graph, stream); - CUDA_CHECK_LAST(); - graph.degree(d_mass, cugraph::experimental::DegreeDirection::OUT); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); + // FIXME: this function should work on "stream" + graph.degree(d_mass, cugraph::DegreeDirection::OUT); + CHECK_CUDA(stream); const vertex_t *row = graph.src_indices; const vertex_t *col = graph.dst_indices; diff --git a/cpp/src/layout/exact_repulsion.hpp b/cpp/src/layout/exact_repulsion.hpp index 1a7db88f782..713ac654326 100644 --- a/cpp/src/layout/exact_repulsion.hpp +++ b/cpp/src/layout/exact_repulsion.hpp @@ -62,9 +62,10 @@ void apply_repulsion(const float *restrict x_pos, dim3 nblocks(min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS_2D), min((n + nthreads.y - 1) / nthreads.y, CUDA_MAX_BLOCKS_2D)); + // FIXME: apply repulsion should take stream as an input argument repulsion_kernel <<>>(x_pos, y_pos, repel_x, repel_y, mass, scaling_ratio, n); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } } // namespace detail diff --git a/cpp/src/layout/fa2_kernels.hpp b/cpp/src/layout/fa2_kernels.hpp index 7ecbb961000..06e73c3dda4 100644 --- a/cpp/src/layout/fa2_kernels.hpp +++ b/cpp/src/layout/fa2_kernels.hpp @@ -23,20 +23,19 @@ namespace cugraph { namespace detail { template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - attraction_kernel(const vertex_t *restrict row, - const vertex_t *restrict col, - const weight_t *restrict v, - const edge_t e, - const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, - bool outbound_attraction_distribution, - bool lin_log_mode, - const float edge_weight_influence, - const float coef) +__global__ void attraction_kernel(const vertex_t *restrict row, + const vertex_t *restrict col, + const weight_t *restrict v, + const edge_t e, + const float *restrict x_pos, + const float *restrict y_pos, + float *restrict attract_x, + float *restrict attract_y, + const int *restrict mass, + bool outbound_attraction_distribution, + bool lin_log_mode, + const float edge_weight_influence, + const float coef) { vertex_t i, src, dst; weight_t weight = 1; @@ -112,18 +111,17 @@ void apply_attraction(const vertex_t *restrict row, edge_weight_influence, coef); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - linear_gravity_kernel(const float *restrict 
x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, - const float gravity, - const vertex_t n) +__global__ void linear_gravity_kernel(const float *restrict x_pos, + const float *restrict y_pos, + float *restrict attract_x, + float *restrict attract_y, + const int *restrict mass, + const float gravity, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -137,15 +135,14 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - strong_gravity_kernel(const float *restrict x_pos, - const float *restrict y_pos, - float *restrict attract_x, - float *restrict attract_y, - const int *restrict mass, - const float gravity, - const float scaling_ratio, - const vertex_t n) +__global__ void strong_gravity_kernel(const float *restrict x_pos, + const float *restrict y_pos, + float *restrict attract_x, + float *restrict attract_y, + const int *restrict mass, + const float gravity, + const float scaling_ratio, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -183,21 +180,20 @@ void apply_gravity(const float *restrict x_pos, else linear_gravity_kernel <<>>(x_pos, y_pos, attract_x, attract_y, mass, gravity, n); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - local_speed_kernel(const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - const float *restrict old_dx, - const float *restrict old_dy, - const int *restrict mass, - float *restrict swinging, - float *restrict traction, - const vertex_t n) +__global__ void local_speed_kernel(const float *restrict repel_x, + const float *restrict repel_y, + const float *restrict attract_x, + const float *restrict attract_y, + const float *restrict old_dx, + const float *restrict old_dy, + const int *restrict mass, + float *restrict swinging, + float *restrict traction, + const vertex_t n) { // For every node. for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -232,7 +228,7 @@ void compute_local_speed(const float *restrict repel_x, local_speed_kernel<<>>( repel_x, repel_y, attract_x, attract_y, old_dx, old_dy, mass, swinging, traction, n); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } template @@ -272,18 +268,17 @@ void adapt_speed(const float jitter_tolerance, } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - update_positions_kernel(float *restrict x_pos, - float *restrict y_pos, - const float *restrict repel_x, - const float *restrict repel_y, - const float *restrict attract_x, - const float *restrict attract_y, - float *restrict old_dx, - float *restrict old_dy, - const float *restrict swinging, - const float speed, - const vertex_t n) +__global__ void update_positions_kernel(float *restrict x_pos, + float *restrict y_pos, + const float *restrict repel_x, + const float *restrict repel_y, + const float *restrict attract_x, + const float *restrict attract_y, + float *restrict old_dx, + float *restrict old_dy, + const float *restrict swinging, + const float speed, + const vertex_t n) { // For every node. 
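// (Reviewer note: this is the standard CUDA grid-stride idiom used by all of
// the kernels in this file -- each thread starts at its global index and
// advances by the total number of launched threads, so any grid size covers
// all n vertices exactly once.)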
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { @@ -321,7 +316,7 @@ void apply_forces(float *restrict x_pos, update_positions_kernel<<>>( x_pos, y_pos, repel_x, repel_y, attract_x, attract_y, old_dx, old_dy, swinging, speed, n); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } } // namespace detail diff --git a/cpp/src/layout/force_atlas2.cu b/cpp/src/layout/force_atlas2.cu index 59a5c58aa73..15ac8120ce5 100644 --- a/cpp/src/layout/force_atlas2.cu +++ b/cpp/src/layout/force_atlas2.cu @@ -20,7 +20,7 @@ namespace cugraph { template -void force_atlas2(experimental::GraphCOOView &graph, +void force_atlas2(GraphCOOView &graph, float *pos, const int max_iter, float *x_start, @@ -77,7 +77,7 @@ void force_atlas2(experimental::GraphCOOView &graph, } } -template void force_atlas2(experimental::GraphCOOView &graph, +template void force_atlas2(GraphCOOView &graph, float *pos, const int max_iter, float *x_start, @@ -95,7 +95,7 @@ template void force_atlas2(experimental::GraphCOOView(experimental::GraphCOOView &graph, +template void force_atlas2(GraphCOOView &graph, float *pos, const int max_iter, float *x_start, diff --git a/cpp/src/layout/utils.hpp b/cpp/src/layout/utils.hpp index e26f93e8f71..7d639660831 100644 --- a/cpp/src/layout/utils.hpp +++ b/cpp/src/layout/utils.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include namespace cugraph { diff --git a/cpp/src/link_analysis/gunrock_hits.cpp b/cpp/src/link_analysis/gunrock_hits.cpp new file mode 100644 index 00000000000..84c6036ad70 --- /dev/null +++ b/cpp/src/link_analysis/gunrock_hits.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * ---------------------------------------------------------------------------* + * @brief wrapper calling gunrock's HITS analytic + * --------------------------------------------------------------------------*/ + +#include +#include + +#include + +#include + +namespace cugraph { + +namespace gunrock { + +template +void hits(cugraph::GraphCSRView const &graph, + int max_iter, + weight_t tolerance, + weight_t const *starting_value, + bool normalized, + weight_t *hubs, + weight_t *authorities) +{ + CUGRAPH_EXPECTS(hubs != nullptr, "Invalid API parameter: hubs array should be of size V"); + CUGRAPH_EXPECTS(authorities != nullptr, + "Invalid API parameter: authorities array should be of size V"); + + // + // NOTE: gunrock doesn't support tolerance parameter + // gunrock doesn't support passing a starting value + // gunrock doesn't support the normalized parameter + // + // FIXME: gunrock uses a 2-norm, while networkx uses a 1-norm. + // They will add a parameter to allow us to specify + // which norm to use. 
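// Reviewer sketch of the norm gap flagged above (illustration only): a result
// normalized to unit 2-norm can be rescaled to networkx's unit 1-norm on the
// host after the fact, assuming <numeric> and <cmath> are available:
//
//   double l1 = std::accumulate(local_hubs.begin(), local_hubs.end(), 0.0,
//                               [](double acc, double v) { return acc + std::abs(v); });
//   if (l1 > 0)
//     for (auto &v : local_hubs) v /= l1;  // hubs now comparable to networkx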
+ // + std::vector local_offsets(graph.number_of_vertices + 1); + std::vector local_indices(graph.number_of_edges); + std::vector local_hubs(graph.number_of_vertices); + std::vector local_authorities(graph.number_of_vertices); + + // Ideally: + // + //::hits(graph.number_of_vertices, graph.number_of_edges, graph.offsets, graph.indices, + // max_iter, hubs, authorities, DEVICE); + // + // For now, the following: + + CUDA_TRY(cudaMemcpy(local_offsets.data(), + graph.offsets, + (graph.number_of_vertices + 1) * sizeof(edge_t), + cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(local_indices.data(), + graph.indices, + graph.number_of_edges * sizeof(vertex_t), + cudaMemcpyDeviceToHost)); + + ::hits(graph.number_of_vertices, + graph.number_of_edges, + local_offsets.data(), + local_indices.data(), + max_iter, + local_hubs.data(), + local_authorities.data()); + + CUDA_TRY(cudaMemcpy( + hubs, local_hubs.data(), graph.number_of_vertices * sizeof(weight_t), cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(authorities, + local_authorities.data(), + graph.number_of_vertices * sizeof(weight_t), + cudaMemcpyHostToDevice)); +} + +template void hits(cugraph::GraphCSRView const &, + int, + float, + float const *, + bool, + float *, + float *); + +} // namespace gunrock + +} // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index b989c46cb07..e5da24e328d 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -22,13 +22,16 @@ #include #include "cub/cub.cuh" -#include +#include #include -#include +#include #include +#include "pagerank_1D.cuh" #include "utilities/graph_utils.cuh" +#include + namespace cugraph { namespace detail { @@ -37,7 +40,8 @@ namespace detail { #endif template -bool pagerankIteration(IndexType n, +bool pagerankIteration(raft::handle_t const &handle, + IndexType n, IndexType e, IndexType const *cscPtr, IndexType const *cscInd, @@ -55,6 +59,14 @@ bool pagerankIteration(IndexType n, ValueType *residual) { ValueType dot_res; +//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000 +#if 1 + { + raft::matrix::sparse_matrix_t const r_csr_m{ + handle, cscPtr, cscInd, cscVal, n, e}; + r_csr_m.mv(1.0, tmp, 0.0, pr); + } +#else CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, @@ -65,7 +77,7 @@ bool pagerankIteration(IndexType n, n, n, e)); - +#endif scal(n, alpha, pr); dot_res = dot(n, a, tmp); axpy(n, dot_res, b, pr); @@ -92,7 +104,8 @@ bool pagerankIteration(IndexType n, } template -int pagerankSolver(IndexType n, +int pagerankSolver(raft::handle_t const &handle, + IndexType n, IndexType e, IndexType const *cscPtr, IndexType const *cscInd, @@ -142,7 +155,8 @@ int pagerankSolver(IndexType n, rmm::device_vector tmp(n); tmp_d = pr.data().get(); #endif - CUDA_CHECK_LAST(); + // FIXME: this should take a passed CUDA stream instead of default nullptr + CHECK_CUDA(nullptr); if (!has_guess) { fill(n, pagerank_vector, randomProbability); @@ -165,6 +179,14 @@ int pagerankSolver(IndexType n, } update_dangling_nodes(n, a, alpha); +//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000 +#if 1 + { + raft::matrix::sparse_matrix_t const r_csr_m{ + handle, cscPtr, cscInd, cscVal, n, e}; + r_csr_m.mv(1.0, tmp_d, 0.0, pagerank_vector); + } +#else CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, @@ -175,6 +197,7 @@ int pagerankSolver(IndexType n, n, n, e)); +#endif // Allocate temporary storage rmm::device_buffer cub_temp_storage(cub_temp_storage_bytes);
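// (Reviewer note: in dense-math terms, each call into pagerankIteration below
// computes
//    pr <- alpha * (A * tmp) + (a . tmp) * b
// where A is the substochastic transition matrix held in CSC form, `a` flags
// dangling nodes, and `b` is the uniform -- or personalization -- vector. The
// SpMV is the raft mv() call, scal() applies the damping factor alpha, and
// axpy() folds the dangling-node mass back in.)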
cub_d_temp_storage = cub_temp_storage.data(); @@ -191,7 +214,8 @@ int pagerankSolver(IndexType n, while (!converged && i < max_it) { i++; - converged = pagerankIteration(n, + converged = pagerankIteration(handle, + n, e, cscPtr, cscInd, @@ -225,7 +249,8 @@ int pagerankSolver(IndexType n, // template int pagerankSolver ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, // half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half // * &residual); -template int pagerankSolver(int n, +template int pagerankSolver(raft::handle_t const &handle, + int n, int e, int const *cscPtr, int const *cscInd, @@ -241,7 +266,8 @@ template int pagerankSolver(int n, int max_iter, float *&pagerank_vector, float *&residual); -template int pagerankSolver(int n, +template int pagerankSolver(raft::handle_t const &handle, + int n, int e, const int *cscPtr, int const *cscInd, @@ -259,14 +285,15 @@ template int pagerankSolver(int n, double *&residual); template -void pagerank_impl(experimental::GraphCSCView const &graph, +void pagerank_impl(raft::handle_t const &handle, + GraphCSCView const &graph, WT *pagerank, VT personalization_subset_size = 0, VT *personalization_subset = nullptr, WT *personalization_values = nullptr, double alpha = 0.85, - double tolerance = 1e-4, - int64_t max_iter = 200, + double tolerance = 1e-5, + int64_t max_iter = 100, bool has_guess = false) { bool has_personalization = false; @@ -310,7 +337,8 @@ void pagerank_impl(experimental::GraphCSCView const &graph, if (has_guess) { copy(m, (WT *)pagerank, d_pr); } - status = pagerankSolver(m, + status = pagerankSolver(handle, + m, nnz, graph.offsets, graph.indices, @@ -330,7 +358,7 @@ void pagerank_impl(experimental::GraphCSCView const &graph, switch (status) { case 0: break; case -1: CUGRAPH_FAIL("Error : bad parameters in Pagerank"); - case 1: CUGRAPH_FAIL("Warning : Pagerank did not reached the desired tolerance"); + case 1: break; // Warning: Pagerank did not reach the desired tolerance default: CUGRAPH_FAIL("Pagerank exec failed"); } @@ -339,7 +367,8 @@ void pagerank_impl(experimental::GraphCSCView const &graph, } // namespace detail template -void pagerank(experimental::GraphCSCView const &graph, +void pagerank(raft::handle_t const &handle, + GraphCSCView const &graph, WT *pagerank, VT personalization_subset_size, VT *personalization_subset, @@ -350,20 +379,37 @@ void pagerank(experimental::GraphCSCView const &graph, bool has_guess) { CUGRAPH_EXPECTS(pagerank != nullptr, "Invalid API parameter: Pagerank array should be of size V"); - - return detail::pagerank_impl(graph, - pagerank, - personalization_subset_size, - personalization_subset, - personalization_values, - alpha, - tolerance, - max_iter, - has_guess); + // Multi-GPU + if (handle.comms_initialized()) { + CUGRAPH_EXPECTS(has_guess == false, + "Invalid API parameter: Multi-GPU Pagerank does not support an initial " + "guess, please use the single GPU version for this feature"); + CUGRAPH_EXPECTS(max_iter > 0, "The number of iterations must be positive"); + cugraph::mg::pagerank(handle, + graph, + pagerank, + personalization_subset_size, + personalization_subset, + personalization_values, + alpha, + max_iter, + tolerance); + } else // Single GPU + return detail::pagerank_impl(handle, + graph, + pagerank, + personalization_subset_size, + personalization_subset, + personalization_values, + alpha, + tolerance, + max_iter, + has_guess); } // explicit instantiation -template void pagerank(experimental::GraphCSCView const &graph, +template void pagerank(raft::handle_t
const &handle, + GraphCSCView const &graph, float *pagerank, int personalization_subset_size, int *personalization_subset, @@ -372,7 +418,8 @@ template void pagerank(experimental::GraphCSCView(experimental::GraphCSCView const &graph, +template void pagerank(raft::handle_t const &handle, + GraphCSCView const &graph, double *pagerank, int personalization_subset_size, int *personalization_subset, diff --git a/cpp/src/link_analysis/pagerank_1D.cu b/cpp/src/link_analysis/pagerank_1D.cu new file mode 100644 index 00000000000..27780626480 --- /dev/null +++ b/cpp/src/link_analysis/pagerank_1D.cu @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Alex Fender afender@nvidia.com + +#include +#include +#include "pagerank_1D.cuh" +#include "utilities/graph_utils.cuh" + +namespace cugraph { +namespace mg { + +template +__global__ void transition_kernel(const size_t e, const VT *ind, const VT *degree, WT *val) +{ + for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) + val[i] = 1.0 / degree[ind[i]]; // Degree contains IN degree. So all degree[ind[i]] were + // incremented by definition (no div by 0). +} + +template +Pagerank::Pagerank(const raft::handle_t &handle_, GraphCSCView const &G) + : comm(handle_.get_comms()), + bookmark(G.number_of_vertices), + prev_pr(G.number_of_vertices), + val(G.local_edges[comm.get_rank()]), + handle(handle_), + has_personalization(false) +{ + v_glob = G.number_of_vertices; + v_loc = G.local_vertices[comm.get_rank()]; + e_loc = G.local_edges[comm.get_rank()]; + part_off = G.local_offsets; + local_vertices = G.local_vertices; + off = G.offsets; + ind = G.indices; + blocks = handle_.get_device_properties().maxGridSize[0]; + threads = handle_.get_device_properties().maxThreadsPerBlock; + sm_count = handle_.get_device_properties().multiProcessorCount; + + is_setup = false; +} + +template +Pagerank::~Pagerank() +{ +} + +template +void Pagerank::transition_vals(const VT *degree) +{ + if (e_loc > 0) { + int threads = std::min(e_loc, this->threads); + int blocks = std::min(32 * sm_count, this->blocks); + transition_kernel<<>>(e_loc, ind, degree, val.data().get()); + CHECK_CUDA(nullptr); + } +} + +template +void Pagerank::flag_leafs(const VT *degree) +{ + if (v_glob > 0) { + int threads = std::min(v_glob, this->threads); + int blocks = std::min(32 * sm_count, this->blocks); + cugraph::detail::flag_leafs_kernel + <<>>(v_glob, degree, bookmark.data().get()); + CHECK_CUDA(nullptr); + } +} + +// Artificially create the google matrix by setting val and bookmark +template +void Pagerank::setup(WT _alpha, + VT *degree, + VT personalization_subset_size, + VT *personalization_subset, + WT *personalization_values) +{ + if (!is_setup) { + alpha = _alpha; + WT zero = 0.0; + WT one = 1.0; + // Update dangling node vector + cugraph::detail::fill(v_glob, bookmark.data().get(), zero); + flag_leafs(degree); + cugraph::detail::update_dangling_nodes(v_glob, 
bookmark.data().get(), alpha); + + // Transition matrix + transition_vals(degree); + + // personalize + if (personalization_subset_size != 0) { + CUGRAPH_EXPECTS(personalization_subset != nullptr, + "Invalid API parameter: personalization_subset array should be of size " + "personalization_subset_size"); + CUGRAPH_EXPECTS(personalization_values != nullptr, + "Invalid API parameter: personalization_values array should be of size " + "personalization_subset_size"); + CUGRAPH_EXPECTS(personalization_subset_size <= v_glob, + "Personalization size should not exceed V"); + + WT sum = cugraph::detail::nrm1(personalization_subset_size, personalization_values); + if (sum != zero) { + has_personalization = true; + personalization_vector.resize(v_glob); + cugraph::detail::fill(v_glob, personalization_vector.data().get(), zero); + cugraph::detail::scal(v_glob, one / sum, personalization_values); + cugraph::detail::scatter(personalization_subset_size, + personalization_values, + personalization_vector.data().get(), + personalization_subset); + } + } + is_setup = true; + } else + CUGRAPH_FAIL("MG PageRank : Setup can be called only once"); +} + +// run the power iteration on the google matrix +template +int Pagerank::solve(int max_iter, float tolerance, WT *pagerank) +{ + if (is_setup) { + WT dot_res; + WT one = 1.0; + WT *pr = pagerank; + cugraph::detail::fill(v_glob, pagerank, one / v_glob); + cugraph::detail::fill(v_glob, prev_pr.data().get(), one / v_glob); + // This CUDA sync was added to fix #426 + // This should not be required in theory + // This is not needed on one GPU at this time + cudaDeviceSynchronize(); + dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr); + MGcsrmv spmv_solver( + handle, local_vertices, part_off, off, ind, val.data().get(), pagerank); + + WT residual; + int i; + for (i = 0; i < max_iter; ++i) { + spmv_solver.run(pagerank); + cugraph::detail::scal(v_glob, alpha, pr); + + // personalization + if (has_personalization) + cugraph::detail::axpy(v_glob, dot_res, personalization_vector.data().get(), pr); + else + cugraph::detail::addv(v_glob, dot_res * (one / v_glob), pr); + + dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr); + cugraph::detail::scal(v_glob, one / cugraph::detail::nrm2(v_glob, pr), pr); + + // convergence check + cugraph::detail::axpy(v_glob, (WT)-1.0, pr, prev_pr.data().get()); + residual = cugraph::detail::nrm2(v_glob, prev_pr.data().get()); + if (residual < tolerance) + break; + else + cugraph::detail::copy(v_glob, pr, prev_pr.data().get()); + } + cugraph::detail::scal(v_glob, one / cugraph::detail::nrm1(v_glob, pr), pr); + return i; + } else { + CUGRAPH_FAIL("MG PageRank : Solve was called before setup"); + } +} + +template class Pagerank; +template class Pagerank; + +} // namespace mg +} // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank_1D.cuh b/cpp/src/link_analysis/pagerank_1D.cuh new file mode 100644 index 00000000000..feb410daa9a --- /dev/null +++ b/cpp/src/link_analysis/pagerank_1D.cuh @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: Alex Fender afender@nvidia.com + +#pragma once + +#include +#include +#include + +#include "utilities/error.hpp" +#include "utilities/spmv_1D.cuh" + +namespace cugraph { +namespace mg { + +template +class Pagerank { + private: + VT v_glob{}; // global number of vertices + VT v_loc{}; // local number of vertices + ET e_loc{}; // local number of edges + WT alpha{}; // damping factor + bool has_personalization; + // CUDA + const raft::comms::comms_t &comm; // info about the mg comm setup + cudaStream_t stream; + int blocks; + int threads; + int sm_count; + + // Vertex offsets for each partition. + VT *part_off; + VT *local_vertices; + + // Google matrix + ET *off; + VT *ind; + + rmm::device_vector val; // values of the substochastic matrix + rmm::device_vector bookmark; // constant vector with dangling node info + rmm::device_vector prev_pr; // record the last pagerank for convergence check + rmm::device_vector personalization_vector; // personalization vector after reconstruction + + bool is_setup; + raft::handle_t const &handle; // raft handle propagation for SpMV, etc. + + public: + Pagerank(const raft::handle_t &handle, const GraphCSCView &G); + ~Pagerank(); + + void transition_vals(const VT *degree); + + void flag_leafs(const VT *degree); + + // Artificially create the google matrix by setting val and bookmark + void setup(WT _alpha, + VT *degree, + VT personalization_subset_size, + VT *personalization_subset, + WT *personalization_values); + + // run the power iteration on the google matrix, return the number of iterations + int solve(int max_iter, float tolerance, WT *pagerank); +}; + +template +int pagerank(raft::handle_t const &handle, + const GraphCSCView &G, + WT *pagerank_result, + VT personalization_subset_size, + VT *personalization_subset, + WT *personalization_values, + const double damping_factor = 0.85, + const int64_t n_iter = 100, + const double tolerance = 1e-5) +{ + // null pointers check + CUGRAPH_EXPECTS(G.offsets != nullptr, "Invalid API parameter - offsets is null"); + CUGRAPH_EXPECTS(G.indices != nullptr, "Invalid API parameter - indices is null"); + CUGRAPH_EXPECTS(pagerank_result != nullptr, + "Invalid API parameter - pagerank output memory must be allocated"); + + // parameter values + CUGRAPH_EXPECTS(damping_factor > 0.0, + "Invalid API parameter - invalid damping factor value (alpha<0)"); + CUGRAPH_EXPECTS(damping_factor < 1.0, + "Invalid API parameter - invalid damping factor value (alpha>1)"); + CUGRAPH_EXPECTS(n_iter > 0, "Invalid API parameter - n_iter must be > 0"); + + rmm::device_vector degree(G.number_of_vertices); + + // in-degree of CSC (equivalent to out-degree of original edge list) + G.degree(degree.data().get(), DegreeDirection::IN); + + // Allocate and initialize Pagerank class + Pagerank pr_solver(handle, G); + + // Set all constants info + pr_solver.setup(damping_factor, + degree.data().get(), + personalization_subset_size, + personalization_subset, + personalization_values); + + // Run pagerank + return pr_solver.solve(n_iter, tolerance, pagerank_result); +} + +} // namespace mg +} // namespace cugraph diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu index 8462466f9e9..70952974b39 100644 --- a/cpp/src/link_prediction/jaccard.cu +++ b/cpp/src/link_prediction/jaccard.cu @@ -20,7 +20,7 @@ * ---------------------------------------------------------------------------**/ #include -#include
+#include #include "graph.hpp" #include "utilities/graph_utils.cuh" @@ -29,7 +29,7 @@ namespace detail { // Volume of neighboors (*weight_s) template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) jaccard_row_sum( +__global__ void jaccard_row_sum( vertex_t n, edge_t const *csrPtr, vertex_t const *csrInd, weight_t const *v, weight_t *work) { vertex_t row; @@ -53,13 +53,13 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) jaccard_row_sum( // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) jaccard_is(vertex_t n, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) +__global__ void jaccard_is(vertex_t n, + edge_t const *csrPtr, + vertex_t const *csrInd, + weight_t const *v, + weight_t *work, + weight_t *weight_i, + weight_t *weight_s) { edge_t i, j, Ni, Nj; vertex_t row, col; @@ -117,16 +117,15 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) jaccard_is(vertex_t n // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) // Using list of node pairs template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_is_pairs(edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) +__global__ void jaccard_is_pairs(edge_t num_pairs, + edge_t const *csrPtr, + vertex_t const *csrInd, + vertex_t const *first_pair, + vertex_t const *second_pair, + weight_t const *v, + weight_t *work, + weight_t *weight_i, + weight_t *weight_s) { edge_t i, idx, Ni, Nj, match; vertex_t row, col, ref, cur, ref_col, cur_col; @@ -182,8 +181,10 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) // Jaccard weights (*weight) template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_jw(edge_t e, weight_t const *weight_i, weight_t const *weight_s, weight_t *weight_j) +__global__ void jaccard_jw(edge_t e, + weight_t const *weight_i, + weight_t const *weight_s, + weight_t *weight_j) { edge_t j; weight_t Wi, Ws, Wu; @@ -312,7 +313,7 @@ int jaccard_pairs(vertex_t n, } // namespace detail template -void jaccard(experimental::GraphCSRView const &graph, WT const *weights, WT *result) +void jaccard(GraphCSRView const &graph, WT const *weights, WT *result) { CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); @@ -344,7 +345,7 @@ void jaccard(experimental::GraphCSRView const &graph, WT const *weig } template -void jaccard_list(experimental::GraphCSRView const &graph, +void jaccard_list(GraphCSRView const &graph, WT const *weights, ET num_pairs, VT const *first, @@ -386,41 +387,41 @@ void jaccard_list(experimental::GraphCSRView const &graph, } } -template void jaccard( - experimental::GraphCSRView const &, float const *, float *); -template void jaccard( - experimental::GraphCSRView const &, double const *, double *); -template void jaccard( - experimental::GraphCSRView const &, float const *, float *); -template void jaccard( - experimental::GraphCSRView const &, double const *, double *); -template void jaccard_list( - experimental::GraphCSRView const &, - float const *, - int32_t, - int32_t const *, - int32_t const *, - float *); -template void jaccard_list( - experimental::GraphCSRView const &, - double const *, - int32_t, - int32_t const *, - int32_t const *, 
- double *); -template void jaccard_list( - experimental::GraphCSRView const &, - float const *, - int64_t, - int64_t const *, - int64_t const *, - float *); -template void jaccard_list( - experimental::GraphCSRView const &, - double const *, - int64_t, - int64_t const *, - int64_t const *, - double *); +template void jaccard(GraphCSRView const &, + float const *, + float *); +template void jaccard(GraphCSRView const &, + double const *, + double *); +template void jaccard(GraphCSRView const &, + float const *, + float *); +template void jaccard(GraphCSRView const &, + double const *, + double *); +template void jaccard_list(GraphCSRView const &, + float const *, + int32_t, + int32_t const *, + int32_t const *, + float *); +template void jaccard_list(GraphCSRView const &, + double const *, + int32_t, + int32_t const *, + int32_t const *, + double *); +template void jaccard_list(GraphCSRView const &, + float const *, + int64_t, + int64_t const *, + int64_t const *, + float *); +template void jaccard_list(GraphCSRView const &, + double const *, + int64_t, + int64_t const *, + int64_t const *, + double *); } // namespace cugraph diff --git a/cpp/src/link_prediction/overlap.cu b/cpp/src/link_prediction/overlap.cu index ed945c378bd..e3f80b50d9a 100644 --- a/cpp/src/link_prediction/overlap.cu +++ b/cpp/src/link_prediction/overlap.cu @@ -20,7 +20,7 @@ * ---------------------------------------------------------------------------**/ #include -#include +#include #include "graph.hpp" #include "utilities/graph_utils.cuh" @@ -30,7 +30,7 @@ namespace detail { // Volume of neighboors (*weight_s) // TODO: Identical kernel to jaccard_row_sum!! template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) overlap_row_sum( +__global__ void overlap_row_sum( vertex_t n, edge_t const *csrPtr, vertex_t const *csrInd, weight_t const *v, weight_t *work) { vertex_t row; @@ -55,13 +55,13 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) overlap_row_sum( // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) // TODO: Identical kernel to jaccard_row_sum!! 
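// (Reviewer note, for orientation across jaccard.cu and this file: writing
// w(S) for the summed vertex weights of a set S, the two measures are
//    Jaccard(u, v) = w(N(u) ∩ N(v)) / w(N(u) ∪ N(v))
//    Overlap(u, v) = w(N(u) ∩ N(v)) / min(w(N(u)), w(N(v)))
// Both need the same row sums and intersection volumes -- hence the identical
// kernels flagged above -- and they differ only in the final weighting kernel.)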
template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) overlap_is(vertex_t n, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) +__global__ void overlap_is(vertex_t n, + edge_t const *csrPtr, + vertex_t const *csrInd, + weight_t const *v, + weight_t *work, + weight_t *weight_i, + weight_t *weight_s) { edge_t i, j, Ni, Nj; vertex_t row, col; @@ -120,16 +120,15 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) overlap_is(vertex_t n // Using list of node pairs // NOTE: NOT the same as jaccard template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_is_pairs(edge_t num_pairs, - edge_t const *csrPtr, - vertex_t const *csrInd, - vertex_t const *first_pair, - vertex_t const *second_pair, - weight_t const *v, - weight_t *work, - weight_t *weight_i, - weight_t *weight_s) +__global__ void overlap_is_pairs(edge_t num_pairs, + edge_t const *csrPtr, + vertex_t const *csrInd, + vertex_t const *first_pair, + vertex_t const *second_pair, + weight_t const *v, + weight_t *work, + weight_t *weight_i, + weight_t *weight_s) { edge_t i, idx, Ni, Nj, match; vertex_t row, col, ref, cur, ref_col, cur_col; @@ -185,12 +184,12 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) // Overlap weights (*weight) template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) overlap_jw(edge_t e, - edge_t const *csrPtr, - vertex_t const *csrInd, - weight_t *weight_i, - weight_t *weight_s, - weight_t *weight_j) +__global__ void overlap_jw(edge_t e, + edge_t const *csrPtr, + vertex_t const *csrInd, + weight_t *weight_i, + weight_t *weight_s, + weight_t *weight_j) { edge_t j; weight_t Wi, Wu; @@ -315,7 +314,7 @@ int overlap_pairs(vertex_t n, } // namespace detail template -void overlap(experimental::GraphCSRView const &graph, WT const *weights, WT *result) +void overlap(GraphCSRView const &graph, WT const *weights, WT *result) { CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL"); @@ -347,7 +346,7 @@ void overlap(experimental::GraphCSRView const &graph, WT const *weig } template -void overlap_list(experimental::GraphCSRView const &graph, +void overlap_list(GraphCSRView const &graph, WT const *weights, ET num_pairs, VT const *first, @@ -389,41 +388,41 @@ void overlap_list(experimental::GraphCSRView const &graph, } } -template void overlap( - experimental::GraphCSRView const &, float const *, float *); -template void overlap( - experimental::GraphCSRView const &, double const *, double *); -template void overlap( - experimental::GraphCSRView const &, float const *, float *); -template void overlap( - experimental::GraphCSRView const &, double const *, double *); -template void overlap_list( - experimental::GraphCSRView const &, - float const *, - int32_t, - int32_t const *, - int32_t const *, - float *); -template void overlap_list( - experimental::GraphCSRView const &, - double const *, - int32_t, - int32_t const *, - int32_t const *, - double *); -template void overlap_list( - experimental::GraphCSRView const &, - float const *, - int64_t, - int64_t const *, - int64_t const *, - float *); -template void overlap_list( - experimental::GraphCSRView const &, - double const *, - int64_t, - int64_t const *, - int64_t const *, - double *); +template void overlap(GraphCSRView const &, + float const *, + float *); +template void overlap(GraphCSRView const &, + double const *, + double *); +template void overlap(GraphCSRView const &, + float const *, + float *); 
+template void overlap(GraphCSRView const &, + double const *, + double *); +template void overlap_list(GraphCSRView const &, + float const *, + int32_t, + int32_t const *, + int32_t const *, + float *); +template void overlap_list(GraphCSRView const &, + double const *, + int32_t, + int32_t const *, + int32_t const *, + double *); +template void overlap_list(GraphCSRView const &, + float const *, + int64_t, + int64_t const *, + int64_t const *, + float *); +template void overlap_list(GraphCSRView const &, + double const *, + int64_t, + int64_t const *, + int64_t const *, + double *); } // namespace cugraph diff --git a/cpp/src/nvgraph/include/async_event.cuh b/cpp/src/nvgraph/include/async_event.cuh deleted file mode 100644 index e7bf04fa33f..00000000000 --- a/cpp/src/nvgraph/include/async_event.cuh +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -class AsyncEvent { - public: - AsyncEvent() : async_event(NULL) {} - AsyncEvent(int size) : async_event(NULL) { cudaEventCreate(&async_event); } - ~AsyncEvent() - { - if (async_event != NULL) cudaEventDestroy(async_event); - } - - void create() { cudaEventCreate(&async_event); } - void record(cudaStream_t s = 0) - { - if (async_event == NULL) { - cudaEventCreate(&async_event); // check if we haven't created the event yet - } - - cudaEventRecord(async_event, s); - } - void sync() { cudaEventSynchronize(async_event); } - - private: - cudaEvent_t async_event; -}; diff --git a/cpp/src/nvgraph/include/atomics.hxx b/cpp/src/nvgraph/include/atomics.hxx deleted file mode 100644 index 4cd02764ed7..00000000000 --- a/cpp/src/nvgraph/include/atomics.hxx +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -namespace nvgraph { -//This file contains the atomic operations for floats and doubles from cusparse/src/cusparse_atomics.h - -static __inline__ __device__ double atomicFPAdd(double *addr, double val) -{ -// atomicAdd for double starts with sm_60 -#if __CUDA_ARCH__ >= 600 - return atomicAdd( addr, val ); -#else - unsigned long long old = __double_as_longlong( addr[0] ), assumed; - - do - { - assumed = old; - old = atomicCAS( (unsigned long long *) addr, assumed, __double_as_longlong( val + __longlong_as_double( assumed ) ) ); - } - while ( assumed != old ); - - return old; -#endif -} - -// atomicAdd for float starts with sm_20 -static __inline__ __device__ float atomicFPAdd(float *addr, float val) -{ - return atomicAdd( addr, val ); -} - -static __inline__ __device__ double atomicFPMin(double *addr, double val) -{ - double old, assumed; - old=*addr; - do{ - assumed = old; - old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), - __double_as_longlong(min(val,assumed)))); - } while (__double_as_longlong(assumed) != __double_as_longlong(old)); - return old; -} - -/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ -static __inline__ __device__ float atomicFPMin(float *addr, float val) -{ - float old, assumed; - old=*addr; - do{ - assumed = old; - old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(min(val,assumed)))); - } while (float_as_int(assumed) != float_as_int(old)); - - return old; -} - -static __inline__ __device__ double atomicFPMax(double *addr, double val) -{ - double old, assumed; - old=*addr; - do{ - assumed = old; - old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), - __double_as_longlong(max(val,assumed)))); - } while (__double_as_longlong(assumed) != __double_as_longlong(old)); - return old; -} - -/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ -static __inline__ __device__ float atomicFPMax(float *addr, float val) -{ - float old, assumed; - old=*addr; - do{ - assumed = old; - old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(max(val,assumed)))); - } while (float_as_int(assumed) != float_as_int(old)); - - return old; -} - -static __inline__ __device__ double atomicFPOr(double *addr, double val) -{ - double old, assumed; - old=*addr; - do{ - assumed = old; - old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), - __double_as_longlong((bool)val | (bool)assumed))); - } while (__double_as_longlong(assumed) != __double_as_longlong(old)); - return old; -} - -/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ -static __inline__ __device__ float atomicFPOr(float *addr, float val) -{ - float old, assumed; - old=*addr; - do{ - assumed = old; - old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int((bool)val | (bool)assumed))); - } while (float_as_int(assumed) != float_as_int(old)); - - return old; -} - -static __inline__ __device__ double atomicFPLog(double *addr, double val) -{ - double old, assumed; - old=*addr; - do{ - assumed = old; - old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed), - __double_as_longlong(-log(exp(-val)+exp(-assumed))))); - } while (__double_as_longlong(assumed) != __double_as_longlong(old)); - return old; -} - -/* atomic addition: based on Nvidia Research atomic's tricks from cusparse */ -static __inline__ __device__ 
float atomicFPLog(float *addr, float val) -{ - float old, assumed; - old=*addr; - do{ - assumed = old; - old = int_as_float(atomicCAS((int *)addr, float_as_int(assumed),float_as_int(-logf(expf(-val)+expf(-assumed))))); - } while (float_as_int(assumed) != float_as_int(old)); - - return old; -} - -} //end anmespace nvgraph - diff --git a/cpp/src/nvgraph/include/debug_macros.h b/cpp/src/nvgraph/include/debug_macros.h deleted file mode 100644 index 5ee114c0084..00000000000 --- a/cpp/src/nvgraph/include/debug_macros.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "nvgraph_error.hxx" - -#define CHECK_STATUS(...) \ - do { \ - if (__VA_ARGS__) { FatalError(#__VA_ARGS__, NVGRAPH_ERR_UNKNOWN); } \ - } while (0) - -#define CHECK_NVGRAPH(...) \ - do { \ - NVGRAPH_ERROR e = __VA_ARGS__; \ - if (e != NVGRAPH_OK) { FatalError(#__VA_ARGS__, e) } \ - } while (0) - -#ifdef DEBUG -#define COUT() (std::cout) -#define CERR() (std::cerr) -#define WARNING(message) \ - do { \ - std::stringstream ss; \ - ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ - CERR() << ss.str() << std::endl; \ - } while (0) -#else // DEBUG -#define WARNING(message) -#endif diff --git a/cpp/src/nvgraph/include/graph_utils.cuh b/cpp/src/nvgraph/include/graph_utils.cuh deleted file mode 100644 index 106cd875ed1..00000000000 --- a/cpp/src/nvgraph/include/graph_utils.cuh +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -// Helper functions based on Thrust - -#pragma once - -#include -#include -//#include -//#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#define USE_CG 1 -#define DEBUG 1 - -namespace nvlouvain { - -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block -#define DEFAULT_MASK 0xffffffff -#define US - -//#define DEBUG 1 - -// error check -#undef cudaCheckError -#ifdef DEBUG -#define WHERE " at: " << __FILE__ << ':' << __LINE__ -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - std::cerr << "Cuda failure: " << cudaGetErrorString(e) << WHERE << std::endl; \ - } \ - } -#else -#define cudaCheckError() -#define WHERE "" -#endif - -// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. 
-#undef rmmCheckError -#ifdef DEBUG -#define WHERE " at: " << __FILE__ << ':' << __LINE__ -#define rmmCheckError(e) \ - { \ - if (e != RMM_SUCCESS) { std::cerr << "RMM failure: " << WHERE << std::endl; } \ - } -#else -#define rmmCheckError(e) -#define WHERE "" -#endif - -template -static __device__ __forceinline__ T -shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_up_sync(mask, r, offset, bound); -#else - return __shfl_up(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -template -static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0.0f; -#endif -} - -template -__inline__ __device__ T parallel_prefix_sum(int n, int *ind, T *w) -{ - int i, j, mn; - T v, last; - T sum = 0.0; - bool valid; - - // Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); // n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - // All threads (especially the last one) must always participate - // in the shfl instruction, otherwise their sum will be undefined. - // So, the loop stopping condition is based on multiple of n in loop increments, - // so that all threads enter into the loop and inside we make sure we do not - // read out of bounds memory checking for the actual size n. - - // check if the thread is valid - valid = i < n; - - // Notice that the last thread is used to propagate the prefix sum. - // For all the threads, in the first iteration the last is 0, in the following - // iterations it is the value at the last thread of the previous iterations. - - // get the value of the last thread - last = shfl(sum, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? 
w[ind[i]] : 0.0; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = shfl_up(sum, j, blockDim.x); - if (threadIdx.x >= j) sum += v; - } - // shift by last - sum += last; - // notice that no __threadfence or __syncthreads are needed in this implementation - } - // get the value of the last thread (to all threads) - last = shfl(sum, blockDim.x - 1, blockDim.x); - - return last; -} - -// dot -template -T dot(size_t n, T *x, T *y) -{ - T result = thrust::inner_product(thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - 0.0f); - cudaCheckError(); - return result; -} - -// axpy -template -struct axpy_functor : public thrust::binary_function { - const T a; - axpy_functor(T _a) : a(_a) {} - __host__ __device__ T operator()(const T &x, const T &y) const { return a * x + y; } -}; - -template -void axpy(size_t n, T a, T *x, T *y) -{ - thrust::transform(thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - cudaCheckError(); -} - -// norm -template -struct square { - __host__ __device__ T operator()(const T &x) const { return x * x; } -}; - -template -T nrm2(size_t n, T *x) -{ - T init = 0; - T result = std::sqrt(thrust::transform_reduce(thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - cudaCheckError(); - return result; -} - -template -T nrm1(size_t n, T *x) -{ - T result = thrust::reduce(thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n)); - cudaCheckError(); - return result; -} - -template -void scal(size_t n, T val, T *x) -{ - thrust::transform(thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - cudaCheckError(); -} - -template -void fill(size_t n, T *x, T value) -{ - thrust::fill(thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), value); - cudaCheckError(); -} - -template -void printv(size_t n, T *vec, int offset) -{ - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy(dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); - cudaCheckError(); - std::cout << std::endl; -} - -template -void copy(size_t n, T *x, T *res) -{ - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - thrust::copy_n(dev_ptr, n, res_ptr); - cudaCheckError(); -} - -template -struct is_zero { - __host__ __device__ bool operator()(const T x) { return x == 0; } -}; - -template -struct dangling_functor : public thrust::unary_function { - const T val; - dangling_functor(T _val) : val(_val) {} - __host__ __device__ T operator()(const T &x) const { return val + x; } -}; - -template -void update_dangling_nodes(size_t n, T *dangling_nodes, T damping_factor) -{ - thrust::transform_if(thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - cudaCheckError(); -} - -// google matrix kernels -template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_coo(const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - 
-    atomicAdd(&degree[ind[i]], 1.0);
-}
-template <typename IndexType, typename ValueType>
-__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) equi_prob(
-  const IndexType n, const IndexType e, const IndexType *ind, ValueType *val, IndexType *degree)
-{
-  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
-    val[i] = 1.0 / degree[ind[i]];
-}
-
-template <typename IndexType, typename ValueType>
-__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-  flag_leafs(const IndexType n, IndexType *degree, ValueType *bookmark)
-{
-  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
-    if (degree[i] == 0) bookmark[i] = 1.0;
-}
-// notice that in the transposed matrix/csc a dangling node is a node without incoming edges
-template <typename IndexType, typename ValueType>
-void google_matrix(const IndexType n,
-                   const IndexType e,
-                   const IndexType *cooColInd,
-                   ValueType *cooVal,
-                   ValueType *bookmark)
-{
-  rmm::device_vector<IndexType> degree(n, 0);
-  dim3 nthreads, nblocks;
-  nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS);
-  nthreads.y = 1;
-  nthreads.z = 1;
-  nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
-  nblocks.y = 1;
-  nblocks.z = 1;
-  degree_coo<IndexType, ValueType>
-    <<<nblocks, nthreads>>>(n, e, cooColInd, thrust::raw_pointer_cast(degree.data()));
-  equi_prob<IndexType, ValueType>
-    <<<nblocks, nthreads>>>(n, e, cooColInd, cooVal, thrust::raw_pointer_cast(degree.data()));
-  ValueType val = 0.0;
-  fill(n, bookmark, val);
-  nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS);
-  nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
-  flag_leafs<IndexType, ValueType>
-    <<<nblocks, nthreads>>>(n, thrust::raw_pointer_cast(degree.data()), bookmark);
-  // printv(n, thrust::raw_pointer_cast(degree.data()) , 0);
-  // printv(n, bookmark , 0);
-  // printv(e, cooVal , 0);
-}
-
-template <typename IndexType>
-__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
-  update_clustering_kernel(const IndexType n, IndexType *clustering, IndexType *aggregates_d)
-{
-  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
-    clustering[i] = aggregates_d[clustering[i]];
-}
-
-template <typename IndexType>
-void update_clustering(const IndexType n, IndexType *clustering, IndexType *aggregates_d)
-{
-  int nthreads = min(n, CUDA_MAX_KERNEL_THREADS);
-  int nblocks  = min((n + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS);
-  update_clustering_kernel<<<nblocks, nthreads>>>(n, clustering, aggregates_d);
-}
-
-} // namespace nvlouvain
diff --git a/cpp/src/nvgraph/include/kmeans.hxx b/cpp/src/nvgraph/include/kmeans.hxx
deleted file mode 100644
index 386b084706a..00000000000
--- a/cpp/src/nvgraph/include/kmeans.hxx
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "nvgraph_error.hxx"
-
-namespace nvgraph {
-
- /// Find clusters with k-means algorithm
- /** Initial centroids are chosen with k-means++ algorithm. Empty
-  * clusters are reinitialized by choosing new centroids with
-  * k-means++ algorithm.
-  *
-  * CNMEM must be initialized before calling this function.
-  *
-  * @param cublasHandle_t cuBLAS handle.
-  * @param n Number of observation vectors.
-  * @param d Dimension of observation vectors.
-  * @param k Number of clusters.
-  * @param tol Tolerance for convergence. k-means stops when the
-  * change in residual divided by n is less than tol.
-  * @param maxiter Maximum number of k-means iterations.
-  * @param obs (Input, device memory, d*n entries) Observation
-  * matrix. Matrix is stored column-major and each column is an
-  * observation vector. Matrix dimensions are d x n.
-  * @param codes (Output, device memory, n entries) Cluster
-  * assignments.
-  * @param residual On exit, residual sum of squares (sum of squares
-  * of distances between observation vectors and centroids).
-  * @param iters On exit, number of k-means iterations.
-  * @return NVGRAPH error flag.
-  */
- template <typename IndexType_, typename ValueType_>
- NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k,
-                      ValueType_ tol, IndexType_ maxiter,
-                      const ValueType_ * __restrict__ obs,
-                      IndexType_ * __restrict__ codes,
-                      ValueType_ & residual,
-                      IndexType_ & iters);
-
- /// Find clusters with k-means algorithm
- /** Initial centroids are chosen with k-means++ algorithm. Empty
-  * clusters are reinitialized by choosing new centroids with
-  * k-means++ algorithm.
-  *
-  * @param n Number of observation vectors.
-  * @param d Dimension of observation vectors.
-  * @param k Number of clusters.
-  * @param tol Tolerance for convergence. k-means stops when the
-  * change in residual divided by n is less than tol.
-  * @param maxiter Maximum number of k-means iterations.
-  * @param obs (Input, device memory, d*n entries) Observation
-  * matrix. Matrix is stored column-major and each column is an
-  * observation vector. Matrix dimensions are d x n.
-  * @param codes (Output, device memory, n entries) Cluster
-  * assignments.
-  * @param clusterSizes (Output, device memory, k entries) Number of
-  * points in each cluster.
-  * @param centroids (Output, device memory, d*k entries) Centroid
-  * matrix. Matrix is stored column-major and each column is a
-  * centroid. Matrix dimensions are d x k.
-  * @param work (Output, device memory, n*max(k,d) entries)
-  * Workspace.
-  * @param work_int (Output, device memory, 2*d*n entries)
-  * Workspace.
-  * @param residual_host (Output, host memory, 1 entry) Residual sum
-  * of squares (sum of squares of distances between observation
-  * vectors and centroids).
-  * @param iters_host (Output, host memory, 1 entry) Number of
-  * k-means iterations.
-  * @return NVGRAPH error flag.
-  */
- template <typename IndexType_, typename ValueType_>
- NVGRAPH_ERROR kmeans(IndexType_ n, IndexType_ d, IndexType_ k,
-                      ValueType_ tol, IndexType_ maxiter,
-                      const ValueType_ * __restrict__ obs,
-                      IndexType_ * __restrict__ codes,
-                      IndexType_ * __restrict__ clusterSizes,
-                      ValueType_ * __restrict__ centroids,
-                      ValueType_ * __restrict__ work,
-                      IndexType_ * __restrict__ work_int,
-                      ValueType_ * residual_host,
-                      IndexType_ * iters_host);
-
-}
-
diff --git a/cpp/src/nvgraph/include/lanczos.hxx b/cpp/src/nvgraph/include/lanczos.hxx
deleted file mode 100644
index 58be76a0a45..00000000000
--- a/cpp/src/nvgraph/include/lanczos.hxx
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include "nvgraph_error.hxx"
-#include "spectral_matrix.hxx"
-
-namespace nvgraph {
-
- /// Compute smallest eigenvectors of symmetric matrix
- /** Computes eigenvalues and eigenvectors that are least
-  * positive. If matrix is positive definite or positive
-  * semidefinite, the computed eigenvalues are smallest in
-  * magnitude.
-  *
-  * The largest eigenvalue is estimated by performing several
-  * Lanczos iterations. An implicitly restarted Lanczos method is
-  * then applied to A+s*I, where s is the negative of the largest
-  * eigenvalue.
-  *
-  * CNMEM must be initialized before calling this function.
-  *
-  * @param A Pointer to matrix object.
-  * @param nEigVecs Number of eigenvectors to compute.
-  * @param maxIter Maximum number of Lanczos steps. Does not include
-  * Lanczos steps used to estimate largest eigenvalue.
-  * @param restartIter Maximum size of Lanczos system before
-  * performing an implicit restart. Should be at least 4.
-  * @param tol Convergence tolerance. Lanczos iteration will
-  * terminate when the residual norm is less than tol*theta, where
-  * theta is an estimate for the smallest unwanted eigenvalue
-  * (i.e. the (nEigVecs+1)th smallest eigenvalue).
-  * @param reorthogonalize Whether to reorthogonalize Lanczos
-  * vectors.
-  * @param iter On exit, pointer to total number of Lanczos
-  * iterations performed. Does not include Lanczos steps used to
-  * estimate largest eigenvalue.
-  * @param eigVals_dev (Output, device memory, nEigVecs entries)
-  * Smallest eigenvalues of matrix.
-  * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
-  * Eigenvectors corresponding to smallest eigenvalues of
-  * matrix. Vectors are stored as columns of a column-major matrix
-  * with dimensions n x nEigVecs.
-  * @return NVGRAPH error flag.
-  */
- template <typename IndexType_, typename ValueType_>
- NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> & A,
-                                           IndexType_ nEigVecs,
-                                           IndexType_ maxIter,
-                                           IndexType_ restartIter,
-                                           ValueType_ tol,
-                                           bool reorthogonalize,
-                                           IndexType_ & iter,
-                                           ValueType_ * __restrict__ eigVals_dev,
-                                           ValueType_ * __restrict__ eigVecs_dev);
-
- /// Compute largest eigenvectors of symmetric matrix
- /** Computes eigenvalues and eigenvectors that are largest
-  * positive. If matrix is positive definite or positive
-  * semidefinite, the computed eigenvalues are largest in
-  * magnitude.
-  *
-  * The largest eigenvalue is estimated by performing several
-  * Lanczos iterations. An implicitly restarted Lanczos method is
-  * then applied to A+s*I, where s is the negative of the largest
-  * eigenvalue.
-  *
-  * CNMEM must be initialized before calling this function.
-  *
-  * @param A Matrix.
-  * @param nEigVecs Number of eigenvectors to compute.
-  * @param maxIter Maximum number of Lanczos steps. Does not include
-  * Lanczos steps used to estimate largest eigenvalue.
-  * @param restartIter Maximum size of Lanczos system before
-  * performing an implicit restart. Should be at least 4.
-  * @param tol Convergence tolerance. Lanczos iteration will
-  * terminate when the residual norm is less than tol*theta, where
-  * theta is an estimate for the largest unwanted eigenvalue
-  * (i.e. the (nEigVecs+1)th largest eigenvalue).
-  * @param reorthogonalize Whether to reorthogonalize Lanczos
-  * vectors.
-  * @param iter On exit, pointer to total number of Lanczos
-  * iterations performed. Does not include Lanczos steps used to
-  * estimate largest eigenvalue.
-  * @param eigVals_dev (Output, device memory, nEigVecs entries)
-  * Largest eigenvalues of matrix.
-  * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
-  * Eigenvectors corresponding to largest eigenvalues of
-  * matrix. Vectors are stored as columns of a column-major matrix
-  * with dimensions n x nEigVecs.
-  * @return NVGRAPH error flag.
-  */
- template <typename IndexType_, typename ValueType_>
- NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> & A,
-                                          IndexType_ nEigVecs,
-                                          IndexType_ maxIter,
-                                          IndexType_ restartIter,
-                                          ValueType_ tol,
-                                          bool reorthogonalize,
-                                          IndexType_ & iter,
-                                          ValueType_ * __restrict__ eigVals_dev,
-                                          ValueType_ * __restrict__ eigVecs_dev);
-
-}
-
diff --git a/cpp/src/nvgraph/include/modularity_maximization.hxx b/cpp/src/nvgraph/include/modularity_maximization.hxx
deleted file mode 100644
index 34720f88341..00000000000
--- a/cpp/src/nvgraph/include/modularity_maximization.hxx
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <graph.hpp>
-
-#include "nvgraph_error.hxx"
-#include "spectral_matrix.hxx"
-
-
-namespace nvgraph {
- /** Compute partition for a weighted undirected graph. This
-  * partition attempts to minimize the cost function:
-  * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition)
-  *
-  * @param G Weighted graph in CSR format
-  * @param nClusters Number of partitions.
-  * @param nEigVecs Number of eigenvectors to compute.
-  * @param maxIter_lanczos Maximum number of Lanczos iterations.
-  * @param restartIter_lanczos Maximum size of Lanczos system before
-  * implicit restart.
-  * @param tol_lanczos Convergence tolerance for Lanczos method.
-  * @param maxIter_kmeans Maximum number of k-means iterations.
-  * @param tol_kmeans Convergence tolerance for k-means algorithm.
-  * @param parts (Output, device memory, n entries) Cluster
-  * assignments.
-  * @param iters_lanczos On exit, number of Lanczos iterations
-  * performed.
-  * @param iters_kmeans On exit, number of k-means iterations
-  * performed.
-  * @return NVGRAPH error flag.
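  * @note Illustrative call sketch (editorial, not from the original header);
  * graph, clusters, eigVals and eigVecs are assumed to be valid device-side
  * buffers sized as documented above:
  * @code
  *   int iters_lanczos, iters_kmeans;
  *   NVGRAPH_ERROR e = nvgraph::modularity_maximization(
  *     graph, nClusters, nEigVecs, maxIter_lanczos, restartIter_lanczos,
  *     tol_lanczos, maxIter_kmeans, tol_kmeans, clusters,
  *     eigVals, eigVecs, iters_lanczos, iters_kmeans);
  * @endcode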
-  */
- template <typename vertex_t, typename edge_t, typename weight_t>
- NVGRAPH_ERROR modularity_maximization(
-   cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-   vertex_t nClusters,
-   vertex_t nEigVecs,
-   int maxIter_lanczos,
-   int restartIter_lanczos,
-   weight_t tol_lanczos,
-   int maxIter_kmeans,
-   weight_t tol_kmeans,
-   vertex_t * __restrict__ clusters,
-   weight_t *eigVals,
-   weight_t *eigVecs,
-   int & iters_lanczos,
-   int & iters_kmeans);
-
-
- /// Compute modularity
- /** This function determines the modularity based on a graph and cluster assignments
-  * @param G Weighted graph in CSR format
-  * @param nClusters Number of clusters.
-  * @param parts (Input, device memory, n entries) Cluster assignments.
-  * @param modularity On exit, modularity
-  */
- template <typename vertex_t, typename edge_t, typename weight_t>
- NVGRAPH_ERROR analyzeModularity(
-   cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-   vertex_t nClusters,
-   const vertex_t * __restrict__ parts,
-   weight_t & modularity);
-
-}
-
diff --git a/cpp/src/nvgraph/include/nvgraph_cublas.hxx b/cpp/src/nvgraph/include/nvgraph_cublas.hxx
deleted file mode 100644
index bddbbf18ae1..00000000000
--- a/cpp/src/nvgraph/include/nvgraph_cublas.hxx
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-#include "debug_macros.h"
-
-namespace nvgraph
-{
-class Cublas;
-
-class Cublas
-{
-private:
-  static cublasHandle_t m_handle;
-  // Private ctor to prevent instantiation.
-  Cublas();
-  ~Cublas();
-public:
-
-  // Get the handle.
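  // (Editorial note) The accessor below lazily creates one process-wide cuBLAS
  // handle on first use and caches it in the static member m_handle, so every
  // wrapper in this class shares a single handle instead of paying
  // cublasCreate/cublasDestroy on each call; destroy_handle() is the matching
  // explicit teardown. Usage sketch (illustrative):
  //   cublasHandle_t h = Cublas::get_handle();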
- static cublasHandle_t get_handle() - { - if (m_handle == 0) - CHECK_CUBLAS(cublasCreate(&m_handle)); - return m_handle; - } - - static void destroy_handle() - { - if (m_handle != 0) - CHECK_CUBLAS(cublasDestroy(m_handle)); - m_handle = 0; - } - - static void set_pointer_mode_device(); - static void set_pointer_mode_host(); - static void setStream(cudaStream_t stream) - { - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublasSetStream(handle, stream)); - } - - template - static void axpy(int n, T alpha, - const T* x, int incx, - T* y, int incy); - - template - static void copy(int n, const T* x, int incx, - T* y, int incy); - - template - static void dot(int n, const T* x, int incx, - const T* y, int incy, - T* result); - - template - static void gemv(bool transposed, int m, int n, - const T* alpha, const T* A, int lda, - const T* x, int incx, - const T* beta, T* y, int incy); - - template - static void gemv_ext(bool transposed, const int m, const int n, - const T* alpha, const T* A, const int lda, - const T* x, const int incx, - const T* beta, T* y, const int incy, const int offsetx, const int offsety, const int offseta); - - template - static void trsv_v2( cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, - const T *A, int lda, T *x, int incx, int offseta); - - template - static void ger(int m, int n, const T* alpha, - const T* x, int incx, - const T* y, int incy, - T* A, int lda); - - template - static T nrm2(int n, const T* x, int incx); - template - static void nrm2(int n, const T* x, int incx, T* result); - - template - static void scal(int n, T alpha, T* x, int incx); - template - static void scal(int n, T* alpha, T* x, int incx); - - template - static void gemm(bool transa, bool transb, int m, int n, int k, - const T * alpha, const T * A, int lda, - const T * B, int ldb, - const T * beta, T * C, int ldc); - - template - static void geam(bool transa, bool transb, int m, int n, - const T * alpha, const T * A, int lda, - const T * beta, const T * B, int ldb, - T * C, int ldc); - -}; - -} // end namespace nvgraph - diff --git a/cpp/src/nvgraph/include/nvgraph_cusparse.hxx b/cpp/src/nvgraph/include/nvgraph_cusparse.hxx deleted file mode 100644 index a1c86bd1bc8..00000000000 --- a/cpp/src/nvgraph/include/nvgraph_cusparse.hxx +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include "nvgraph_vector.hxx" - -#include -#include "debug_macros.h" - -namespace nvgraph -{ -class Cusparse -{ -private: - // global CUSPARSE handle for nvgraph - static cusparseHandle_t m_handle; // Constructor. - Cusparse(); - // Destructor. - ~Cusparse(); - -public: - - // Get the handle. 
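 // (Editorial note) Same lazy-singleton pattern as the Cublas class above. The
 // unsynchronized check-then-create below is not thread-safe; a C++11-style
 // alternative (illustrative only, not the original design) would rely on the
 // once-only initialization of a function-local static:
 //   static cusparseHandle_t &handle() {
 //     static cusparseHandle_t h = [] {
 //       cusparseHandle_t t; CHECK_CUSPARSE(cusparseCreate(&t)); return t;
 //     }();
 //     return h;
 //   }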
- static cusparseHandle_t get_handle() - { - if (m_handle == 0) - CHECK_CUSPARSE(cusparseCreate(&m_handle)); - return m_handle; - } - // Destroy handle - static void destroy_handle() - { - if (m_handle != 0) - CHECK_CUSPARSE( cusparseDestroy(m_handle) ); - m_handle = 0; - } - static void setStream(cudaStream_t stream) - { - cusparseHandle_t handle = Cusparse::get_handle(); - CHECK_CUSPARSE(cusparseSetStream(handle, stream)); - } - // Set pointer mode - static void set_pointer_mode_device(); - static void set_pointer_mode_host(); - - // operate on all rows and columns y= alpha*A.x + beta*y - template - static void csrmv( const bool transposed, - const bool sym, - const int m, const int n, const int nnz, - const ValueType_* alpha, - const ValueType_* csrVal, - const IndexType_ *csrRowPtr, - const IndexType_ *csrColInd, - const ValueType_* x, - const ValueType_* beta, - ValueType_* y); - - // future possible features - /* - template - static void csrmv_with_mask( const typename TConfig::MatPrec alphaConst, - Matrix &A, - Vector &x, - const typename TConfig::MatPrec betaConst, - Vector &y ); - - template - static void csrmv_with_mask_restriction( const typename TConfig::MatPrec alphaConst, - Matrix &A, - Vector &x, - const typename TConfig::MatPrec betaConst, - Vector &y, - Matrix &P); - - // E is a vector that represents a diagonal matrix - // operate on all rows and columns - // y= alpha*E.x + beta*y - template - static void csrmv( const typename TConfig::MatPrec alphaConst, - Matrix &A, - const typename Matrix::MVector &E, - Vector &x, - const typename TConfig::MatPrec betaConst, - Vector &y, - ViewType view = OWNED ); - - // operate only on columns specified by columnColorSelector, see enum ColumnColorSelector above - // operate only on rows of specified color, given by A.offsets_rows_per_color, A.sorted_rows_by_color - // y= alpha*A.x + beta*y - template - static void csrmv( ColumnColorSelector columnColorSelector, - const int color, - const typename TConfig::MatPrec alphaConst, - Matrix &A, - Vector &x, - const typename TConfig::MatPrec betaConst, - Vector &y, - ViewType view = OWNED ); - - // E is a vector that represents a diagonal matrix - // operate only on rows of specified color, given by A.offsets_rows_per_color, A.sorted_rows_by_color - // y= alpha*E.x + beta*y - template - static void csrmv( const int color, - typename TConfig::MatPrec alphaConst, - Matrix &A, - const typename Matrix::MVector &E, - Vector &x, - typename TConfig::MatPrec betaConst, - Vector &y, - ViewType view=OWNED ); - - template - static void csrmm(typename TConfig::MatPrec alpha, - Matrix &A, - Vector &V, - typename TConfig::VecPrec beta, - Vector &Res); - -*/ - - template - static void csrmm(const bool transposed, - const bool sym, - const int m, - const int n, - const int k, - const int nnz, - const ValueType_* alpha, - const ValueType_* csrVal, - const IndexType_* csrRowPtr, - const IndexType_* csrColInd, - const ValueType_* x, - const int ldx, - const ValueType_* beta, - ValueType_* y, - const int ldy); - - //template - static void csr2coo( const int n, - const int nnz, - const int *csrRowPtr, - int *cooRowInd); -}; - -} // end namespace nvgraph - diff --git a/cpp/src/nvgraph/include/nvgraph_error.hxx b/cpp/src/nvgraph/include/nvgraph_error.hxx deleted file mode 100644 index cf7dff5b009..00000000000 --- a/cpp/src/nvgraph/include/nvgraph_error.hxx +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "stacktrace.h" - -namespace nvgraph { - -#if defined(DEBUG) || defined(VERBOSE_DIAG) -#define STACKTRACE "\nStack trace:\n" + std::string(e.trace()) -#define WHERE " at: " << __FILE__ << ':' << __LINE__ -#else -#define STACKTRACE "" -#define WHERE "" -#endif - - -enum NVGRAPH_ERROR { -/********************************************************* - * Flags for status reporting - *********************************************************/ - NVGRAPH_OK=0, - NVGRAPH_ERR_BAD_PARAMETERS=1, - NVGRAPH_ERR_UNKNOWN=2, - NVGRAPH_ERR_CUDA_FAILURE=3, - NVGRAPH_ERR_THRUST_FAILURE=4, - NVGRAPH_ERR_IO=5, - NVGRAPH_ERR_NOT_IMPLEMENTED=6, - NVGRAPH_ERR_NO_MEMORY=7, - NVGRAPH_ERR_NOT_CONVERGED=8 -}; - -// define our own bad_alloc so we can set its .what() -class nvgraph_exception: public std::exception -{ - public: - inline nvgraph_exception(const std::string &w, const std::string &where, const std::string &trace, NVGRAPH_ERROR reason) : m_trace(trace), m_what(w), m_reason(reason), m_where(where) - { - } - - inline virtual ~nvgraph_exception(void) throw () {}; - - inline virtual const char *what(void) const throw() - { - return m_what.c_str(); - } - inline virtual const char *where(void) const throw() - { - return m_where.c_str(); - } - inline virtual const char *trace(void) const throw() - { - return m_trace.c_str(); - } - inline virtual NVGRAPH_ERROR reason(void) const throw() - { - return m_reason; - } - - - private: - std::string m_trace; - std::string m_what; - NVGRAPH_ERROR m_reason; - std::string m_where; -}; // end bad_alloc - - -int NVGRAPH_GetErrorString( NVGRAPH_ERROR error, char* buffer, int buf_len); - -/******************************************************** - * Prints the error message, the stack trace, and exits - * ******************************************************/ -#define FatalError(s, reason) { \ - std::stringstream _where; \ - _where << WHERE ; \ - std::stringstream _trace; \ - printStackTrace(_trace); \ - throw nvgraph_exception(std::string(s) + "\n", _where.str(), _trace.str(), reason); \ -} - -#undef cudaCheckError -#if defined(DEBUG) || defined(VERBOSE_DIAG) -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - std::stringstream _error; \ - _error << "Cuda failure: '" << cudaGetErrorString(e) << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ -} -#else // NO DEBUG -#define cudaCheckError() \ - { \ - cudaError_t __e = cudaGetLastError(); \ - if (__e != cudaSuccess) { \ - FatalError("", NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } -#endif - -#define CHECK_CUDA(call) \ - { \ - cudaError_t _e = (call); \ - if (_e != cudaSuccess) \ - { \ - std::stringstream _error; \ - _error << "CUDA Runtime failure: '#" << _e << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } - -#define CHECK_CURAND(call) \ - { \ - curandStatus_t _e = (call); \ - if (_e != CURAND_STATUS_SUCCESS) \ - { \ - 
std::stringstream _error; \ - _error << "CURAND failure: '#" << _e << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } - -#define CHECK_CUBLAS(call) \ - { \ - cublasStatus_t _e = (call); \ - if (_e != CUBLAS_STATUS_SUCCESS) \ - { \ - std::stringstream _error; \ - _error << "CUBLAS failure: '#" << _e << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } - -#define CHECK_CUSPARSE(call) \ - { \ - cusparseStatus_t _e = (call); \ - if (_e != CUSPARSE_STATUS_SUCCESS) \ - { \ - std::stringstream _error; \ - _error << "CURAND failure: '#" << _e << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } - -#define CHECK_CUSOLVER(call) \ - { \ - cusolverStatus_t _e = (call); \ - if (_e != CUSOLVER_STATUS_SUCCESS) \ - { \ - std::stringstream _error; \ - _error << "CURAND failure: '#" << _e << "'"; \ - FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ - } \ - } -} // namespace nvgraph - diff --git a/cpp/src/nvgraph/include/nvgraph_lapack.hxx b/cpp/src/nvgraph/include/nvgraph_lapack.hxx deleted file mode 100644 index a667a3717a2..00000000000 --- a/cpp/src/nvgraph/include/nvgraph_lapack.hxx +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "nvgraph_error.hxx" -namespace nvgraph -{ -template class Lapack; - -template -class Lapack -{ -private: - Lapack(); - ~Lapack(); -public: - static void check_lapack_enabled(); - - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, const T * A, int lda, const T * B, int ldb, T beta, T * C, int ldc); - - // special QR for lanczos - static void sterf(int n, T * d, T * e); - static void steqr(char compz, int n, T * d, T * e, T * z, int ldz, T * work); - - // QR - // computes the QR factorization of a general matrix - static void geqrf (int m, int n, T *a, int lda, T *tau, T *work, int *lwork); - // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. 
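 // (Editorial note) Typical LAPACK-style QR flow with these wrappers, as an
 // illustrative sketch (workspace-size queries omitted; buffer names
 // hypothetical):
 //   Lapack<T>::geqrf(m, n, A, lda, tau, work, &lwork); // A now holds R in its
 //                                                      // upper triangle and Q
 //                                                      // as Householder reflectors
 //   Lapack<T>::ormqr(false, true, m, n, k, A, lda, tau, C, ldc, work, &lwork);
 //   // applies Q^T to C from the left without ever forming Q explicitly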
- //static void orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork );
- // multiply C by implicit Q
- static void ormqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork);
- //static void unmqr (bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, T *c, int ldc, T *work, int *lwork);
- //static void qrf (int n, T *H, T *Q, T *R);
-
- //static void hseqr (T* Q, T* R, T* eigenvalues, T* eigenvectors, int dim, int ldh, int ldq);
- static void geev(T* A, T* eigenvalues, int dim, int lda);
- static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr);
- static void geev(T* A, T* eigenvalues_r, T* eigenvalues_i, T* eigenvectors_r, T* eigenvectors_i, int dim, int lda, int ldvr);
-
-};
-} // end namespace nvgraph
-
diff --git a/cpp/src/nvgraph/include/nvgraph_vector.hxx b/cpp/src/nvgraph/include/nvgraph_vector.hxx
deleted file mode 100644
index 228c83686dc..00000000000
--- a/cpp/src/nvgraph/include/nvgraph_vector.hxx
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "nvgraph_error.hxx"
-#include "nvgraph_vector_kernels.hxx"
-
-#include <rmm/thrust_rmm_allocator.h>
-
-#include "debug_macros.h"
-
-namespace nvgraph
-{
-
-/*! A Vector contains a device vector of size |E| and type T
- */
-template <typename ValueType_>
-class Vector {
-public:
-  typedef ValueType_ ValueType;
-
-protected:
-  rmm::device_vector<ValueType> values;
-
-public:
-  /*! Construct an empty \p Vector.
-   */
-  Vector(void) {}
-  ~Vector(void) {}
-  /*! Construct a \p Vector of size vertices.
- * - * \param vertices The size of the Vector - */ - Vector(size_t vertices, cudaStream_t stream = 0) - : values(vertices) {} - - size_t get_size() const { return values.size(); } - size_t bytes() const { return values.size()*sizeof(ValueType);} - ValueType const *raw() const { return values.data().get(); } - ValueType *raw() { return values.data().get(); } - - void allocate(size_t n, cudaStream_t stream = 0) - { - values.resize(n); - } - - void fill(ValueType val, cudaStream_t stream = 0) - { - fill_raw_vec(this->raw(), this->get_size(), val, stream); - } - - void copy(Vector &vec1, cudaStream_t stream = 0) - { - if (this->get_size() == 0 && vec1.get_size()>0) { - allocate(vec1.get_size(), stream); - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - } else if (this->get_size() == vec1.get_size()) - copy_vec(vec1.raw(), this->get_size(), this->raw(), stream); - else if (this->get_size() > vec1.get_size()) { - copy_vec(vec1.raw(), vec1.get_size(), this->raw(), stream); - } else { - FatalError("Cannot copy a vector into a smaller one", NVGRAPH_ERR_BAD_PARAMETERS); - } - } - - ValueType nrm1(cudaStream_t stream = 0) { - ValueType res = 0; - nrm1_raw_vec(this->raw(), this->get_size(), &res, stream); - return res; - } -}; // class Vector -} // end namespace nvgraph - diff --git a/cpp/src/nvgraph/include/nvgraph_vector_kernels.hxx b/cpp/src/nvgraph/include/nvgraph_vector_kernels.hxx deleted file mode 100644 index 9a0e640044a..00000000000 --- a/cpp/src/nvgraph/include/nvgraph_vector_kernels.hxx +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -namespace nvgraph -{ - template - void nrm1_raw_vec (ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream = 0); - - template - void fill_raw_vec (ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream = 0); - - template - void dump_raw_vec (ValueType_* vec, size_t n, int offset, cudaStream_t stream = 0); - - template - void dmv (size_t num_vertices, ValueType_ alpha, ValueType_* D, ValueType_* x, ValueType_ beta, ValueType_* y, cudaStream_t stream = 0); - - template - void copy_vec(ValueType_ *vec1, size_t n, ValueType_ *res, cudaStream_t stream = 0); - - template - void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flag, cudaStream_t stream = 0 ); - - template - void set_connectivity( size_t n, IndexType_ root, ValueType_ self_loop_val, ValueType_ unreachable_val, ValueType_* res, cudaStream_t stream = 0); - -} // end namespace nvgraph - diff --git a/cpp/src/nvgraph/include/partition.hxx b/cpp/src/nvgraph/include/partition.hxx deleted file mode 100644 index 10673d1eee3..00000000000 --- a/cpp/src/nvgraph/include/partition.hxx +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include "nvgraph_error.hxx" -#include "spectral_matrix.hxx" - - -namespace nvgraph { - #define SPECTRAL_USE_COLORING true - - #define SPECTRAL_USE_LOBPCG true - #define SPECTRAL_USE_PRECONDITIONING true - #define SPECTRAL_USE_SCALING_OF_EIGVECS false - - #define SPECTRAL_USE_MAGMA false - #define SPECTRAL_USE_THROTTLE true - #define SPECTRAL_USE_NORMALIZED_LAPLACIAN true - #define SPECTRAL_USE_R_ORTHOGONALIZATION false - - /// Spectral graph partition - /** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return NVGRAPH error flag. - */ - template - NVGRAPH_ERROR partition(cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t * __restrict__ parts, - weight_t *eigVals, - weight_t *eig_vects); - - /// Compute cost function for partition - /** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return NVGRAPH error flag. - */ - template - NVGRAPH_ERROR analyzePartition(cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - const vertex_t * __restrict__ parts, - weight_t & edgeCut, weight_t & cost); - -} - diff --git a/cpp/src/nvgraph/include/sm_utils.h b/cpp/src/nvgraph/include/sm_utils.h deleted file mode 100644 index 001bffe136e..00000000000 --- a/cpp/src/nvgraph/include/sm_utils.h +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#ifdef _MSC_VER -#include -#else -#include -#endif - -#define DEFAULT_MASK 0xffffffff - -#define USE_CG 1 -//(__CUDACC_VER__ >= 80500) - -namespace nvgraph { -namespace utils { -static __device__ __forceinline__ int lane_id() -{ - int id; - asm("mov.u32 %0, %%laneid;" : "=r"(id)); - return id; -} - -static __device__ __forceinline__ int lane_mask_lt() -{ - int mask; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); - return mask; -} - -static __device__ __forceinline__ int lane_mask_le() -{ - int mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); - return mask; -} - -static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } - -static __device__ __forceinline__ unsigned int ballot(int p, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __ballot_sync(mask, p); -#else - return __ballot(p); -#endif -#else - return 0; -#endif -} - -static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0; -#endif -} - -static __device__ __forceinline__ float shfl(float r, - int lane, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0.0f; -#endif -} - -/// Warp shuffle down function -/** Warp shuffle functions on 64-bit floating point values are not - * natively implemented as of Compute Capability 5.0. This - * implementation has been copied from - * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). - * Once this is natively implemented, this function can be replaced - * by __shfl_down. 
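 * (Editorial note) The implementation below reinterprets the 64-bit value as
 * an int2, shuffles the two 32-bit halves independently with 32-bit shuffles,
 * and reassembles the result in the receiving lane:
 *   int2 a = *reinterpret_cast<int2 *>(&r);
 *   a.x = __shfl_sync(mask, a.x, lane, bound);
 *   a.y = __shfl_sync(mask, a.y, lane, bound);
 *   return *reinterpret_cast<double *>(&a);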
- *
- */
-static __device__ __forceinline__ double shfl(double r,
                                              int lane,
                                              int bound = 32,
                                              int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_sync(mask, a.x, lane, bound);
-  a.y    = __shfl_sync(mask, a.y, lane, bound);
-  return *reinterpret_cast<double *>(&a);
-#else
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl(a.x, lane, bound);
-  a.y    = __shfl(a.y, lane, bound);
-  return *reinterpret_cast<double *>(&a);
-#endif
-#else
-  return 0.0;
-#endif
-}
-
-static __device__ __forceinline__ long long shfl(long long r,
                                                  int lane,
                                                  int bound = 32,
                                                  int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_sync(mask, a.x, lane, bound);
-  a.y    = __shfl_sync(mask, a.y, lane, bound);
-  return *reinterpret_cast<long long *>(&a);
-#else
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl(a.x, lane, bound);
-  a.y    = __shfl(a.y, lane, bound);
-  return *reinterpret_cast<long long *>(&a);
-#endif
-#else
-  return 0.0;
-#endif
-}
-
-static __device__ __forceinline__ int shfl_down(int r,
                                                int offset,
                                                int bound = 32,
                                                int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  return __shfl_down_sync(mask, r, offset, bound);
-#else
-  return __shfl_down(r, offset, bound);
-#endif
-#else
-  return 0.0f;
-#endif
-}
-
-static __device__ __forceinline__ float shfl_down(float r,
                                                   int offset,
                                                   int bound = 32,
                                                   int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  return __shfl_down_sync(mask, r, offset, bound);
-#else
-  return __shfl_down(r, offset, bound);
-#endif
-#else
-  return 0.0f;
-#endif
-}
-
-static __device__ __forceinline__ double shfl_down(double r,
                                                    int offset,
                                                    int bound = 32,
                                                    int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down_sync(mask, a.x, offset, bound);
-  a.y    = __shfl_down_sync(mask, a.y, offset, bound);
-  return *reinterpret_cast<double *>(&a);
-#else
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down(a.x, offset, bound);
-  a.y    = __shfl_down(a.y, offset, bound);
-  return *reinterpret_cast<double *>(&a);
-#endif
-#else
-  return 0.0;
-#endif
-}
-
-static __device__ __forceinline__ long long shfl_down(long long r,
                                                       int offset,
                                                       int bound = 32,
                                                       int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down_sync(mask, a.x, offset, bound);
-  a.y    = __shfl_down_sync(mask, a.y, offset, bound);
-  return *reinterpret_cast<long long *>(&a);
-#else
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down(a.x, offset, bound);
-  a.y    = __shfl_down(a.y, offset, bound);
-  return *reinterpret_cast<long long *>(&a);
-#endif
-#else
-  return 0.0;
-#endif
-}
-
-// specifically for triangles counting
-static __device__ __forceinline__ uint64_t shfl_down(uint64_t r,
                                                      int offset,
                                                      int bound = 32,
                                                      int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down_sync(mask, a.x, offset, bound);
-  a.y    = __shfl_down_sync(mask, a.y, offset, bound);
-  return *reinterpret_cast<uint64_t *>(&a);
-#else
-  int2 a = *reinterpret_cast<int2 *>(&r);
-  a.x    = __shfl_down(a.x, offset, bound);
-  a.y    = __shfl_down(a.y, offset, bound);
-  return *reinterpret_cast<uint64_t *>(&a);
-#endif
-#else
-  return 0.0;
-#endif
-}
-
-static __device__ __forceinline__ int shfl_up(int r,
                                              int offset,
                                              int bound = 32,
                                              int mask = DEFAULT_MASK)
-{
-#if __CUDA_ARCH__ >= 300
-#ifdef USE_CG
-  return __shfl_up_sync(mask, r, offset, bound);
-#else
-  return __shfl_up(r, offset, bound);
-#endif
-#else
-  return 0.0f;
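// (Editorial note) On architectures without warp shuffle support
// (__CUDA_ARCH__ < 300) all of these helpers compile to a stub that simply
// returns 0 instead of shuffling; callers are expected never to rely on these
// fallback paths on supported GPUs.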
-#endif -} - -static __device__ __forceinline__ float shfl_up(float r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - return __shfl_up_sync(mask, r, offset, bound); -#else - return __shfl_up(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -static __device__ __forceinline__ double shfl_up(double r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up_sync(mask, a.x, offset, bound); - a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up(a.x, offset, bound); - a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ long long shfl_up(long long r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up_sync(mask, a.x, offset, bound); - a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up(a.x, offset, bound); - a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} -} // namespace utils - -} // namespace nvgraph diff --git a/cpp/src/nvgraph/include/spectral_matrix.hxx b/cpp/src/nvgraph/include/spectral_matrix.hxx deleted file mode 100644 index d3f6e0411da..00000000000 --- a/cpp/src/nvgraph/include/spectral_matrix.hxx +++ /dev/null @@ -1,785 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include "nvgraph_vector.hxx" - -namespace nvgraph { - - /// Abstract matrix class - /** Derived classes must implement matrix-vector products. - */ - template - class Matrix { - public: - /// Number of rows - const IndexType_ m; - /// Number of columns - const IndexType_ n; - /// CUDA stream - cudaStream_t s; - - /// Constructor - /** @param _m Number of rows. - * @param _n Number of columns. - */ - Matrix(IndexType_ _m, IndexType_ _n) : m(_m), n(_n), s(0){} - - /// Destructor - virtual ~Matrix() {} - - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s) = 0; - virtual void getCUDAStream(cudaStream_t *_s) = 0; - - /// Matrix-vector product - /** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output - * vector. 
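 * @note (Editorial) The concrete subclasses that follow (DenseMatrix,
 * CsrMatrix, LaplacianMatrix, ModularityMatrix) each implement this product
 * for their own storage, so solver code can be written against the abstract
 * Matrix interface alone. Illustrative sketch (names hypothetical):
 *   void power_step(const Matrix<int, float> &A, const float *x, float *y) {
 *     A.mv(1.0f, x, 0.0f, y);  // y = A*x
 *   }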
- */ - virtual void mv(ValueType_ alpha, - const ValueType_ * __restrict__ x, - ValueType_ beta, - ValueType_ * __restrict__ y) const = 0; - - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const = 0; - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const = 0; - virtual void reorder(IndexType_ *p) const = 0; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M) = 0; - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const = 0; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const = 0; - }; - - /// Dense matrix class - template - class DenseMatrix : public Matrix { - - private: - /// Whether to transpose matrix - const bool trans; - /// Matrix entries, stored column-major in device memory - const ValueType_ * A; - /// Leading dimension of matrix entry array - const IndexType_ lda; - - public: - /// Constructor - DenseMatrix(bool _trans, - IndexType_ _m, IndexType_ _n, - const ValueType_ * _A, IndexType_ _lda); - - /// Destructor - virtual ~DenseMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Sparse matrix class in CSR format - template - class CsrMatrix : public Matrix { - - private: - /// Whether to transpose matrix - const bool trans; - /// Whether matrix is stored in symmetric format - const bool sym; - /// Number of non-zero entries - const IndexType_ nnz; - /// Matrix properties - const cusparseMatDescr_t descrA; - /// Matrix entry values (device memory) - /*const*/ ValueType_ * csrValA; - /// Pointer to first entry in each row (device memory) - const IndexType_ * csrRowPtrA; - /// Column index of each matrix entry (device memory) - const IndexType_ * csrColIndA; - /// Analysis info (pointer to opaque CUSPARSE struct) - cusparseSolveAnalysisInfo_t info_l; - cusparseSolveAnalysisInfo_t info_u; - /// factored flag (originally set to false, then reset to true after factorization), - /// notice we only want to factor once - bool factored; - - public: - /// Constructor - CsrMatrix(bool _trans, bool _sym, - IndexType_ _m, IndexType_ _n, IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ * _csrValA, - const IndexType_ * _csrRowPtrA, - const IndexType_ * _csrColIndA); - - /// Destructor - virtual ~CsrMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set 
of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Incomplete Cholesky (setup, factor and solve) - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Graph Laplacian matrix - template - class LaplacianMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - LaplacianMatrix(/*const*/ Matrix & _A); - - /// Destructor - virtual ~LaplacianMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - - /// Modularity matrix - template - class ModularityMatrix - : public Matrix { - - private: - /// Adjacency matrix - /*const*/ Matrix * A; - /// Degree of each vertex - Vector D; - IndexType_ nnz; - ValueType_ edge_sum; - - /// Preconditioning matrix - Matrix * M; - - public: - /// Constructor - ModularityMatrix(/*const*/ Matrix & _A, IndexType_ _nnz); - - /// Destructor - virtual ~ModularityMatrix(); - - /// Get and Set CUDA stream - virtual void setCUDAStream(cudaStream_t _s); - virtual void getCUDAStream(cudaStream_t *_s); - - /// Matrix-vector product - virtual void mv(ValueType_ alpha, const ValueType_ * __restrict__ x, - ValueType_ beta, ValueType_ * __restrict__ y) const; - /// Matrix-set of k vectors product - virtual void mm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Scale a set of k vectors by a diagonal - virtual void dm(IndexType_ k, ValueType_ alpha, const ValueType_ * __restrict__ x, ValueType_ beta, ValueType_ * __restrict__ y) const; - - /// Color and Reorder - virtual void color(IndexType_ *c, IndexType_ *p) const; - virtual void reorder(IndexType_ *p) const; - - /// Solve preconditioned system M x = f for a set of k vectors - virtual void prec_setup(Matrix * _M); - virtual void prec_solve(IndexType_ k, ValueType_ alpha, ValueType_ * __restrict__ fx, ValueType_ * __restrict__ t) const; - - //Get the sum of all edges - virtual ValueType_ getEdgeSum() const; - }; - -// cublasIxamax -inline -cublasStatus_t 
cublasIxamax(cublasHandle_t handle, int n, - const float *x, int incx, int *result) { - return cublasIsamax(handle, n, x, incx, result); -} -inline -cublasStatus_t cublasIxamax(cublasHandle_t handle, int n, - const double *x, int incx, int *result) { - return cublasIdamax(handle, n, x, incx, result); -} - -// cublasIxamin -inline -cublasStatus_t cublasIxamin(cublasHandle_t handle, int n, - const float *x, int incx, int *result) { - return cublasIsamin(handle, n, x, incx, result); -} -inline -cublasStatus_t cublasIxamin(cublasHandle_t handle, int n, - const double *x, int incx, int *result) { - return cublasIdamin(handle, n, x, incx, result); -} - -// cublasXasum -inline -cublasStatus_t cublasXasum(cublasHandle_t handle, int n, - const float *x, int incx, - float *result) { - return cublasSasum(handle, n, x, incx, result); -} -inline -cublasStatus_t cublasXasum(cublasHandle_t handle, int n, - const double *x, int incx, - double *result) { - return cublasDasum(handle, n, x, incx, result); -} - -// cublasXaxpy -inline -cublasStatus_t cublasXaxpy(cublasHandle_t handle, int n, - const float * alpha, - const float * x, int incx, - float * y, int incy) { - return cublasSaxpy(handle, n, alpha, x, incx, y, incy); -} -inline -cublasStatus_t cublasXaxpy(cublasHandle_t handle, int n, - const double *alpha, - const double *x, int incx, - double *y, int incy) { - return cublasDaxpy(handle, n, alpha, x, incx, y, incy); -} - -// cublasXcopy -inline -cublasStatus_t cublasXcopy(cublasHandle_t handle, int n, - const float *x, int incx, - float *y, int incy) { - return cublasScopy(handle, n, x, incx, y, incy); -} -inline -cublasStatus_t cublasXcopy(cublasHandle_t handle, int n, - const double *x, int incx, - double *y, int incy) { - return cublasDcopy(handle, n, x, incx, y, incy); -} - -// cublasXdot -inline -cublasStatus_t cublasXdot(cublasHandle_t handle, int n, - const float *x, int incx, - const float *y, int incy, - float *result) { - return cublasSdot(handle, n, x, incx, y, incy, result); -} -inline -cublasStatus_t cublasXdot(cublasHandle_t handle, int n, - const double *x, int incx, - const double *y, int incy, - double *result) { - return cublasDdot(handle, n, x, incx, y, incy, result); -} - -// cublasXnrm2 -inline -cublasStatus_t cublasXnrm2(cublasHandle_t handle, int n, - const float *x, int incx, - float *result) { - return cublasSnrm2(handle, n, x, incx, result); -} -inline -cublasStatus_t cublasXnrm2(cublasHandle_t handle, int n, - const double *x, int incx, - double *result) { - return cublasDnrm2(handle, n, x, incx, result); -} - -// cublasXscal -inline -cublasStatus_t cublasXscal(cublasHandle_t handle, int n, - const float *alpha, - float *x, int incx) { - return cublasSscal(handle, n, alpha, x, incx); -} -inline -cublasStatus_t cublasXscal(cublasHandle_t handle, int n, - const double *alpha, - double *x, int incx) { - return cublasDscal(handle, n, alpha, x, incx); -} - -// cublasXgemv -inline -cublasStatus_t cublasXgemv(cublasHandle_t handle, - cublasOperation_t trans, - int m, int n, - const float *alpha, - const float *A, int lda, - const float *x, int incx, - const float *beta, - float *y, int incy) { - return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, - beta, y, incy); -} -inline -cublasStatus_t cublasXgemv(cublasHandle_t handle, - cublasOperation_t trans, - int m, int n, - const double *alpha, - const double *A, int lda, - const double *x, int incx, - const double *beta, - double *y, int incy) { - return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, - beta, y, 
incy); -} - -// cublasXger -inline -cublasStatus_t cublasXger(cublasHandle_t handle, int m, int n, - const float *alpha, - const float *x, int incx, - const float *y, int incy, - float *A, int lda) { - return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); -} -inline -cublasStatus_t cublasXger(cublasHandle_t handle, int m, int n, - const double *alpha, - const double *x, int incx, - const double *y, int incy, - double *A, int lda) { - return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); -} - -// cublasXgemm -inline -cublasStatus_t cublasXgemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, int n, int k, - const float *alpha, - const float *A, int lda, - const float *B, int ldb, - const float *beta, - float *C, int ldc) { - return cublasSgemm(handle, transa, transb, m, n, k, - alpha, A, lda, B, ldb, beta, C, ldc); -} -inline -cublasStatus_t cublasXgemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, int n, int k, - const double *alpha, - const double *A, int lda, - const double *B, int ldb, - const double *beta, - double *C, int ldc) { - return cublasDgemm(handle, transa, transb, m, n, k, - alpha, A, lda, B, ldb, beta, C, ldc); -} - -// cublasXgeam -inline -cublasStatus_t cublasXgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, int n, - const float *alpha, - const float *A, int lda, - const float *beta, - const float *B, int ldb, - float *C, int ldc) { - return cublasSgeam(handle, transa, transb, m, n, - alpha, A, lda, beta, B, ldb, C, ldc); -} -inline -cublasStatus_t cublasXgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, int n, - const double *alpha, - const double *A, int lda, - const double *beta, - const double *B, int ldb, - double *C, int ldc) { - return cublasDgeam(handle, transa, transb, m, n, - alpha, A, lda, beta, B, ldb, C, ldc); -} - -// cublasXtrsm -inline cublasStatus_t cublasXtrsm(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float *alpha, const float *A, int lda, float *B, int ldb) { - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); -} -inline cublasStatus_t cublasXtrsm(cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double *alpha, const double *A, int lda, double *B, int ldb) { - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); -} - -// curandGeneratorNormalX -inline -curandStatus_t -curandGenerateNormalX(curandGenerator_t generator, - float * outputPtr, size_t n, - float mean, float stddev) { - return curandGenerateNormal(generator, outputPtr, n, mean, stddev); -} -inline -curandStatus_t -curandGenerateNormalX(curandGenerator_t generator, - double * outputPtr, size_t n, - double mean, double stddev) { - return curandGenerateNormalDouble(generator, outputPtr, - n, mean, stddev); -} - -// cusolverXpotrf_bufferSize -inline cusolverStatus_t cusolverXpotrf_bufferSize(cusolverDnHandle_t handle, int n, float *A, int lda, int *Lwork){ - return cusolverDnSpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Lwork); -} -inline cusolverStatus_t cusolverXpotrf_bufferSize(cusolverDnHandle_t handle, int n, double *A, int lda, int *Lwork){ - return cusolverDnDpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Lwork); -} - -// cusolverXpotrf -inline 
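// (Editorial note) The potrf wrappers below continue this header's
// overload-on-precision pattern: one C++ name dispatches to the S (float) or
// D (double) CUSOLVER entry point, so templated callers pick the right
// precision at compile time. Illustrative sketch (buffers hypothetical):
//   int lwork;
//   cusolverXpotrf_bufferSize(handle, n, A, lda, &lwork);    // query workspace
//   cusolverXpotrf(handle, n, A, lda, work, lwork, devInfo);  // Cholesky factor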
cusolverStatus_t cusolverXpotrf(cusolverDnHandle_t handle, int n, float *A, int lda, float *Workspace, int Lwork, int *devInfo){ - return cusolverDnSpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Workspace,Lwork,devInfo); -} -inline cusolverStatus_t cusolverXpotrf(cusolverDnHandle_t handle, int n, double *A, int lda, double *Workspace, int Lwork, int *devInfo){ - return cusolverDnDpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,A,lda,Workspace,Lwork,devInfo); -} - -// cusolverXgesvd_bufferSize -inline cusolverStatus_t cusolverXgesvd_bufferSize(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *U, int ldu, float *VT, int ldvt, int *Lwork){ - //ideally - //char jobu = 'O'; - //char jobvt= 'N'; - //only supported - //char jobu = 'A'; - //char jobvt= 'A'; - return cusolverDnSgesvd_bufferSize(handle,m,n,Lwork); -} - -inline cusolverStatus_t cusolverXgesvd_bufferSize(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *U, int ldu, double *VT, int ldvt, int *Lwork){ - //ideally - //char jobu = 'O'; - //char jobvt= 'N'; - //only supported - //char jobu = 'A'; - //char jobvt= 'A'; - return cusolverDnDgesvd_bufferSize(handle,m,n,Lwork); -} - -// cusolverXgesvd -inline cusolverStatus_t cusolverXgesvd(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *Work, int Lwork, float *rwork, int *devInfo){ - //ideally - //char jobu = 'O'; - //char jobvt= 'N'; - //only supported - char jobu = 'A'; - char jobvt= 'A'; - - return cusolverDnSgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); -} - -inline cusolverStatus_t cusolverXgesvd(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *Work, int Lwork, double *rwork, int *devInfo){ - //ideally - //char jobu = 'O'; - //char jobvt= 'N'; - //only supported - char jobu = 'A'; - char jobvt= 'A'; - return cusolverDnDgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); -} - -// cusolverXgesvd_cond -inline cusolverStatus_t cusolverXgesvd_cond(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, float *Work, int Lwork, float *rwork, int *devInfo){ - //ideally - //char jobu = 'N'; - //char jobvt= 'N'; - //only supported - char jobu = 'A'; - char jobvt= 'A'; - return cusolverDnSgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); -} - -inline cusolverStatus_t cusolverXgesvd_cond(cusolverDnHandle_t handle, int m, int n, double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, double *Work, int Lwork, double *rwork, int *devInfo){ - //ideally - //char jobu = 'N'; - //char jobvt= 'N'; - //only supported - char jobu = 'A'; - char jobvt= 'A'; - return cusolverDnDgesvd(handle,jobu,jobvt,m,n,A,lda,S,U,ldu,VT,ldvt,Work,Lwork,rwork,devInfo); -} - -// cusparseXcsrmv -inline -cusparseStatus_t cusparseXcsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, int n, int nnz, - const float * alpha, - const cusparseMatDescr_t descrA, - const float * csrValA, - const int * csrRowPtrA, - const int * csrColIndA, - const float * x, - const float * beta, - float *y) { - return cusparseScsrmv_mp(handle, transA, m, n, nnz, - alpha, descrA, csrValA, csrRowPtrA, csrColIndA, - x, beta, y); -} -inline -cusparseStatus_t cusparseXcsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, int n, int nnz, - const double * alpha, - const cusparseMatDescr_t descrA, - const double * 
csrValA,
-                                const int * csrRowPtrA,
-                                const int * csrColIndA,
-                                const double * x,
-                                const double * beta,
-                                double *y) {
-  return cusparseDcsrmv_mp(handle, transA, m, n, nnz,
-                           alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
-                           x, beta, y);
-}
-
-// cusparseXcsrmm
-inline
-cusparseStatus_t cusparseXcsrmm(cusparseHandle_t handle,
-                                cusparseOperation_t transA,
-                                int m, int n, int k, int nnz,
-                                const float *alpha,
-                                const cusparseMatDescr_t descrA,
-                                const float *csrValA,
-                                const int *csrRowPtrA,
-                                const int *csrColIndA,
-                                const float *B, int ldb,
-                                const float *beta,
-                                float *C, int ldc) {
-  return cusparseScsrmm(handle, transA, m, n, k, nnz,
-                        alpha, descrA, csrValA,
-                        csrRowPtrA, csrColIndA,
-                        B, ldb, beta, C, ldc);
-}
-inline
-cusparseStatus_t cusparseXcsrmm(cusparseHandle_t handle,
-                                cusparseOperation_t transA,
-                                int m, int n, int k, int nnz,
-                                const double *alpha,
-                                const cusparseMatDescr_t descrA,
-                                const double *csrValA,
-                                const int *csrRowPtrA,
-                                const int *csrColIndA,
-                                const double *B, int ldb,
-                                const double *beta,
-                                double *C, int ldc) {
-  return cusparseDcsrmm(handle, transA, m, n, k, nnz,
-                        alpha, descrA, csrValA,
-                        csrRowPtrA, csrColIndA,
-                        B, ldb, beta, C, ldc);
-}
-
-// cusparseXcsrgeam
-inline
-cusparseStatus_t cusparseXcsrgeam(cusparseHandle_t handle,
-                                  int m, int n,
-                                  const float *alpha,
-                                  const cusparseMatDescr_t descrA,
-                                  int nnzA, const float *csrValA,
-                                  const int *csrRowPtrA,
-                                  const int *csrColIndA,
-                                  const float *beta,
-                                  const cusparseMatDescr_t descrB,
-                                  int nnzB, const float *csrValB,
-                                  const int *csrRowPtrB,
-                                  const int *csrColIndB,
-                                  const cusparseMatDescr_t descrC,
-                                  float *csrValC,
-                                  int *csrRowPtrC, int *csrColIndC) {
-  return cusparseScsrgeam(handle,m,n,
-                          alpha,descrA,nnzA,csrValA,csrRowPtrA,csrColIndA,
-                          beta,descrB,nnzB,csrValB,csrRowPtrB,csrColIndB,
-                          descrC,csrValC,csrRowPtrC,csrColIndC);
-}
-inline
-cusparseStatus_t cusparseXcsrgeam(cusparseHandle_t handle,
-                                  int m, int n,
-                                  const double *alpha,
-                                  const cusparseMatDescr_t descrA,
-                                  int nnzA, const double *csrValA,
-                                  const int *csrRowPtrA,
-                                  const int *csrColIndA,
-                                  const double *beta,
-                                  const cusparseMatDescr_t descrB,
-                                  int nnzB, const double *csrValB,
-                                  const int *csrRowPtrB,
-                                  const int *csrColIndB,
-                                  const cusparseMatDescr_t descrC,
-                                  double *csrValC,
-                                  int *csrRowPtrC, int *csrColIndC) {
-  return cusparseDcsrgeam(handle,m,n,
-                          alpha,descrA,nnzA,csrValA,csrRowPtrA,csrColIndA,
-                          beta,descrB,nnzB,csrValB,csrRowPtrB,csrColIndB,
-                          descrC,csrValC,csrRowPtrC,csrColIndC);
-}
-
-//ILU0, incomplete-LU with 0 threshold (CUSPARSE)
-inline cusparseStatus_t cusparseXcsrilu0(cusparseHandle_t handle,
-                                         cusparseOperation_t trans,
-                                         int m,
-                                         const cusparseMatDescr_t descrA,
-                                         float *csrValM,
-                                         const int *csrRowPtrA,
-                                         const int *csrColIndA,
-                                         cusparseSolveAnalysisInfo_t info){
-  return cusparseScsrilu0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info);
-}
-
-inline cusparseStatus_t cusparseXcsrilu0(cusparseHandle_t handle,
-                                         cusparseOperation_t trans,
-                                         int m,
-                                         const cusparseMatDescr_t descrA,
-                                         double *csrValM,
-                                         const int *csrRowPtrA,
-                                         const int *csrColIndA,
-                                         cusparseSolveAnalysisInfo_t info){
-  return cusparseDcsrilu0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info);
-}
-
-//IC0, incomplete-Cholesky with 0 threshold (CUSPARSE)
-inline cusparseStatus_t cusparseXcsric0(cusparseHandle_t handle,
-                                        cusparseOperation_t trans,
-                                        int m,
-                                        const cusparseMatDescr_t descrA,
-                                        float *csrValM,
-                                        const int *csrRowPtrA,
-                                        const int *csrColIndA,
-                                        cusparseSolveAnalysisInfo_t info){
-  return
cusparseScsric0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); -} -inline cusparseStatus_t cusparseXcsric0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - double *csrValM, - const int *csrRowPtrA, - const int *csrColIndA, - cusparseSolveAnalysisInfo_t info){ - return cusparseDcsric0(handle,trans,m,descrA,csrValM,csrRowPtrA,csrColIndA,info); -} - -//sparse triangular solve (CUSPARSE) -//analysis phase -inline cusparseStatus_t cusparseXcsrsm_analysis (cusparseHandle_t handle, cusparseOperation_t transa, int m, int nnz, const cusparseMatDescr_t descra, - const float *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info){ - return cusparseScsrsm_analysis(handle,transa,m,nnz,descra,a,ia,ja,info); -} -inline cusparseStatus_t cusparseXcsrsm_analysis (cusparseHandle_t handle, cusparseOperation_t transa, int m, int nnz, const cusparseMatDescr_t descra, - const double *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info){ - return cusparseDcsrsm_analysis(handle,transa,m,nnz,descra,a,ia,ja,info); -} -//solve phase -inline cusparseStatus_t cusparseXcsrsm_solve (cusparseHandle_t handle, cusparseOperation_t transa, int m, int k, float alpha, const cusparseMatDescr_t descra, - const float *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info, const float *x, int ldx, float *y, int ldy){ - return cusparseScsrsm_solve(handle,transa,m,k,&alpha,descra,a,ia,ja,info,x,ldx,y,ldy); -} -inline cusparseStatus_t cusparseXcsrsm_solve (cusparseHandle_t handle, cusparseOperation_t transa, int m, int k, double alpha, const cusparseMatDescr_t descra, - const double *a, const int *ia, const int *ja, cusparseSolveAnalysisInfo_t info, const double *x, int ldx, double *y, int ldy){ - return cusparseDcsrsm_solve(handle,transa,m,k,&alpha,descra,a,ia,ja,info,x,ldx,y,ldy); -} - - -inline cusparseStatus_t cusparseXcsrcolor(cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, const int *csrColIndA, const float *fractionToColor, int *ncolors, int *coloring, int *reordering,cusparseColorInfo_t info) { - return cusparseScsrcolor(handle,m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,fractionToColor,ncolors,coloring,reordering,info); -} -inline cusparseStatus_t cusparseXcsrcolor(cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, const int *csrColIndA, const double *fractionToColor, int *ncolors, int *coloring, int *reordering,cusparseColorInfo_t info) { - return cusparseDcsrcolor(handle,m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,fractionToColor,ncolors,coloring,reordering,info); -} - - -} - diff --git a/cpp/src/nvgraph/include/stacktrace.h b/cpp/src/nvgraph/include/stacktrace.h deleted file mode 100644 index b00824547e6..00000000000 --- a/cpp/src/nvgraph/include/stacktrace.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
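Everything in the cuda_wrappers header deleted above is one idiom repeated: a precision-neutral name (cublasX*, cusparseX*, cusolverX*, curand*X) defined as one inline overload per element type, so templated solver code can call a single name and let C++ overload resolution pick the S/D library routine. Below is a minimal, self-contained editorial sketch of that idiom; the fake_saxpy/fake_daxpy backends are hypothetical stand-ins, not real cuBLAS entry points.

#include <cstdio>

// Stand-in "library" routines, one per precision, mimicking the
// cublasSaxpy/cublasDaxpy pair that the deleted header wraps.
static int fake_saxpy(int n, float a, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] += a * x[i];
  return 0;  // status code, as the real library would return
}
static int fake_daxpy(int n, double a, const double* x, double* y) {
  for (int i = 0; i < n; ++i) y[i] += a * x[i];
  return 0;
}

// The "X" shim: same name, one inline overload per precision.
inline int Xaxpy(int n, float a, const float* x, float* y) { return fake_saxpy(n, a, x, y); }
inline int Xaxpy(int n, double a, const double* x, double* y) { return fake_daxpy(n, a, x, y); }

// Templated caller never names the precision explicitly.
template <typename T>
int scale_add(int n, T a, const T* x, T* y) { return Xaxpy(n, a, x, y); }

int main() {
  float xf[3] = {1, 2, 3}, yf[3] = {0, 0, 0};
  double xd[3] = {1, 2, 3}, yd[3] = {0, 0, 0};
  scale_add(3, 2.0f, xf, yf);  // resolves to the float overload
  scale_add(3, 2.0, xd, yd);   // resolves to the double overload
  std::printf("%f %f\n", yf[0], yd[2]);  // 2.000000 6.000000
  return 0;
}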
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// adapted from https://idlebox.net/2008/0901-stacktrace-demangled/ and licensed under WTFPL v2.0
-#pragma once
-
-#if defined(_WIN32) || defined(__ANDROID__) || defined(ANDROID) || defined(__QNX__) || \
-  defined(__QNXNTO__)
-#else
-#include
-#include
-#include
-#include
-#include
-#endif
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace nvgraph {
-
-/** Print a demangled stack backtrace of the caller function to FILE* out. */
-static inline void printStackTrace(std::ostream &eout = std::cerr, unsigned int max_frames = 63)
-{
-#if defined(_WIN32) || defined(__ANDROID__) || defined(ANDROID) || defined(__QNX__) || \
-  defined(__QNXNTO__)
-  // TODO add code for windows stack trace and android stack trace
-#else
-  std::stringstream out;
-
-  // storage array for stack trace address data
-  void *addrlist[max_frames + 1];
-
-  // retrieve current stack addresses
-  int addrlen = backtrace(addrlist, sizeof(addrlist) / sizeof(void *));
-  if (addrlen == 0) {
-    out << " <empty, possibly corrupt>\n";
-    return;
-  }
-
-  // resolve addresses into strings containing "filename(function+address)",
-  // this array must be free()-ed
-  std::unique_ptr<char *, decltype(&::free)> symbollist(backtrace_symbols(addrlist, addrlen),
-                                                        &::free);
-  // char** symbollist = backtrace_symbols(addrlist, addrlen);
-
-  // allocate string which will be filled with the demangled function name
-  size_t funcnamesize = 256;
-  std::vector<char> funcname_v(funcnamesize);
-  char *funcname = funcname_v.data();
-
-  // iterate over the returned symbol lines. skip the first, it is the
-  // address of this function.
-  for (int i = 1; i < addrlen; i++) {
-    char *begin_name = 0, *begin_offset = 0, *end_offset = 0;
-
-    // find parentheses and +address offset surrounding the mangled name:
-    // ./module(function+0x15c) [0x8048a6d]
-    for (char *p = symbollist.get()[i]; *p; ++p) {
-      if (*p == '(')
-        begin_name = p;
-      else if (*p == '+')
-        begin_offset = p;
-      else if (*p == ')' && begin_offset) {
-        end_offset = p;
-        break;
-      }
-    }
-
-    if (begin_name && begin_offset && end_offset && begin_name < begin_offset) {
-      *begin_name++ = '\0';
-      *begin_offset++ = '\0';
-      *end_offset = '\0';
-
-      // mangled name is now in [begin_name, begin_offset) and caller
-      // offset in [begin_offset, end_offset). now apply
-      // __cxa_demangle():
-
-      int status;
-      char *ret = abi::__cxa_demangle(begin_name, funcname, &funcnamesize, &status);
-      if (status == 0) {
-        funcname = ret;  // use possibly realloc()-ed string
-        out << " " << symbollist.get()[i] << " : " << funcname << "+" << begin_offset << "\n";
-      } else {
-        // demangling failed. Output function name as a C function with
-        // no arguments.
-        out << " " << symbollist.get()[i] << " : " << begin_name << "()+" << begin_offset << "\n";
-      }
-    } else {
-      // couldn't parse the line? print the whole line.
-      out << " " << symbollist.get()[i] << "\n";
-    }
-  }
-  eout << out.str();
-  // error_output(out.str().c_str(),out.str().size());
-  // free(symbollist);
-  // printf("PID of failing process: %d\n",getpid());
-  // while(1);
-#endif
-}
-
-} // end namespace nvgraph
diff --git a/cpp/src/nvgraph/include/util.cuh b/cpp/src/nvgraph/include/util.cuh
deleted file mode 100644
index ac6b3a898ba..00000000000
--- a/cpp/src/nvgraph/include/util.cuh
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
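A note on the stack-trace helper just removed: its core is abi::__cxa_demangle applied to each frame string returned by backtrace_symbols. The fragment below is an editorial, standalone illustration of only the demangling step (GCC/Itanium ABI; the exact mangled string produced by typeid is implementation-defined), not code from the deleted file.

#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>
#include <typeinfo>
#include <vector>

int main() {
  // With GCC, typeid(...).name() yields an Itanium-ABI mangled name,
  // e.g. "St6vectorIiSaIiEE" for std::vector<int>.
  const char* mangled = typeid(std::vector<int>).name();
  int status = 0;
  // __cxa_demangle allocates the result with malloc() when the output
  // buffer argument is null; the caller must free() it.
  char* demangled = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
  if (status == 0)
    std::printf("%s -> %s\n", mangled, demangled);
  else
    std::printf("demangling failed (status %d), keep raw name: %s\n", status, mangled);
  std::free(demangled);
  return 0;
}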
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace nvlouvain {
-
-#define BLOCK_SIZE_1D 64
-#define BLOCK_SIZE_2D 16
-#define CUDA_MAX_KERNEL_THREADS 256
-#define CUDA_MAX_BLOCKS_1D 65535
-#define CUDA_MAX_BLOCKS_2D 256
-#define LOCAL_MEM_MAX 512
-#define GRID_MAX_SIZE 65535
-#define WARP_SIZE 32
-
-#define CUDA_CALL(call)                                                               \
-  {                                                                                   \
-    cudaError_t cudaStatus = call;                                                    \
-    if (cudaSuccess != cudaStatus)                                                    \
-      fprintf(stderr,                                                                 \
-              "ERROR: CUDA call \"%s\" in line %d of file %s failed with %s (%d).\n", \
-              #call,                                                                  \
-              __LINE__,                                                               \
-              __FILE__,                                                               \
-              cudaGetErrorString(cudaStatus),                                         \
-              cudaStatus);                                                            \
-  }
-
-#define THRUST_SAFE_CALL(call)                                 \
-  {                                                            \
-    try {                                                      \
-      call;                                                    \
-    } catch (std::bad_alloc & e) {                             \
-      fprintf(stderr, "ERROR: THRUST call \"%s\".\n", #call);  \
-      exit(-1);                                                \
-    }                                                          \
-  }
-
-#define COLOR_GRN "\033[0;32m"
-#define COLOR_MGT "\033[0;35m"
-#define COLOR_WHT "\033[0;0m"
-
-inline std::string time_now()
-{
-  struct timespec ts;
-  timespec_get(&ts, TIME_UTC);
-  char buff[100];
-  strftime(buff, sizeof buff, "%T", gmtime(&ts.tv_sec));
-  std::string s = buff;
-  s += "." + std::to_string(ts.tv_nsec).substr(0, 6);
-
-  return s;
-}
-
-typedef enum {
-  NVLOUVAIN_OK = 0,
-  NVLOUVAIN_ERR_BAD_PARAMETERS = 1,
-} NVLOUVAIN_STATUS;
-
-using nvlouvainStatus_t = NVLOUVAIN_STATUS;
-
-const char* nvlouvainStatusGetString(nvlouvainStatus_t status)
-{
-  static std::string s;
-  switch (status) {
-    case 0: s = "NVLOUVAIN_OK"; break;
-    case 1: s = "NVLOUVAIN_ERR_BAD_PARAMETERS"; break;
-    default: break;
-  }
-  return s.c_str();
-}
-
-template <typename VecType>
-void display_vec(VecType vec, std::ostream& ouf = std::cout)
-{
-  auto it = vec.begin();
-  ouf << vec.front();
-  for (it = vec.begin() + 1; it != vec.end(); ++it) { ouf << ", " << (*it); }
-  ouf << "\n";
-}
-
-template <typename VecType>
-void display_intvec_size(VecType vec, unsigned size)
-{
-  printf("%d", (int)vec[0]);
-  for (unsigned i = 1; i < size; ++i) { printf(", %d", (int)vec[i]); }
-  printf("\n");
-}
-
-template <typename VecType>
-void display_vec_size(VecType vec, unsigned size)
-{
-  for (unsigned i = 0; i < size; ++i) { printf("%f ", vec[i]); }
-  printf("\n");
-}
-
-template <typename VecIter>
-__host__ __device__ void display_vec(VecIter vec, int size)
-{
-  for (unsigned i = 0; i < size; ++i) { printf("%f ", (*(vec + i))); }
-  printf("\n");
-}
-
-template <typename VecType>
-__host__ __device__ void display_vec_with_idx(VecType vec, int size, int offset = 0)
-{
-  for (unsigned i = 0; i < size; ++i) { printf("idx:%d %f\n", i + offset, (*(vec + i))); }
-  printf("\n");
-}
-
-template <typename VecType>
-void display_cluster(std::vector<VecType>& vec, std::ostream& ouf = std::cout)
-{
-  for (const auto& it : vec) {
-    for (unsigned idx = 0; idx < it.size(); ++idx) { ouf << idx << " " << it[idx] << std::endl; }
-  }
-}
-
-template <typename VecType>
-int folded_print_float(VecType s)
-{
-  return printf("%f\n", s);
-}
-
-template <typename VecType1, typename... VecType2>
-int folded_print_float(VecType1 s, VecType2...
vec) -{ - return printf("%f ", s) + folded_print_float(vec...); -} - -template -int folded_print_int(VecType s) -{ - return printf("%d\n", (int)s); -} - -template -int folded_print_int(VecType1 s, VecType2... vec) -{ - return printf("%d ", (int)s) + folded_print_int(vec...); -} - -} // namespace nvlouvain diff --git a/cpp/src/nvgraph/kmeans.cu b/cpp/src/nvgraph/kmeans.cu deleted file mode 100644 index 691df3e5ced..00000000000 --- a/cpp/src/nvgraph/kmeans.cu +++ /dev/null @@ -1,935 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//#ifdef NVGRAPH_PARTITION -//#ifdef DEBUG - -#include "include/kmeans.hxx" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "include/atomics.hxx" -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" - -using namespace nvgraph; - -// ========================================================= -// Useful macros -// ========================================================= - -#define BLOCK_SIZE 1024 -#define WARP_SIZE 32 -#define BSIZE_DIV_WSIZE (BLOCK_SIZE / WARP_SIZE) - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace { - -// ========================================================= -// CUDA kernels -// ========================================================= - -/// Compute distances between observation vectors and centroids -/** Block dimensions should be (warpSize, 1, - * blockSize/warpSize). Ideally, the grid is large enough so there - * are d threads in the x-direction, k threads in the y-direction, - * and n threads in the z-direction. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, d*n entries) Observation matrix. Matrix is - * stored column-major and each column is an observation - * vector. Matrix dimensions are d x n. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Output, n*k entries) Distance matrix. Matrix is - * stored column-major and the (i,j)-entry is the square of the - * Euclidean distance between the ith observation vector and jth - * centroid. Matrix dimensions are n x k. Entries must be - * initialized to zero. 
- */ -template -static __global__ void computeDistances(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists) -{ - // Loop index - IndexType_ i; - - // Block indices - IndexType_ bidx; - // Global indices - IndexType_ gidx, gidy, gidz; - - // Private memory - ValueType_ centroid_private, dist_private; - - // Global x-index indicates index of vector entry - bidx = blockIdx.x; - while (bidx * blockDim.x < d) { - gidx = threadIdx.x + bidx * blockDim.x; - - // Global y-index indicates centroid - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Load centroid coordinate from global memory - centroid_private = (gidx < d) ? centroids[IDX(gidx, gidy, d)] : 0; - - // Global z-index indicates observation vector - gidz = threadIdx.z + blockIdx.z * blockDim.z; - while (gidz < n) { - // Load observation vector coordinate from global memory - dist_private = (gidx < d) ? obs[IDX(gidx, gidz, d)] : 0; - - // Compute contribution of current entry to distance - dist_private = centroid_private - dist_private; - dist_private = dist_private * dist_private; - - // Perform reduction on warp - for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += utils::shfl_down(dist_private, i, 2 * i); - - // Write result to global memory - if (threadIdx.x == 0) atomicFPAdd(dists + IDX(gidz, gidy, n), dist_private); - - // Move to another observation vector - gidz += blockDim.z * gridDim.z; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } - - // Move to another vector entry - bidx += gridDim.x; - } -} - -/// Find closest centroid to observation vectors -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param centroids (Input, d*k entries) Centroid matrix. Matrix is - * stored column-major and each column is a centroid. Matrix - * dimensions are d x k. - * @param dists (Input/output, n*k entries) Distance matrix. Matrix - * is stored column-major and the (i,j)-entry is the square of - * the Euclidean distance between the ith observation vector and - * jth centroid. Matrix dimensions are n x k. On exit, the first - * n entries give the square of the Euclidean distance between - * observation vectors and closest centroids. - * @param codes (Output, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void minDistances(IndexType_ n, - IndexType_ k, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ - // Loop index - IndexType_ i, j; - - // Current matrix entry - ValueType_ dist_curr; - - // Smallest entry in row - ValueType_ dist_min; - IndexType_ code_min; - - // Each row in observation matrix is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Find minimum entry in row - code_min = 0; - dist_min = dists[IDX(i, 0, n)]; - for (j = 1; j < k; ++j) { - dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? 
dist_curr : dist_min; - } - - // Transfer result to global memory - dists[i] = dist_min; - codes[i] = code_min; - - // Increment cluster sizes - atomicAdd(clusterSizes + code_min, 1); - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/// Check if newly computed distances are smaller than old distances -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * - * @param n Number of observation vectors. - * @param dists_old (Input/output, n entries) Distances between - * observation vectors and closest centroids. On exit, entries - * are replaced by entries in 'dists_new' if the corresponding - * observation vectors are closest to the new centroid. - * @param dists_new (Input, n entries) Distance between observation - * vectors and new centroid. - * @param codes_old (Input/output, n entries) Cluster - * assignments. On exit, entries are replaced with 'code_new' if - * the corresponding observation vectors are closest to the new - * centroid. - * @param code_new Index associated with new centroid. - */ -template -static __global__ void minDistances2(IndexType_ n, - ValueType_* __restrict__ dists_old, - const ValueType_* __restrict__ dists_new, - IndexType_* __restrict__ codes_old, - IndexType_ code_new) -{ - // Loop index - IndexType_ i; - - // Distances - ValueType_ dist_old_private; - ValueType_ dist_new_private; - - // Each row is processed by a thread - i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - // Get old and new distances - dist_old_private = dists_old[i]; - dist_new_private = dists_new[i]; - - // Update if new distance is smaller than old distance - if (dist_new_private < dist_old_private) { - dists_old[i] = dist_new_private; - codes_old[i] = code_new; - } - - // Move to another row - i += blockDim.x * gridDim.x; - } -} - -/// Compute size of k-means clusters -/** Block and grid dimensions should be 1-dimensional. Ideally the - * grid is large enough so there are n threads. - * - * @param n Number of observation vectors. - * @param k Number of clusters. - * @param codes (Input, n entries) Cluster assignments. - * @param clusterSizes (Output, k entries) Number of points in each - * cluster. Entries must be initialized to zero. - */ -template -static __global__ void computeClusterSizes(IndexType_ n, - IndexType_ k, - const IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes) -{ - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - atomicAdd(clusterSizes + codes[i], 1); - i += blockDim.x * gridDim.x; - } -} - -/// Divide rows of centroid matrix by cluster sizes -/** Divides the ith column of the sum matrix by the size of the ith - * cluster. If the sum matrix has been initialized so that the ith - * row is the sum of all observation vectors in the ith cluster, - * this kernel produces cluster centroids. The grid and block - * dimensions should be 2-dimensional. Ideally the grid is large - * enough so there are d threads in the x-direction and k threads - * in the y-direction. - * - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param clusterSizes (Input, k entries) Number of points in each - * cluster. - * @param centroids (Input/output, d*k entries) Sum matrix. Matrix - * is stored column-major and matrix dimensions are d x k. The - * ith column is the sum of all observation vectors in the ith - * cluster. On exit, the matrix is the centroid matrix (each - * column is the mean position of a cluster). 
- */ -template -static __global__ void divideCentroids(IndexType_ d, - IndexType_ k, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids) -{ - // Global indices - IndexType_ gidx, gidy; - - // Current cluster size - IndexType_ clusterSize_private; - - // Observation vector is determined by global y-index - gidy = threadIdx.y + blockIdx.y * blockDim.y; - while (gidy < k) { - // Get cluster size from global memory - clusterSize_private = clusterSizes[gidy]; - - // Add vector entries to centroid matrix - // Vector entris are determined by global x-index - gidx = threadIdx.x + blockIdx.x * blockDim.x; - while (gidx < d) { - centroids[IDX(gidx, gidy, d)] /= clusterSize_private; - gidx += blockDim.x * gridDim.x; - } - - // Move to another centroid - gidy += blockDim.y * gridDim.y; - } -} - -// ========================================================= -// Helper functions -// ========================================================= - -/// Randomly choose new centroids -/** Centroid is randomly chosen with k-means++ algorithm. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param rand Random number drawn uniformly from [0,1). - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are n x d. - * @param dists (Input, device memory, 2*n entries) Workspace. The - * first n entries should be the distance between observation - * vectors and the closest centroid. - * @param centroid (Output, device memory, d entries) Centroid - * coordinates. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int chooseNewCentroid(IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ rand, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ dists, - ValueType_* __restrict__ centroid) -{ - using namespace thrust; - - // Cumulative sum of distances - ValueType_* distsCumSum = dists + n; - // Residual sum of squares - ValueType_ distsSum; - // Observation vector that is chosen as new centroid - IndexType_ obsIndex; - - // Compute cumulative sum of distances - inclusive_scan( - device_pointer_cast(dists), device_pointer_cast(dists + n), device_pointer_cast(distsCumSum)); - cudaCheckError(); - CHECK_CUDA( - cudaMemcpy(&distsSum, distsCumSum + n - 1, sizeof(ValueType_), cudaMemcpyDeviceToHost)); - - // Randomly choose observation vector - // Probabilities are proportional to square of distance to closest - // centroid (see k-means++ algorithm) - obsIndex = - (lower_bound( - device_pointer_cast(distsCumSum), device_pointer_cast(distsCumSum + n), distsSum * rand) - - device_pointer_cast(distsCumSum)); - cudaCheckError(); - obsIndex = max(obsIndex, 0); - obsIndex = min(obsIndex, n - 1); - - // Record new centroid position - CHECK_CUDA(cudaMemcpyAsync( - centroid, obs + IDX(0, obsIndex, d), d * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); - - return 0; -} - -/// Choose initial cluster centroids for k-means algorithm -/** Centroids are randomly chosen with k-means++ algorithm - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. 
Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param dists (Output, device memory, 2*n entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int initializeCentroids(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - ValueType_* __restrict__ centroids, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ dists) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Loop index - IndexType_ i; - - // CUDA grid dimensions - dim3 blockDim_warp, gridDim_warp, gridDim_block; - - // Random number generator - thrust::default_random_engine rng(123456); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Initialize grid dimensions - blockDim_warp.x = WARP_SIZE; - blockDim_warp.y = 1; - blockDim_warp.z = BSIZE_DIV_WSIZE; - gridDim_warp.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim_warp.y = 1; - gridDim_warp.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim_block.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim_block.y = 1; - gridDim_block.z = 1; - - // Assign observation vectors to code 0 - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); - - // Choose first centroid - thrust::fill(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); - cudaCheckError(); - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids)) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from first centroid - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, dists); - cudaCheckError() - - // Choose remaining centroids - for (i = 1; i < k; ++i) - { - // Choose ith centroid - if (chooseNewCentroid(n, d, k, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) - WARNING("error in k-means++ (could not pick centroid)"); - - // Compute distances from ith centroid - CHECK_CUDA(cudaMemsetAsync(dists + n, 0, n * sizeof(ValueType_))); - computeDistances<<>>( - n, d, 1, obs, centroids + IDX(0, i, d), dists + n); - cudaCheckError(); - - // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); - cudaCheckError(); - } - - // Compute cluster sizes - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); - computeClusterSizes<<>>(n, k, codes, clusterSizes); - cudaCheckError(); - - return 0; -} - -/// Find cluster centroids closest to observation vectors -/** Distance is measured with Euclidean norm. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param centroids (Input, device memory, d*k entries) Centroid - * matrix. 
Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param dists (Output, device memory, n*k entries) Workspace. On - * exit, the first n entries give the square of the Euclidean - * distance between observation vectors and the closest centroid. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares of assignment. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int assignCentroids(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - const ValueType_* __restrict__ centroids, - ValueType_* __restrict__ dists, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* residual_host) -{ - // CUDA grid dimensions - dim3 blockDim, gridDim; - - // Compute distance between centroids and observation vectors - CHECK_CUDA(cudaMemsetAsync(dists, 0, n * k * sizeof(ValueType_))); - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - computeDistances<<>>(n, d, k, obs, centroids, dists); - cudaCheckError(); - - // Find centroid closest to each observation vector - CHECK_CUDA(cudaMemsetAsync(clusterSizes, 0, k * sizeof(IndexType_))); - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); - cudaCheckError(); - - // Compute residual sum of squares - *residual_host = - thrust::reduce(thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); - - return 0; -} - -/// Update cluster centroids for k-means algorithm -/** All clusters are assumed to be non-empty. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Input, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Input, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*d entries) Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. - * @return Zero if successful. Otherwise non-zero. 
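Since the device implementation below reaches for a transpose plus stable_sort_by_key/reduce_by_key pipeline, a plain host reference for the same contract may help; this is an editorial sketch under the layout assumptions documented above, not code from the deleted file.

#include <algorithm>
#include <cstdio>
#include <vector>

// Host reference for updateCentroids: accumulate the observation vectors of
// each cluster into its centroid column, then divide by the cluster size.
// Layouts are column-major; clusters are assumed non-empty, as above.
void update_centroids_ref(int n, int d, int k,
                          const std::vector<float>& obs,    // d*n observations
                          const std::vector<int>& codes,    // n cluster labels
                          const std::vector<int>& sizes,    // k cluster sizes
                          std::vector<float>& centroids) {  // d*k output
  std::fill(centroids.begin(), centroids.end(), 0.0f);
  for (int i = 0; i < n; ++i)
    for (int e = 0; e < d; ++e)
      centroids[e + codes[i] * d] += obs[e + i * d];
  for (int j = 0; j < k; ++j)
    for (int e = 0; e < d; ++e)
      centroids[e + j * d] /= static_cast<float>(sizes[j]);  // mean position
}

int main() {
  // Three 1-D observations in two clusters: {1, 3} -> mean 2, {10} -> mean 10.
  std::vector<float> obs = {1, 3, 10}, centroids(2, 0.0f);
  std::vector<int> codes = {0, 0, 1}, sizes = {2, 1};
  update_centroids_ref(3, 1, 2, obs, codes, sizes, centroids);
  std::printf("%f %f\n", centroids[0], centroids[1]);  // 2.000000 10.000000
  return 0;
}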
- */ -template -static int updateCentroids(IndexType_ n, - IndexType_ d, - IndexType_ k, - const ValueType_* __restrict__ obs, - const IndexType_* __restrict__ codes, - const IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int) -{ - using namespace thrust; - - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Useful constants - const ValueType_ one = 1; - const ValueType_ zero = 0; - - // CUDA grid dimensions - dim3 blockDim, gridDim; - - // Device memory - device_ptr obs_copy(work); - device_ptr codes_copy(work_int); - device_ptr rows(work_int + d * n); - - // Take transpose of observation matrix - Cublas::geam( - true, false, n, d, &one, obs, d, &zero, (ValueType_*)NULL, n, raw_pointer_cast(obs_copy), n); - - // Cluster assigned to each observation matrix entry - sequence(rows, rows + d * n); - cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, modulus()); - cudaCheckError(); - gather(rows, rows + d * n, device_pointer_cast(codes), codes_copy); - cudaCheckError(); - - // Row associated with each observation matrix entry - sequence(rows, rows + d * n); - cudaCheckError(); - transform(rows, rows + d * n, make_constant_iterator(n), rows, divides()); - cudaCheckError(); - - // Sort and reduce to add observation vectors in same cluster - stable_sort_by_key(codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); - cudaCheckError(); - reduce_by_key(rows, - rows + d * n, - obs_copy, - codes_copy, // Output to codes_copy is ignored - device_pointer_cast(centroids)); - cudaCheckError(); - - // Divide sums by cluster size to get centroid matrix - blockDim.x = WARP_SIZE; - blockDim.y = BLOCK_SIZE / WARP_SIZE; - blockDim.z = 1; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, 65535); - gridDim.z = 1; - divideCentroids<<>>(d, k, clusterSizes, centroids); - cudaCheckError(); - - return 0; -} - -} // namespace - -namespace nvgraph { - -// ========================================================= -// k-means algorithm -// ========================================================= - -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param clusterSizes (Output, device memory, k entries) Number of - * points in each cluster. - * @param centroids (Output, device memory, d*k entries) Centroid - * matrix. Matrix is stored column-major and each column is a - * centroid. Matrix dimensions are d x k. - * @param work (Output, device memory, n*max(k,d) entries) - * Workspace. - * @param work_int (Output, device memory, 2*d*n entries) - * Workspace. 
- * @param residual_host (Output, host memory, 1 entry) Residual sum - * of squares (sum of squares of distances between observation - * vectors and centroids). - * @param iters_host (Output, host memory, 1 entry) Number of - * k-means iterations. - * @return NVGRAPH error flag. - */ -template -NVGRAPH_ERROR kmeans(IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - IndexType_* __restrict__ clusterSizes, - ValueType_* __restrict__ centroids, - ValueType_* __restrict__ work, - IndexType_* __restrict__ work_int, - ValueType_* residual_host, - IndexType_* iters_host) -{ - // ------------------------------------------------------- - // Variable declarations - // ------------------------------------------------------- - - // Current iteration - IndexType_ iter; - - // Residual sum of squares at previous iteration - ValueType_ residualPrev = 0; - - // Random number generator - thrust::default_random_engine rng(123456); - thrust::uniform_real_distribution uniformDist(0, 1); - - // ------------------------------------------------------- - // Initialization - // ------------------------------------------------------- - - // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - - // Trivial cases - if (k == 1) { - CHECK_CUDA(cudaMemsetAsync(codes, 0, n * sizeof(IndexType_))); - CHECK_CUDA(cudaMemcpyAsync(clusterSizes, &n, sizeof(IndexType_), cudaMemcpyHostToDevice)); - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not compute k-means centroids"); - dim3 blockDim, gridDim; - blockDim.x = WARP_SIZE; - blockDim.y = 1; - blockDim.z = BLOCK_SIZE / WARP_SIZE; - gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, 65535); - gridDim.y = 1; - gridDim.z = min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), 65535); - CHECK_CUDA(cudaMemsetAsync(work, 0, n * k * sizeof(ValueType_))); - computeDistances<<>>(n, d, 1, obs, centroids, work); - cudaCheckError(); - *residual_host = - thrust::reduce(thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); - cudaCheckError(); - return NVGRAPH_OK; - } - if (n <= k) { - thrust::sequence(thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); - cudaCheckError(); - thrust::fill_n(thrust::device_pointer_cast(clusterSizes), n, 1); - cudaCheckError(); - - if (n < k) CHECK_CUDA(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(IndexType_))); - CHECK_CUDA( - cudaMemcpyAsync(centroids, obs, d * n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); - *residual_host = 0; - return NVGRAPH_OK; - } - - // Initialize cuBLAS - Cublas::set_pointer_mode_host(); - - // ------------------------------------------------------- - // k-means++ algorithm - // ------------------------------------------------------- - - // Choose initial cluster centroids - if (initializeCentroids(n, d, k, obs, centroids, codes, clusterSizes, work)) - WARNING("could not initialize k-means centroids"); - - // Apply k-means iteration until convergence 
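// Editorial summary of the loop that follows (these comment lines are not
// part of the deleted file): each iteration recomputes centroids as cluster
// means (updateCentroids), reassigns every observation to its nearest
// centroid and accumulates the residual sum of squares (assignCentroids),
// redraws any empty cluster with a fresh k-means++ sample
// (chooseNewCentroid), and stops early once the per-observation residual
// change, fabs(residualPrev - *residual_host) / n, drops below tol.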
- for (iter = 0; iter < maxiter; ++iter) { - // Update cluster centroids - if (updateCentroids(n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) - WARNING("could not update k-means centroids"); - - // Determine centroid closest to each observation - residualPrev = *residual_host; - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - - // Reinitialize empty clusters with new centroids - IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - - // FIXME: emptyCentroid never reaches k (infinite loop) under certain - // conditions, such as if obs is corrupt (as seen as a result of a - // DataFrame column of NULL edge vals used to create the Graph) - while (emptyCentroid < k) { - if (chooseNewCentroid( - n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) - WARNING("could not replace empty centroid"); - if (assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) - WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); - cudaCheckError(); - } - - // Check for convergence - if (fabs(residualPrev - (*residual_host)) / n < tol) { - ++iter; - break; - } - } - - // Warning if k-means has failed to converge - if (fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); - - *iters_host = iter; - return NVGRAPH_OK; -} - -/// Find clusters with k-means algorithm -/** Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * - * CNMEM must be initialized before calling this function. - * - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param residual On exit, residual sum of squares (sum of squares - * of distances between observation vectors and centroids). - * @param On exit, number of k-means iterations. 
- * @return NVGRAPH error flag - */ -template -NVGRAPH_ERROR kmeans(IndexType_ n, - IndexType_ d, - IndexType_ k, - ValueType_ tol, - IndexType_ maxiter, - const ValueType_* __restrict__ obs, - IndexType_* __restrict__ codes, - ValueType_& residual, - IndexType_& iters) -{ - // Check that parameters are valid - if (n < 1) { - WARNING("invalid parameter (n<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (d < 1) { - WARNING("invalid parameter (d<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (k < 1) { - WARNING("invalid parameter (k<1)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (tol < 0) { - WARNING("invalid parameter (tol<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - if (maxiter < 0) { - WARNING("invalid parameter (maxiter<0)"); - return NVGRAPH_ERR_BAD_PARAMETERS; - } - - // Allocate memory - // TODO: handle non-zero CUDA streams - cudaStream_t stream = 0; - Vector clusterSizes(k, stream); - Vector centroids(d * k, stream); - Vector work(n * max(k, d), stream); - Vector work_int(2 * d * n, stream); - - // Perform k-means - return kmeans(n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters); -} - -// ========================================================= -// Explicit instantiations -// ========================================================= - -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - float tol, - int maxiter, - const float* __restrict__ obs, - int* __restrict__ codes, - float& residual, - int& iters); -template NVGRAPH_ERROR kmeans(int n, - int d, - int k, - double tol, - int maxiter, - const double* __restrict__ obs, - int* __restrict__ codes, - double& residual, - int& iters); -} // namespace nvgraph -//#endif //NVGRAPH_PARTITION -//#endif //debug diff --git a/cpp/src/nvgraph/lanczos.cu b/cpp/src/nvgraph/lanczos.cu deleted file mode 100644 index ad49be1c059..00000000000 --- a/cpp/src/nvgraph/lanczos.cu +++ /dev/null @@ -1,1487 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//#ifdef NVGRAPH_PARTITION - -#define _USE_MATH_DEFINES -#include -#include "include/lanczos.hxx" - -#include -#include -#include - -#include -#include - -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_lapack.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/nvgraph_vector_kernels.hxx" -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace nvgraph { - -namespace { - -// ========================================================= -// Helper functions -// ========================================================= - -/// Perform Lanczos iteration -/** Lanczos iteration is performed on a shifted matrix A+shift*I. - * - * @param A Matrix. 
- * @param iter Pointer to current Lanczos iteration. On exit, the - * variable is set equal to the final Lanczos iteration. - * @param maxIter Maximum Lanczos iteration. This function will - * perform a maximum of maxIter-*iter iterations. - * @param shift Matrix shift. - * @param tol Convergence tolerance. Lanczos iteration will - * terminate when the residual norm (i.e. entry in beta_host) is - * less than tol. - * @param reorthogonalize Whether to reorthogonalize Lanczos - * vectors. - * @param alpha_host (Output, host memory, maxIter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Output, host memory, maxIter entries) - * Off-diagonal entries of Lanczos system. - * @param lanczosVecs_dev (Input/output, device memory, - * n*(maxIter+1) entries) Lanczos vectors. Vectors are stored as - * columns of a column-major matrix with dimensions - * n x (maxIter+1). - * @param work_dev (Output, device memory, maxIter entries) - * Workspace. Not needed if full reorthogonalization is disabled. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int performLanczosIteration(const Matrix *A, - IndexType_ *iter, - IndexType_ maxIter, - ValueType_ shift, - ValueType_ tol, - bool reorthogonalize, - ValueType_ *__restrict__ alpha_host, - ValueType_ *__restrict__ beta_host, - ValueType_ *__restrict__ lanczosVecs_dev, - ValueType_ *__restrict__ work_dev) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Useful variables - const ValueType_ one = 1; - const ValueType_ negOne = -1; - const ValueType_ zero = 0; - - IndexType_ n = A->n; - - // ------------------------------------------------------- - // Compute second Lanczos vector - // ------------------------------------------------------- - if (*iter <= 0) { - *iter = 1; - - // Apply matrix - if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync( - lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(ValueType_), cudaMemcpyDeviceToDevice)); - A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); - - // Orthogonalize Lanczos vector - Cublas::dot(n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host); - Cublas::axpy(n, -alpha_host[0], lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1); - beta_host[0] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, 1, n), 1); - - // Check if Lanczos has converged - if (beta_host[0] <= tol) return 0; - - // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[0], lanczosVecs_dev + IDX(0, 1, n), 1); - } - - // ------------------------------------------------------- - // Compute remaining Lanczos vectors - // ------------------------------------------------------- - - while (*iter < maxIter) { - ++(*iter); - - // Apply matrix - if (shift != 0) - CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(ValueType_), - cudaMemcpyDeviceToDevice)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); - - // Full reorthogonalization - // "Twice is enough" algorithm per Kahan and Parlett - if (reorthogonalize) { - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - CHECK_CUDA(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(ValueType_), - 
cudaMemcpyDeviceToHost)); - Cublas::gemv(true, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1); - Cublas::gemv(false, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - } - - // Orthogonalization with 3-term recurrence relation - else { - Cublas::dot(n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1)); - Cublas::axpy(n, - -alpha_host[*iter - 1], - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - Cublas::axpy(n, - -beta_host[*iter - 2], - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1); - } - - // Compute residual - beta_host[*iter - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, *iter, n), 1); - - // Check if Lanczos has converged - if (beta_host[*iter - 1] <= tol) break; - // Normalize Lanczos vector - Cublas::scal(n, 1 / beta_host[*iter - 1], lanczosVecs_dev + IDX(0, *iter, n), 1); - } - - CHECK_CUDA(cudaDeviceSynchronize()); - - return 0; -} - -/// Find Householder transform for 3-dimensional system -/** Given an input vector v=[x,y,z]', this function finds a - * Householder transform P such that P*v is a multiple of - * e_1=[1,0,0]'. The input vector v is overwritten with the - * Householder vector such that P=I-2*v*v'. - * - * @param v (Input/output, host memory, 3 entries) Input - * 3-dimensional vector. On exit, the vector is set to the - * Householder vector. - * @param Pv (Output, host memory, 1 entry) First entry of P*v - * (here v is the input vector). Either equal to ||v||_2 or - * -||v||_2. - * @param P (Output, host memory, 9 entries) Householder transform - * matrix. Matrix dimensions are 3 x 3. - */ -template -static void findHouseholder3(ValueType_ *v, ValueType_ *Pv, ValueType_ *P) -{ - // Compute norm of vector - *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - - // Choose whether to reflect to e_1 or -e_1 - // This choice avoids catastrophic cancellation - if (v[0] >= 0) *Pv = -(*Pv); - v[0] -= *Pv; - - // Normalize Householder vector - ValueType_ normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - if (normHouseholder != 0) { - v[0] /= normHouseholder; - v[1] /= normHouseholder; - v[2] /= normHouseholder; - } else { - v[0] = 0; - v[1] = 0; - v[2] = 0; - } - - // Construct Householder matrix - IndexType_ i, j; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; -} - -/// Apply 3-dimensional Householder transform to 4 x 4 matrix -/** The Householder transform is pre-applied to the top three rows - * of the matrix and post-applied to the left three columns. The - * 4 x 4 matrix is intended to contain the bulge that is produced - * in the Francis QR algorithm. - * - * @param v (Input, host memory, 3 entries) Householder vector. - * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. 
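The two Householder helpers above lean on standard identities: for a unit vector v, P = I - 2*v*v' is symmetric and is its own inverse, and findHouseholder3 picks v so that P maps its input onto a multiple of e_1. The short editorial program below checks the involution property numerically; it is a sketch with a hard-coded unit v, not code from the deleted file.

#include <cmath>
#include <cstdio>

int main() {
  // Unit Householder vector (any unit v works for the identity check).
  double v[3] = {1.0 / std::sqrt(3), 1.0 / std::sqrt(3), 1.0 / std::sqrt(3)};
  double P[3][3];
  // Build the reflector P = I - 2*v*v'.
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j)
      P[i][j] = (i == j ? 1.0 : 0.0) - 2.0 * v[i] * v[j];

  // P*P should be the identity (reflections are involutions).
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      double s = 0;
      for (int e = 0; e < 3; ++e) s += P[i][e] * P[e][j];
      std::printf("%6.3f ", s);
    }
    std::printf("\n");  // prints the 3x3 identity, up to rounding
  }
  return 0;
}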
- */ -template -static void applyHouseholder3(const ValueType_ *v, ValueType_ *A) -{ - // Loop indices - IndexType_ i, j; - // Dot product between Householder vector and matrix row/column - ValueType_ vDotA; - - // Pre-apply Householder transform - for (j = 0; j < 4; ++j) { - vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; - } - - // Post-apply Householder transform - for (i = 0; i < 4; ++i) { - vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; - } -} - -/// Perform one step of Francis QR algorithm -/** Equivalent to two steps of the classical QR algorithm on a - * tridiagonal matrix. - * - * @param n Matrix dimension. - * @param shift1 QR algorithm shift. - * @param shift2 QR algorithm shift. - * @param alpha (Input/output, host memory, n entries) Diagonal - * entries of tridiagonal matrix. - * @param beta (Input/output, host memory, n-1 entries) - * Off-diagonal entries of tridiagonal matrix. - * @param V (Input/output, host memory, n*n entries) Orthonormal - * transforms from previous steps of QR algorithm. Matrix - * dimensions are n x n. On exit, the orthonormal transform from - * this Francis QR step is post-applied to the matrix. - * @param work (Output, host memory, 3*n entries) Workspace. - * @return Zero if successful. Otherwise non-zero. - */ -template -static int francisQRIteration(IndexType_ n, - ValueType_ shift1, - ValueType_ shift2, - ValueType_ *alpha, - ValueType_ *beta, - ValueType_ *V, - ValueType_ *work) -{ - // ------------------------------------------------------- - // Variable declaration - // ------------------------------------------------------- - - // Temporary storage of 4x4 bulge and Householder vector - ValueType_ bulge[16]; - - // Householder vector - ValueType_ householder[3]; - // Householder matrix - ValueType_ householderMatrix[3 * 3]; - - // Shifts are roots of the polynomial p(x)=x^2+b*x+c - ValueType_ b = -shift1 - shift2; - ValueType_ c = shift1 * shift2; - - // Loop indices - IndexType_ i, j, pos; - // Temporary variable - ValueType_ temp; - - // ------------------------------------------------------- - // Implementation - // ------------------------------------------------------- - - // Compute initial Householder transform - householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; - householder[1] = beta[0] * (alpha[0] + alpha[1] + b); - householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); - - // Apply initial Householder transform to create bulge - memset(bulge, 0, 16 * sizeof(ValueType_)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; - for (i = 0; i < 3; ++i) { - bulge[IDX(i + 1, i, 4)] = beta[i]; - bulge[IDX(i, i + 1, 4)] = beta[i]; - } - applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); - memcpy(V, work, 3 * n * sizeof(ValueType_)); - - // Chase bulge to bottom-right of matrix with Householder transforms - for (pos = 0; pos < n - 4; ++pos) { - // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = beta[pos + 3]; - bulge[IDX(0, 3, 4)] 
= 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = beta[pos + 3]; - bulge[IDX(3, 3, 4)] = alpha[pos + 4]; - - // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(ValueType_)); - } - - // Apply penultimate Householder transform - // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = bulge[IDX(3, 0, 4)]; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - bulge[IDX(3, 0, 4)] = 0; - bulge[IDX(3, 1, 4)] = 0; - bulge[IDX(3, 2, 4)] = 0; - bulge[IDX(0, 3, 4)] = 0; - bulge[IDX(1, 3, 4)] = 0; - bulge[IDX(2, 3, 4)] = 0; - bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(ValueType_)); - - // Apply final Householder transform - // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; - householder[0] = bulge[IDX(1, 0, 4)]; - householder[1] = bulge[IDX(2, 0, 4)]; - householder[2] = 0; - for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); - applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); - memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(ValueType_)); - - // Bulge has been eliminated - alpha[n - 2] = bulge[IDX(0, 0, 4)]; - alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; - - return 0; -} - -/// Perform implicit restart of Lanczos algorithm -/** Shifts are Chebyshev nodes of unwanted region of matrix spectrum. - * - * @param n Matrix dimension. - * @param iter Current Lanczos iteration. - * @param iter_new Lanczos iteration after restart. - * @param shiftUpper Pointer to upper bound for unwanted - * region. Value is ignored if less than *shiftLower. If a - * stronger upper bound has been found, the value is updated on - * exit. - * @param shiftLower Pointer to lower bound for unwanted - * region. Value is ignored if greater than *shiftUpper. If a - * stronger lower bound has been found, the value is updated on - * exit. - * @param alpha_host (Input/output, host memory, iter entries) - * Diagonal entries of Lanczos system. - * @param beta_host (Input/output, host memory, iter entries) - * Off-diagonal entries of Lanczos system. - * @param V_host (Output, host memory, iter*iter entries) - * Orthonormal transform used to obtain restarted system. Matrix - * dimensions are iter x iter. - * @param work_host (Output, host memory, 4*iter entries) - * Workspace. - * @param lanczosVecs_dev (Input/output, device memory, n*(iter+1) - * entries) Lanczos vectors. Vectors are stored as columns of a - * column-major matrix with dimensions n x (iter+1). - * @param work_dev (Output, device memory, (n+iter)*iter entries) - * Workspace. 
- */
-template <typename IndexType_, typename ValueType_>
-static int lanczosRestart(IndexType_ n,
-                          IndexType_ iter,
-                          IndexType_ iter_new,
-                          ValueType_ *shiftUpper,
-                          ValueType_ *shiftLower,
-                          ValueType_ *__restrict__ alpha_host,
-                          ValueType_ *__restrict__ beta_host,
-                          ValueType_ *__restrict__ V_host,
-                          ValueType_ *__restrict__ work_host,
-                          ValueType_ *__restrict__ lanczosVecs_dev,
-                          ValueType_ *__restrict__ work_dev,
-                          bool smallest_eig)
-{
-  // -------------------------------------------------------
-  // Variable declaration
-  // -------------------------------------------------------
-
-  // Useful constants
-  const ValueType_ zero = 0;
-  const ValueType_ one  = 1;
-
-  // Loop index
-  IndexType_ i;
-
-  // Number of implicit restart steps
-  // Assumed to be even since each call to Francis algorithm is
-  // equivalent to two calls of QR algorithm
-  IndexType_ restartSteps = iter - iter_new;
-
-  // Ritz values from Lanczos method
-  ValueType_ *ritzVals_host = work_host + 3 * iter;
-  // Shifts for implicit restart
-  ValueType_ *shifts_host;
-
-  // Orthonormal matrix for similarity transform
-  ValueType_ *V_dev = work_dev + n * iter;
-
-  // -------------------------------------------------------
-  // Implementation
-  // -------------------------------------------------------
-
-  // Compute Ritz values
-  memcpy(ritzVals_host, alpha_host, iter * sizeof(ValueType_));
-  memcpy(work_host, beta_host, (iter - 1) * sizeof(ValueType_));
-  Lapack<ValueType_>::sterf(iter, ritzVals_host, work_host);
-
-  // Debug: Print largest eigenvalues
-  // for (int i = iter-iter_new; i < iter; ++i)
-  //   std::cout <<*(ritzVals_host+i)<< " ";
-  // std::cout <<std::endl;
-
-  // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(ValueType_));
-  for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1;
-
-  // Determine interval to suppress eigenvalues
-  if (smallest_eig) {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - 1];
-      *shiftLower = ritzVals_host[iter_new];
-    } else {
-      *shiftUpper = max(*shiftUpper, ritzVals_host[iter - 1]);
-      *shiftLower = min(*shiftLower, ritzVals_host[iter_new]);
-    }
-  } else {
-    if (*shiftLower > *shiftUpper) {
-      *shiftUpper = ritzVals_host[iter - iter_new - 1];
-      *shiftLower = ritzVals_host[0];
-    } else {
-      *shiftUpper = max(*shiftUpper, ritzVals_host[iter - iter_new - 1]);
-      *shiftLower = min(*shiftLower, ritzVals_host[0]);
-    }
-  }
-
-  // Calculate Chebyshev nodes as shifts
-  shifts_host = ritzVals_host;
-  for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] = cos((i + 0.5) * static_cast<ValueType_>(M_PI) / restartSteps);
-    shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
-    shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
-  }
-
-  // Apply Francis QR algorithm to implicitly restart Lanczos
-  for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(
-          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
-      WARNING("error in implicitly shifted QR algorithm");
-
-  // Obtain new residual
-  CHECK_CUDA(
-    cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(ValueType_), cudaMemcpyHostToDevice));
-
-  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  Cublas::gemv(false,
-               n,
-               iter,
-               beta_host + iter_new - 1,
-               lanczosVecs_dev,
-               n,
-               V_dev + IDX(0, iter_new, iter),
-               1,
-               beta_host + iter - 1,
-               lanczosVecs_dev + IDX(0, iter, n),
-               1);
-
-  // Obtain new Lanczos vectors
-  Cublas::gemm(
-    false, false, n, iter_new, iter, &one, lanczosVecs_dev, n, V_dev, iter, &zero, work_dev, n);
-
-  CHECK_CUDA(cudaMemcpyAsync(
-    lanczosVecs_dev, work_dev, n * iter_new * sizeof(ValueType_), cudaMemcpyDeviceToDevice));
-
-  // Normalize residual to obtain new Lanczos vector
-  CHECK_CUDA(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
-                             lanczosVecs_dev + IDX(0, iter, n),
-                             n * sizeof(ValueType_),
-                             cudaMemcpyDeviceToDevice));
-  beta_host[iter_new - 1] = Cublas::nrm2(n, lanczosVecs_dev + IDX(0, iter_new, n), 1);
-  Cublas::scal(n, 1 / beta_host[iter_new - 1], lanczosVecs_dev + IDX(0, iter_new, n), 1);
-
-  return 0;
-}
-
-}  // namespace
-
-// =========================================================
-// Eigensolver
-// =========================================================
-
-/// Compute smallest eigenvectors of symmetric matrix
-/** Computes eigenvalues and eigenvectors that are least
- * positive. If matrix is positive definite or positive
- * semidefinite, the computed eigenvalues are smallest in
- * magnitude.
- *
- * The largest eigenvalue is estimated by performing several
- * Lanczos iterations. An implicitly restarted Lanczos method is
- * then applied to A+s*I, where s is the negative of the largest
- * eigenvalue.
- *
- * @param A Matrix.
- * @param nEigVecs Number of eigenvectors to compute.
- * @param maxIter Maximum number of Lanczos steps. Does not include
- * Lanczos steps used to estimate largest eigenvalue.
- * @param restartIter Maximum size of Lanczos system before
- * performing an implicit restart. Should be at least 4.
- * @param tol Convergence tolerance. Lanczos iteration will
- * terminate when the residual norm is less than tol*theta, where
- * theta is an estimate for the smallest unwanted eigenvalue
- * (i.e. the (nEigVecs+1)th smallest eigenvalue).
- * @param reorthogonalize Whether to reorthogonalize Lanczos
- * vectors.
- * @param effIter On exit, pointer to final size of Lanczos system.
- * @param totalIter On exit, pointer to total number of Lanczos
- * iterations performed. Does not include Lanczos steps used to
- * estimate largest eigenvalue.
- * @param shift On exit, pointer to matrix shift (estimate for
- * largest eigenvalue).
- * @param alpha_host (Output, host memory, restartIter entries)
- * Diagonal entries of Lanczos system.
- * @param beta_host (Output, host memory, restartIter entries)
- * Off-diagonal entries of Lanczos system.
- * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
- * entries) Lanczos vectors. Vectors are stored as columns of a
- * column-major matrix with dimensions n x (restartIter+1).
- * @param work_dev (Output, device memory,
- * (n+restartIter)*restartIter entries) Workspace.
- * @param eigVals_dev (Output, device memory, nEigVecs entries)
- * Smallest eigenvalues of matrix.
- * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- * Eigenvectors corresponding to smallest eigenvalues of
- * matrix. Vectors are stored as columns of a column-major matrix
- * with dimensions n x nEigVecs.
- * @return NVGRAPH error flag.
- */
-template <typename IndexType_, typename ValueType_>
-NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> *A,
-                                          IndexType_ nEigVecs,
-                                          IndexType_ maxIter,
-                                          IndexType_ restartIter,
-                                          ValueType_ tol,
-                                          bool reorthogonalize,
-                                          IndexType_ *effIter,
-                                          IndexType_ *totalIter,
-                                          ValueType_ *shift,
-                                          ValueType_ *__restrict__ alpha_host,
-                                          ValueType_ *__restrict__ beta_host,
-                                          ValueType_ *__restrict__ lanczosVecs_dev,
-                                          ValueType_ *__restrict__ work_dev,
-                                          ValueType_ *__restrict__ eigVals_dev,
-                                          ValueType_ *__restrict__ eigVecs_dev)
-{
-  // -------------------------------------------------------
-  // Variable declaration
-  // -------------------------------------------------------
-
-  // Useful constants
-  const ValueType_ one  = 1;
-  const ValueType_ zero = 0;
-
-  // Matrix dimension
-  IndexType_ n = A->n;
-
-  // Shift for implicit restart
-  ValueType_ shiftUpper;
-  ValueType_ shiftLower;
-
-  // Lanczos iteration counters
-  IndexType_ maxIter_curr = restartIter;  // Maximum size of Lanczos system
-
-  // Status flags
-  int status;
-
-  // Loop index
-  IndexType_ i;
-
-  // Host memory
-  ValueType_ *Z_host;     // Eigenvectors in Lanczos basis
-  ValueType_ *work_host;  // Workspace
-
-  // -------------------------------------------------------
-  // Check that LAPACK is enabled
-  // -------------------------------------------------------
-  // Lapack<ValueType_>::check_lapack_enabled();
-
-  // -------------------------------------------------------
-  // Check that parameters are valid
-  // -------------------------------------------------------
-  if (A->m != A->n) {
-    WARNING("invalid parameter (matrix is not square)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs < 1) {
-    WARNING("invalid parameter (nEigVecs<1)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (restartIter < 1) {
-    WARNING("invalid parameter (restartIter<4)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (tol < 0) {
-    WARNING("invalid parameter (tol<0)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs > n) {
-    WARNING("invalid parameters (nEigVecs>n)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (maxIter < nEigVecs) {
-    WARNING("invalid parameters (maxIter<nEigVecs)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-
-  // -------------------------------------------------------
-  // Variable initialization
-  // -------------------------------------------------------
-
-  // Total number of Lanczos iterations
-  *totalIter = 0;
-
-  // Allocate host memory
-  std::vector<ValueType_> Z_host_v(restartIter * restartIter);
-  std::vector<ValueType_> work_host_v(4 * restartIter);
-
-  Z_host    = Z_host_v.data();
-  work_host = work_host_v.data();
-
-  // Initialize cuBLAS
-  Cublas::set_pointer_mode_host();
-
-  // -------------------------------------------------------
-  // Compute largest eigenvalue to determine shift
-  // -------------------------------------------------------
-
-  // Random number generator
-  curandGenerator_t randGen;
-  // Initialize random number generator
-  CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10));
-
-  // FIXME: This is hard coded, which is good for unit testing...
-  //        but should really be a parameter so it could be
-  //        "random" for real runs and "fixed" for tests
-  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 1234567 /*time(NULL)*/));
-  // CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, time(NULL)));
-  // Initialize initial Lanczos vector
-  CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one));
-  ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1);
-  Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1);
-
-  // Estimate number of Lanczos iterations
-  //   See bounds in Kuczynski and Wozniakowski (1992).
-  // const ValueType_ relError = 0.25;  // Relative error
-  // const ValueType_ failProb = 1e-4;  // Probability of failure
-  // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1;
-  // maxIter_curr = min(maxIter_curr, restartIter);
-
-  // Obtain tridiagonal matrix with Lanczos
-  *effIter = 0;
-  *shift   = 0;
-  status   = performLanczosIteration<IndexType_, ValueType_>(A,
-                                                             effIter,
-                                                             maxIter_curr,
-                                                             *shift,
-                                                             0.0,
-                                                             reorthogonalize,
-                                                             alpha_host,
-                                                             beta_host,
-                                                             lanczosVecs_dev,
-                                                             work_dev);
-  if (status) WARNING("error in Lanczos iteration");
-
-  // Determine largest eigenvalue
-
-  Lapack<ValueType_>::sterf(*effIter, alpha_host, beta_host);
-  *shift = -alpha_host[*effIter - 1];
-  // std::cout << *shift <<std::endl;
-
-  // -------------------------------------------------------
-  // Compute eigenvectors of shifted matrix
-  // -------------------------------------------------------
-
-  // Obtain tridiagonal matrix with Lanczos
-  *effIter = 0;
-  // maxIter_curr = min(maxIter, restartIter);
-  status = performLanczosIteration<IndexType_, ValueType_>(A,
-                                                           effIter,
-                                                           maxIter_curr,
-                                                           *shift,
-                                                           0,
-                                                           reorthogonalize,
-                                                           alpha_host,
-                                                           beta_host,
-                                                           lanczosVecs_dev,
-                                                           work_dev);
-  if (status) WARNING("error in Lanczos iteration");
-  *totalIter += *effIter;
-
-  // Apply Lanczos method until convergence
-  shiftLower = 1;
-  shiftUpper = -1;
-  while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) {
-    // Determine number of restart steps
-    //   Number of steps must be even due to Francis algorithm
-    IndexType_ iter_new = nEigVecs + 1;
-    if (restartIter - (maxIter - *totalIter) > nEigVecs + 1)
-      iter_new = restartIter - (maxIter - *totalIter);
-    if ((restartIter - iter_new) % 2) iter_new -= 1;
-    if (iter_new == *effIter) break;
-
-    // Implicit restart of Lanczos method
-    status = lanczosRestart<IndexType_, ValueType_>(n,
-                                                    *effIter,
-                                                    iter_new,
-                                                    &shiftUpper,
-                                                    &shiftLower,
-                                                    alpha_host,
-                                                    beta_host,
-                                                    Z_host,
-                                                    work_host,
-                                                    lanczosVecs_dev,
-                                                    work_dev,
-                                                    true);
-    if (status) WARNING("error in Lanczos implicit restart");
-    *effIter = iter_new;
-
-    // Check for convergence
-    if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break;
-
-    // Proceed with Lanczos method
-    // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter);
-    status = performLanczosIteration<IndexType_, ValueType_>(A,
-                                                             effIter,
-                                                             maxIter_curr,
-                                                             *shift,
-                                                             tol * fabs(shiftLower),
-                                                             reorthogonalize,
-                                                             alpha_host,
-                                                             beta_host,
-                                                             lanczosVecs_dev,
-                                                             work_dev);
-    if (status) WARNING("error in Lanczos iteration");
-    *totalIter += *effIter - iter_new;
-  }
-
-  // Warning if Lanczos has failed to converge
-  if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) {
-    WARNING("implicitly restarted Lanczos failed to converge");
-  }
-
-  // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_));
-  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_));
-  Lapack<ValueType_>::steqr('I',
-                            *effIter,
-                            work_host + 2 * (*effIter),
-                            work_host + 3 * (*effIter),
-                            Z_host,
-                            *effIter,
-                            work_host);
-
-  // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
-  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
-
-  // Copy results to device memory
-  CHECK_CUDA(cudaMemcpy(eigVals_dev,
-                        work_host + 2 * (*effIter),
-                        nEigVecs * sizeof(ValueType_),
-                        cudaMemcpyHostToDevice));
-  // for (int i = 0; i < nEigVecs; ++i)
-  // {
-  //   std::cout <<*(work_host+(2*(*effIter)+i))<< std::endl;
-  // }
-  CHECK_CUDA(cudaMemcpy(
-    work_dev, Z_host, (*effIter) * nEigVecs * sizeof(ValueType_), cudaMemcpyHostToDevice));
-
-  // Convert eigenvectors from Lanczos basis to standard basis
-  Cublas::gemm(false,
-               false,
-               n,
-               nEigVecs,
-               *effIter,
-               &one,
-               lanczosVecs_dev,
-               n,
-               work_dev,
-               *effIter,
-               &zero,
-               eigVecs_dev,
-               n);
-
-  // Clean up and exit
-  CHECK_CURAND(curandDestroyGenerator(randGen));
-  return NVGRAPH_OK;
-}
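// The implicit restart above suppresses the unwanted part of the spectrum by
// running Francis QR steps whose shifts are Chebyshev nodes of the interval
// [shiftLower, shiftUpper]. A minimal host-only sketch of that shift
// computation (the helper name `chebyshev_shifts` is illustrative and not an
// nvgraph API):
#include <cmath>
#include <iostream>
#include <vector>

std::vector<double> chebyshev_shifts(double lower, double upper, int steps)
{
  const double pi = std::acos(-1.0);
  std::vector<double> shifts(steps);
  for (int i = 0; i < steps; ++i) {
    // Chebyshev node on [-1, 1], then mapped affinely onto [lower, upper]
    const double node = std::cos((i + 0.5) * pi / steps);
    shifts[i]         = 0.5 * (upper - lower) * node + 0.5 * (upper + lower);
  }
  return shifts;
}

int main()
{
  // Shifts cluster toward the endpoints of the unwanted interval [0.1, 2.0]
  for (double s : chebyshev_shifts(0.1, 2.0, 4)) std::cout << s << "\n";
  return 0;
}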
-
-/// Compute smallest eigenvectors of symmetric matrix
-/** Computes eigenvalues and eigenvectors that are least
- * positive. If matrix is positive definite or positive
- * semidefinite, the computed eigenvalues are smallest in
- * magnitude.
- *
- * The largest eigenvalue is estimated by performing several
- * Lanczos iterations. An implicitly restarted Lanczos method is
- * then applied to A+s*I, where s is the negative of the largest
- * eigenvalue.
- *
- * CNMEM must be initialized before calling this function.
- *
- * @param A Matrix.
- * @param nEigVecs Number of eigenvectors to compute.
- * @param maxIter Maximum number of Lanczos steps. Does not include
- * Lanczos steps used to estimate largest eigenvalue.
- * @param restartIter Maximum size of Lanczos system before
- * performing an implicit restart. Should be at least 4.
- * @param tol Convergence tolerance. Lanczos iteration will
- * terminate when the residual norm is less than tol*theta, where
- * theta is an estimate for the smallest unwanted eigenvalue
- * (i.e. the (nEigVecs+1)th smallest eigenvalue).
- * @param reorthogonalize Whether to reorthogonalize Lanczos
- * vectors.
- * @param iter On exit, pointer to total number of Lanczos
- * iterations performed. Does not include Lanczos steps used to
- * estimate largest eigenvalue.
- * @param eigVals_dev (Output, device memory, nEigVecs entries)
- * Smallest eigenvalues of matrix.
- * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- * Eigenvectors corresponding to smallest eigenvalues of
- * matrix. Vectors are stored as columns of a column-major matrix
- * with dimensions n x nEigVecs.
- * @return NVGRAPH error flag.
- */
-template <typename IndexType_, typename ValueType_>
-NVGRAPH_ERROR computeSmallestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
-                                          IndexType_ nEigVecs,
-                                          IndexType_ maxIter,
-                                          IndexType_ restartIter,
-                                          ValueType_ tol,
-                                          bool reorthogonalize,
-                                          IndexType_ &iter,
-                                          ValueType_ *__restrict__ eigVals_dev,
-                                          ValueType_ *__restrict__ eigVecs_dev)
-{
-  // CUDA stream
-  //   TODO: handle non-zero streams
-  cudaStream_t stream = 0;
-
-  // Matrix dimension
-  IndexType_ n = A.n;
-
-  // Check that parameters are valid
-  if (A.m != A.n) {
-    WARNING("invalid parameter (matrix is not square)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs < 1) {
-    WARNING("invalid parameter (nEigVecs<1)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (restartIter < 1) {
-    WARNING("invalid parameter (restartIter<4)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (tol < 0) {
-    WARNING("invalid parameter (tol<0)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs > n) {
-    WARNING("invalid parameters (nEigVecs>n)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (maxIter < nEigVecs) {
-    WARNING("invalid parameters (maxIter<nEigVecs)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-
-  // Allocate memory
-  std::vector<ValueType_> alpha_host_v(restartIter);
-  std::vector<ValueType_> beta_host_v(restartIter);
-
-  ValueType_ *alpha_host = alpha_host_v.data();
-  ValueType_ *beta_host  = beta_host_v.data();
-
-  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
-  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
-
-  // Perform Lanczos method
-  IndexType_ effIter;
-  ValueType_ shift;
-  NVGRAPH_ERROR status = computeSmallestEigenvectors(&A,
-                                                     nEigVecs,
-                                                     maxIter,
-                                                     restartIter,
-                                                     tol,
-                                                     reorthogonalize,
-                                                     &effIter,
-                                                     &iter,
-                                                     &shift,
-                                                     alpha_host,
-                                                     beta_host,
-                                                     lanczosVecs_dev.raw(),
-                                                     work_dev.raw(),
-                                                     eigVals_dev,
-                                                     eigVecs_dev);
-
-  // Clean up and return
-  return status;
-}
-
-// =========================================================
-// Eigensolver
-// =========================================================
-
-/// Compute largest eigenvectors of symmetric matrix
-/** Computes the most positive eigenvalues and the corresponding
- * eigenvectors. If matrix is positive definite or positive
- * semidefinite, the computed eigenvalues are largest in
- * magnitude.
- *
- * The largest eigenvalue is estimated by performing several
- * Lanczos iterations. An implicitly restarted Lanczos method is
- * then applied.
- *
- * @param A Matrix.
- * @param nEigVecs Number of eigenvectors to compute.
- * @param maxIter Maximum number of Lanczos steps.
- * @param restartIter Maximum size of Lanczos system before
- * performing an implicit restart. Should be at least 4.
- * @param tol Convergence tolerance. Lanczos iteration will
- * terminate when the residual norm is less than tol*theta, where
- * theta is an estimate for the largest unwanted eigenvalue
- * (i.e. the (nEigVecs+1)th largest eigenvalue).
- * @param reorthogonalize Whether to reorthogonalize Lanczos
- * vectors.
- * @param effIter On exit, pointer to final size of Lanczos system.
- * @param totalIter On exit, pointer to total number of Lanczos
- * iterations performed.
- * @param alpha_host (Output, host memory, restartIter entries)
- * Diagonal entries of Lanczos system.
- * @param beta_host (Output, host memory, restartIter entries)
- * Off-diagonal entries of Lanczos system.
- * @param lanczosVecs_dev (Output, device memory, n*(restartIter+1)
- * entries) Lanczos vectors. Vectors are stored as columns of a
- * column-major matrix with dimensions n x (restartIter+1).
- * @param work_dev (Output, device memory,
- * (n+restartIter)*restartIter entries) Workspace.
- * @param eigVals_dev (Output, device memory, nEigVecs entries)
- * Largest eigenvalues of matrix.
- * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- * Eigenvectors corresponding to largest eigenvalues of
- * matrix. Vectors are stored as columns of a column-major matrix
- * with dimensions n x nEigVecs.
- * @return NVGRAPH error flag.
- */
-template <typename IndexType_, typename ValueType_>
-NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> *A,
-                                         IndexType_ nEigVecs,
-                                         IndexType_ maxIter,
-                                         IndexType_ restartIter,
-                                         ValueType_ tol,
-                                         bool reorthogonalize,
-                                         IndexType_ *effIter,
-                                         IndexType_ *totalIter,
-                                         ValueType_ *__restrict__ alpha_host,
-                                         ValueType_ *__restrict__ beta_host,
-                                         ValueType_ *__restrict__ lanczosVecs_dev,
-                                         ValueType_ *__restrict__ work_dev,
-                                         ValueType_ *__restrict__ eigVals_dev,
-                                         ValueType_ *__restrict__ eigVecs_dev)
-{
-  // -------------------------------------------------------
-  // Variable declaration
-  // -------------------------------------------------------
-
-  // Useful constants
-  const ValueType_ one  = 1;
-  const ValueType_ zero = 0;
-
-  // Matrix dimension
-  IndexType_ n = A->n;
-
-  // Lanczos iteration counters
-  IndexType_ maxIter_curr = restartIter;  // Maximum size of Lanczos system
-
-  // Status flags
-  int status;
-
-  // Loop index
-  IndexType_ i;
-
-  // Host memory
-  ValueType_ *Z_host;     // Eigenvectors in Lanczos basis
-  ValueType_ *work_host;  // Workspace
-
-  // -------------------------------------------------------
-  // Check that LAPACK is enabled
-  // -------------------------------------------------------
-  // Lapack<ValueType_>::check_lapack_enabled();
-
-  // -------------------------------------------------------
-  // Check that parameters are valid
-  // -------------------------------------------------------
-  if (A->m != A->n) {
-    WARNING("invalid parameter (matrix is not square)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs < 1) {
-    WARNING("invalid parameter (nEigVecs<1)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (restartIter < 1) {
-    WARNING("invalid parameter (restartIter<4)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (tol < 0) {
-    WARNING("invalid parameter (tol<0)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs > n) {
-    WARNING("invalid parameters (nEigVecs>n)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (maxIter < nEigVecs) {
-    WARNING("invalid parameters (maxIter<nEigVecs)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-
-  // -------------------------------------------------------
-  // Variable initialization
-  // -------------------------------------------------------
-
-  // Total number of Lanczos iterations
-  *totalIter = 0;
-
-  // Allocate host memory
-  std::vector<ValueType_> Z_host_v(restartIter * restartIter);
-  std::vector<ValueType_> work_host_v(4 * restartIter);
-
-  Z_host    = Z_host_v.data();
-  work_host = work_host_v.data();
-
-  // Initialize cuBLAS
-  Cublas::set_pointer_mode_host();
-
-  // -------------------------------------------------------
-  // Compute largest eigenvalue
-  // -------------------------------------------------------
-
-  // Random number generator
-  curandGenerator_t randGen;
-  // Initialize random number generator
-  CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_PHILOX4_32_10));
-  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(randGen, 123456));
-  // Initialize initial Lanczos vector
-  CHECK_CURAND(curandGenerateNormalX(randGen, lanczosVecs_dev, n + n % 2, zero, one));
-  ValueType_ normQ1 = Cublas::nrm2(n, lanczosVecs_dev, 1);
-  Cublas::scal(n, 1 / normQ1, lanczosVecs_dev, 1);
-
-  // Estimate number of Lanczos iterations
-  //   See bounds in Kuczynski and Wozniakowski (1992).
- // const ValueType_ relError = 0.25; // Relative error - // const ValueType_ failProb = 1e-4; // Probability of failure - // maxIter_curr = log(n/pow(failProb,2))/(4*std::sqrt(relError)) + 1; - // maxIter_curr = min(maxIter_curr, restartIter); - - // Obtain tridiagonal matrix with Lanczos - *effIter = 0; - ValueType_ shift_val = 0.0; - ValueType_ *shift = &shift_val; - // maxIter_curr = min(maxIter, restartIter); - status = performLanczosIteration(A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter; - - // Apply Lanczos method until convergence - ValueType_ shiftLower = 1; - ValueType_ shiftUpper = -1; - while (*totalIter < maxIter && beta_host[*effIter - 1] > tol * shiftLower) { - // Determine number of restart steps - // Number of steps must be even due to Francis algorithm - IndexType_ iter_new = nEigVecs + 1; - if (restartIter - (maxIter - *totalIter) > nEigVecs + 1) - iter_new = restartIter - (maxIter - *totalIter); - if ((restartIter - iter_new) % 2) iter_new -= 1; - if (iter_new == *effIter) break; - - // Implicit restart of Lanczos method - status = lanczosRestart(n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - false); - if (status) WARNING("error in Lanczos implicit restart"); - *effIter = iter_new; - - // Check for convergence - if (beta_host[*effIter - 1] <= tol * fabs(shiftLower)) break; - - // Proceed with Lanczos method - // maxIter_curr = min(restartIter, maxIter-*totalIter+*effIter); - status = performLanczosIteration(A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); - if (status) WARNING("error in Lanczos iteration"); - *totalIter += *effIter - iter_new; - } - - // Warning if Lanczos has failed to converge - if (beta_host[*effIter - 1] > tol * fabs(shiftLower)) { - WARNING("implicitly restarted Lanczos failed to converge"); - } - for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; - } - // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(ValueType_)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(ValueType_)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, - work_host); - - // note: We need to pick the top nEigVecs eigenvalues - // but effItter can be larger than nEigVecs - // hence we add an offset for that case, because we want to access top nEigVecs eigenpairs in the - // matrix of size effIter. 
remember the array is sorted, so it is not needed for smallest
-  // eigenvalues case because the first ones are the smallest ones
-
-  IndexType_ top_eigenparis_idx_offset = *effIter - nEigVecs;
-
-  // Debug : print nEigVecs largest eigenvalues
-  // for (int i = top_eigenparis_idx_offset; i < *effIter; ++i)
-  //   std::cout <<*(work_host+(2*(*effIter)+i))<< " ";
-  // std::cout <<std::endl;
-
-  // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
-
-  // Copy results to device memory
-  //   skip smallest eigenvalues if needed
-  CHECK_CUDA(cudaMemcpy(eigVals_dev,
-                        work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-                        nEigVecs * sizeof(ValueType_),
-                        cudaMemcpyHostToDevice));
-
-  //   skip smallest eigenvectors if needed
-  CHECK_CUDA(cudaMemcpy(work_dev,
-                        Z_host + (top_eigenparis_idx_offset * (*effIter)),
-                        (*effIter) * nEigVecs * sizeof(ValueType_),
-                        cudaMemcpyHostToDevice));
-
-  // Convert eigenvectors from Lanczos basis to standard basis
-  Cublas::gemm(false,
-               false,
-               n,
-               nEigVecs,
-               *effIter,
-               &one,
-               lanczosVecs_dev,
-               n,
-               work_dev,
-               *effIter,
-               &zero,
-               eigVecs_dev,
-               n);
-
-  // Clean up and exit
-  CHECK_CURAND(curandDestroyGenerator(randGen));
-  return NVGRAPH_OK;
-}
-
-/// Compute largest eigenvectors of symmetric matrix
-/** Computes the most positive eigenvalues and the corresponding
- * eigenvectors. If matrix is positive definite or positive
- * semidefinite, the computed eigenvalues are largest in
- * magnitude.
- *
- * The largest eigenvalue is estimated by performing several
- * Lanczos iterations. An implicitly restarted Lanczos method is
- * then applied.
- *
- * CNMEM must be initialized before calling this function.
- *
- * @param A Matrix.
- * @param nEigVecs Number of eigenvectors to compute.
- * @param maxIter Maximum number of Lanczos steps.
- * @param restartIter Maximum size of Lanczos system before
- * performing an implicit restart. Should be at least 4.
- * @param tol Convergence tolerance. Lanczos iteration will
- * terminate when the residual norm is less than tol*theta, where
- * theta is an estimate for the largest unwanted eigenvalue
- * (i.e. the (nEigVecs+1)th largest eigenvalue).
- * @param reorthogonalize Whether to reorthogonalize Lanczos
- * vectors.
- * @param iter On exit, pointer to total number of Lanczos
- * iterations performed.
- * @param eigVals_dev (Output, device memory, nEigVecs entries)
- * Largest eigenvalues of matrix.
- * @param eigVecs_dev (Output, device memory, n*nEigVecs entries)
- * Eigenvectors corresponding to largest eigenvalues of
- * matrix. Vectors are stored as columns of a column-major matrix
- * with dimensions n x nEigVecs.
- * @return NVGRAPH error flag.
- */
-template <typename IndexType_, typename ValueType_>
-NVGRAPH_ERROR computeLargestEigenvectors(const Matrix<IndexType_, ValueType_> &A,
-                                         IndexType_ nEigVecs,
-                                         IndexType_ maxIter,
-                                         IndexType_ restartIter,
-                                         ValueType_ tol,
-                                         bool reorthogonalize,
-                                         IndexType_ &iter,
-                                         ValueType_ *__restrict__ eigVals_dev,
-                                         ValueType_ *__restrict__ eigVecs_dev)
-{
-  // CUDA stream
-  //   TODO: handle non-zero streams
-  cudaStream_t stream = 0;
-
-  // Matrix dimension
-  IndexType_ n = A.n;
-
-  // Check that parameters are valid
-  if (A.m != A.n) {
-    WARNING("invalid parameter (matrix is not square)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs < 1) {
-    WARNING("invalid parameter (nEigVecs<1)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (restartIter < 1) {
-    WARNING("invalid parameter (restartIter<4)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (tol < 0) {
-    WARNING("invalid parameter (tol<0)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (nEigVecs > n) {
-    WARNING("invalid parameters (nEigVecs>n)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-  if (maxIter < nEigVecs) {
-    WARNING("invalid parameters (maxIter<nEigVecs)");
-    return NVGRAPH_ERR_BAD_PARAMETERS;
-  }
-
-  // Allocate memory
-  std::vector<ValueType_> alpha_host_v(restartIter);
-  std::vector<ValueType_> beta_host_v(restartIter);
-
-  ValueType_ *alpha_host = alpha_host_v.data();
-  ValueType_ *beta_host  = beta_host_v.data();
-
-  Vector<ValueType_> lanczosVecs_dev(n * (restartIter + 1), stream);
-  Vector<ValueType_> work_dev((n + restartIter) * restartIter, stream);
-
-  // Perform Lanczos method
-  IndexType_ effIter;
-  NVGRAPH_ERROR status = computeLargestEigenvectors(&A,
-                                                    nEigVecs,
-                                                    maxIter,
-                                                    restartIter,
-                                                    tol,
-                                                    reorthogonalize,
-                                                    &effIter,
-                                                    &iter,
-                                                    alpha_host,
-                                                    beta_host,
-                                                    lanczosVecs_dev.raw(),
-                                                    work_dev.raw(),
-                                                    eigVals_dev,
-                                                    eigVecs_dev);
-
-  // Clean up and return
-  return status;
-}
-
-// =========================================================
-// Explicit instantiation
-// =========================================================
-
-template NVGRAPH_ERROR computeSmallestEigenvectors<int, float>(const Matrix<int, float> &A,
-                                                               int nEigVecs,
-                                                               int maxIter,
-                                                               int restartIter,
-                                                               float tol,
-                                                               bool reorthogonalize,
-                                                               int &iter,
-                                                               float *__restrict__ eigVals_dev,
-                                                               float *__restrict__ eigVecs_dev);
-template NVGRAPH_ERROR computeSmallestEigenvectors<int, double>(const Matrix<int, double> &A,
-                                                                int nEigVecs,
-                                                                int maxIter,
-                                                                int restartIter,
-                                                                double tol,
-                                                                bool reorthogonalize,
-                                                                int &iter,
-                                                                double *__restrict__ eigVals_dev,
-                                                                double *__restrict__ eigVecs_dev);
-
-template NVGRAPH_ERROR computeLargestEigenvectors<int, float>(const Matrix<int, float> &A,
-                                                              int nEigVecs,
-                                                              int maxIter,
-                                                              int restartIter,
-                                                              float tol,
-                                                              bool reorthogonalize,
-                                                              int &iter,
-                                                              float *__restrict__ eigVals_dev,
-                                                              float *__restrict__ eigVecs_dev);
-template NVGRAPH_ERROR computeLargestEigenvectors<int, double>(const Matrix<int, double> &A,
-                                                               int nEigVecs,
-                                                               int maxIter,
-                                                               int restartIter,
-                                                               double tol,
-                                                               bool reorthogonalize,
-                                                               int &iter,
-                                                               double *__restrict__ eigVals_dev,
-                                                               double *__restrict__ eigVecs_dev);
-
-}  // namespace nvgraph
diff --git a/cpp/src/nvgraph/modularity_maximization.cu b/cpp/src/nvgraph/modularity_maximization.cu
deleted file mode 100644
index bd90f3093aa..00000000000
--- a/cpp/src/nvgraph/modularity_maximization.cu
+++ /dev/null
@@ -1,436 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -//#ifdef NVGRAPH_PARTITION - -#include "include/modularity_maximization.hxx" - -#include -#include - -#include -#include -#include -#include -#include - -#include "include/debug_macros.h" -#include "include/kmeans.hxx" -#include "include/lanczos.hxx" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_vector.hxx" -#include "include/sm_utils.h" -#include "include/spectral_matrix.hxx" - -//#define COLLECT_TIME_STATISTICS 1 -//#undef COLLECT_TIME_STATISTICS - -#ifdef COLLECT_TIME_STATISTICS -#include -#include -#include -#include -#include "cuda_profiler_api.h" -#endif - -#ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ - struct timeval tv; - cudaDeviceSynchronize(); - gettimeofday(&tv, NULL); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; -} -#endif - -namespace nvgraph { - -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? 
obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) -{ - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); - - return cudaSuccess; -} - -// ========================================================= -// Spectral modularity_maximization -// ========================================================= - -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Cluster - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return NVGRAPH error flag. 
- */ -template -NVGRAPH_ERROR modularity_maximization( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nClusters, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ clusters, - weight_t *eigVals, - weight_t *eigVecs, - int &iters_lanczos, - int &iters_kmeans) -{ - cudaStream_t stream = 0; - const weight_t zero{0.0}; - const weight_t one{1.0}; - - edge_t i; - edge_t n = graph.number_of_vertices; - - // k-means residual - weight_t residual_kmeans; - - // Compute eigenvectors of Modularity Matrix - // Initialize Modularity Matrix - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - ModularityMatrix B(A, graph.number_of_edges); - - // Compute smallest eigenvalues and eigenvectors - CHECK_NVGRAPH(computeLargestEigenvectors(B, - nEigVecs, - maxIter_lanczos, - restartIter_lanczos, - tol_lanczos, - false, - iters_lanczos, - eigVals, - eigVecs)); - - // eigVals.dump(0, nEigVecs); - // eigVecs.dump(0, nEigVecs); - // eigVecs.dump(n, nEigVecs); - // eigVecs.dump(2*n, nEigVecs); - // Whiten eigenvector matrix - for (i = 0; i < nEigVecs; ++i) { - weight_t mean, std; - mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - cudaCheckError(); - mean /= n; - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - cudaCheckError(); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - Vector work(nEigVecs * n, stream); - Cublas::set_pointer_mode_host(); - Cublas::geam(true, - false, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t *)NULL, - nEigVecs, - work.raw(), - nEigVecs); - CHECK_CUDA(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); - } - - // WARNING: notice that at this point the matrix has already been transposed, so we are scaling - // columns - scale_obs(nEigVecs, n, eigVecs); - cudaCheckError(); - - // eigVecs.dump(0, nEigVecs*n); - // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nClusters, - tol_kmeans, - maxIter_kmeans, - eigVecs, - clusters, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; -} -//=================================================== -// Analysis of graph partition -// ========================================================= - -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(ValueType_)1.0 : (ValueType_)0.0;
-  }
-};
-}  // namespace
-
-/// Compute modularity
-/** This function determines the modularity based on a graph and cluster assignments
- * @param G Weighted graph in CSR format
- * @param nClusters Number of clusters.
- * @param parts (Input, device memory, n entries) Cluster assignments.
- * @param modularity On exit, modularity
- */
-template <typename vertex_t, typename edge_t, typename weight_t>
-NVGRAPH_ERROR analyzeModularity(
-  cugraph::experimental::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
-  vertex_t nClusters,
-  const vertex_t *__restrict__ parts,
-  weight_t &modularity)
-{
-  cudaStream_t stream = 0;
-  edge_t i;
-  edge_t n = graph.number_of_vertices;
-  weight_t partModularity, partSize;
-
-  // Device memory
-  Vector<weight_t> part_i(n, stream);
-  Vector<weight_t> Bx(n, stream);
-
-  // Initialize cuBLAS
-  Cublas::set_pointer_mode_host();
-
-  // Initialize Modularity
-  CsrMatrix<vertex_t, weight_t> A(false,
-                                  false,
-                                  graph.number_of_vertices,
-                                  graph.number_of_vertices,
-                                  graph.number_of_edges,
-                                  0,
-                                  graph.edge_data,
-                                  graph.offsets,
-                                  graph.indices);
-  ModularityMatrix<vertex_t, weight_t> B(A, graph.number_of_edges);
-
-  // Initialize output
-  modularity = 0;
-
-  // Iterate through partitions
-  for (i = 0; i < nClusters; ++i) {
-    // Construct indicator vector for ith partition
-    thrust::for_each(
-      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts),
-                                                   thrust::device_pointer_cast(part_i.raw()))),
-      thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n),
-                                                   thrust::device_pointer_cast(part_i.raw() + n))),
-      equal_to_i_op<vertex_t, weight_t>(i));
-    cudaCheckError();
-
-    // Compute size of ith partition
-    Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize);
-    partSize = round(partSize);
-    if (partSize < 0.5) {
-      WARNING("empty partition");
-      continue;
-    }
-
-    // Compute modularity
-    B.mv(1, part_i.raw(), 0, Bx.raw());
-    Cublas::dot(n, Bx.raw(), 1, part_i.raw(), 1, &partModularity);
-
-    // Record results
-    modularity += partModularity;
-    // std::cout<< "partModularity " <<partModularity<< std::endl;
-  }
-
-  // divide by 2m
-  modularity = modularity / B.getEdgeSum();
-
-  // Clean up and return
-  return NVGRAPH_OK;
-}
-
-// =========================================================
-// Explicit instantiation
-// =========================================================
-template NVGRAPH_ERROR modularity_maximization<int, int, float>(
-  cugraph::experimental::GraphCSRView<int, int, float> const &graph,
-  int nClusters,
-  int nEigVecs,
-  int maxIter_lanczos,
-  int restartIter_lanczos,
-  float tol_lanczos,
-  int maxIter_kmeans,
-  float tol_kmeans,
-  int *__restrict__ parts,
-  float *eigVals,
-  float *eigVecs,
-  int &iters_lanczos,
-  int &iters_kmeans);
-template NVGRAPH_ERROR modularity_maximization<int, int, double>(
-  cugraph::experimental::GraphCSRView<int, int, double> const &graph,
-  int nClusters,
-  int nEigVecs,
-  int maxIter_lanczos,
-  int restartIter_lanczos,
-  double tol_lanczos,
-  int maxIter_kmeans,
-  double tol_kmeans,
-  int *__restrict__ parts,
-  double *eigVals,
-  double *eigVecs,
-  int &iters_lanczos,
-  int &iters_kmeans);
-template NVGRAPH_ERROR analyzeModularity<int, int, float>(
-  cugraph::experimental::GraphCSRView<int, int, float> const &graph,
-  int nClusters,
-  const int *__restrict__ parts,
-  float &modularity);
-template NVGRAPH_ERROR analyzeModularity<int, int, double>(
-  cugraph::experimental::GraphCSRView<int, int, double> const &graph,
-  int nClusters,
-  const int *__restrict__ parts,
-  double &modularity);
-
-}  // namespace nvgraph
-//#endif //NVGRAPH_PARTITION
diff --git a/cpp/src/nvgraph/nvgraph_cublas.cpp b/cpp/src/nvgraph/nvgraph_cublas.cpp
deleted file mode 100644
index ceb3ad25d6b..00000000000
--- a/cpp/src/nvgraph/nvgraph_cublas.cpp
+++ /dev/null
@@ -1,569 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "include/nvgraph_cublas.hxx" - -namespace nvgraph { - -cublasHandle_t Cublas::m_handle = 0; - -namespace { -cublasStatus_t cublas_axpy( - cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy) -{ - return cublasSaxpy(handle, n, alpha, x, incx, y, incy); -} - -cublasStatus_t cublas_axpy( - cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy) -{ - return cublasDaxpy(handle, n, alpha, x, incx, y, incy); -} - -cublasStatus_t cublas_copy( - cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) -{ - return cublasScopy(handle, n, x, incx, y, incy); -} - -cublasStatus_t cublas_copy( - cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) -{ - return cublasDcopy(handle, n, x, incx, y, incy); -} - -cublasStatus_t cublas_dot( - cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result) -{ - return cublasSdot(handle, n, x, incx, y, incy, result); -} - -cublasStatus_t cublas_dot(cublasHandle_t handle, - int n, - const double* x, - int incx, - const double* y, - int incy, - double* result) -{ - return cublasDdot(handle, n, x, incx, y, incy, result); -} - -cublasStatus_t cublas_trsv_v2(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float* A, - int lda, - float* x, - int incx) -{ - return cublasStrsv(handle, uplo, trans, diag, n, A, lda, x, incx); -} -cublasStatus_t cublas_trsv_v2(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double* A, - int lda, - double* x, - int incx) -{ - return cublasDtrsv(handle, uplo, trans, diag, n, A, lda, x, incx); -} - -cublasStatus_t cublas_gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc) -{ - return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); -} - -cublasStatus_t cublas_gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double* alpha, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc) -{ - return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); -} - -cublasStatus_t cublas_gemv(cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const float* alpha, - const float* A, - int lda, - const float* x, - int incx, - const float* beta, - float* y, - int incy) -{ - return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); -} - -cublasStatus_t cublas_gemv(cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const double* alpha, - const double* A, - int lda, - const double* x, - int incx, - const double* beta, - double* y, - int incy) -{ - return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); 
-} - -cublasStatus_t cublas_ger(cublasHandle_t handle, - int m, - int n, - const float* alpha, - const float* x, - int incx, - const float* y, - int incy, - float* A, - int lda) -{ - return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); -} - -cublasStatus_t cublas_ger(cublasHandle_t handle, - int m, - int n, - const double* alpha, - const double* x, - int incx, - const double* y, - int incy, - double* A, - int lda) -{ - return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); -} - -cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const float* x, int incx, float* result) -{ - return cublasSnrm2(handle, n, x, incx, result); -} - -cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const double* x, int incx, double* result) -{ - return cublasDnrm2(handle, n, x, incx, result); -} - -cublasStatus_t cublas_scal(cublasHandle_t handle, int n, const float* alpha, float* x, int incx) -{ - return cublasSscal(handle, n, alpha, x, incx); -} - -cublasStatus_t cublas_scal(cublasHandle_t handle, int n, const double* alpha, double* x, int incx) -{ - return cublasDscal(handle, n, alpha, x, incx); -} - -cublasStatus_t cublas_geam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const float* alpha, - const float* A, - int lda, - const float* beta, - const float* B, - int ldb, - float* C, - int ldc) -{ - return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); -} - -cublasStatus_t cublas_geam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const double* alpha, - const double* A, - int lda, - const double* beta, - const double* B, - int ldb, - double* C, - int ldc) -{ - return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); -} - -} // anonymous namespace. 
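// The anonymous-namespace overloads above exist so that a single templated
// wrapper can select the correct S/D cuBLAS entry point through ordinary
// overload resolution. A minimal self-contained sketch of the same dispatch
// pattern (the names `scale_impl`/`scale` are illustrative and not part of
// nvgraph):
#include <iostream>

namespace {
float scale_impl(float x) { return 2.0f * x; }   // stands in for a cublasS* call
double scale_impl(double x) { return 2.0 * x; }  // stands in for a cublasD* call

// One templated front end; the compiler picks the float or double overload.
template <typename T>
T scale(T x)
{
  return scale_impl(x);
}
}  // namespace

int main()
{
  std::cout << scale(1.5f) << " " << scale(2.5) << std::endl;  // prints "3 5"
  return 0;
}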
- -void Cublas::set_pointer_mode_device() -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); -} - -void Cublas::set_pointer_mode_host() -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); -} - -template -void Cublas::axpy(int n, T alpha, const T* x, int incx, T* y, int incy) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_axpy(handle, n, &alpha, x, incx, y, incy)); -} - -template -void Cublas::copy(int n, const T* x, int incx, T* y, int incy) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_copy(handle, n, x, incx, y, incy)); -} - -template -void Cublas::dot(int n, const T* x, int incx, const T* y, int incy, T* result) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_dot(handle, n, x, incx, y, incy, result)); -} - -template -T Cublas::nrm2(int n, const T* x, int incx) -{ - Cublas::get_handle(); - T result; - Cublas::nrm2(n, x, incx, &result); - return result; -} - -template -void Cublas::nrm2(int n, const T* x, int incx, T* result) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_nrm2(handle, n, x, incx, result)); -} - -template -void Cublas::scal(int n, T alpha, T* x, int incx) -{ - Cublas::scal(n, &alpha, x, incx); -} - -template -void Cublas::scal(int n, T* alpha, T* x, int incx) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_scal(handle, n, alpha, x, incx)); -} - -template -void Cublas::gemv(bool transposed, - int m, - int n, - const T* alpha, - const T* A, - int lda, - const T* x, - int incx, - const T* beta, - T* y, - int incy) -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasOperation_t trans = transposed ? CUBLAS_OP_T : CUBLAS_OP_N; - CHECK_CUBLAS(cublas_gemv(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy)); -} - -template -void Cublas::gemv_ext(bool transposed, - const int m, - const int n, - const T* alpha, - const T* A, - const int lda, - const T* x, - const int incx, - const T* beta, - T* y, - const int incy, - const int offsetx, - const int offsety, - const int offseta) -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasOperation_t trans = transposed ? CUBLAS_OP_T : CUBLAS_OP_N; - CHECK_CUBLAS(cublas_gemv( - handle, trans, m, n, alpha, A + offseta, lda, x + offsetx, incx, beta, y + offsety, incy)); -} - -template -void Cublas::trsv_v2(cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const T* A, - int lda, - T* x, - int incx, - int offseta) -{ - cublasHandle_t handle = Cublas::get_handle(); - - CHECK_CUBLAS(cublas_trsv_v2(handle, uplo, trans, diag, n, A + offseta, lda, x, incx)); -} - -template -void Cublas::ger( - int m, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda) -{ - cublasHandle_t handle = Cublas::get_handle(); - CHECK_CUBLAS(cublas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda)); -} - -template -void Cublas::gemm(bool transa, - bool transb, - int m, - int n, - int k, - const T* alpha, - const T* A, - int lda, - const T* B, - int ldb, - const T* beta, - T* C, - int ldc) -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasOperation_t cublasTransA = transa ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cublasTransB = transb ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CHECK_CUBLAS( - cublas_gemm(handle, cublasTransA, cublasTransB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)); -} - -template -void Cublas::geam(bool transa, - bool transb, - int m, - int n, - const T* alpha, - const T* A, - int lda, - const T* beta, - const T* B, - int ldb, - T* C, - int ldc) -{ - cublasHandle_t handle = Cublas::get_handle(); - cublasOperation_t cublasTransA = transa ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cublasTransB = transb ? CUBLAS_OP_T : CUBLAS_OP_N; - CHECK_CUBLAS( - cublas_geam(handle, cublasTransA, cublasTransB, m, n, alpha, A, lda, beta, B, ldb, C, ldc)); -} - -template void Cublas::axpy(int n, float alpha, const float* x, int incx, float* y, int incy); -template void Cublas::axpy(int n, double alpha, const double* x, int incx, double* y, int incy); - -template void Cublas::copy(int n, const float* x, int incx, float* y, int incy); -template void Cublas::copy(int n, const double* x, int incx, double* y, int incy); - -template void Cublas::dot(int n, const float* x, int incx, const float* y, int incy, float* result); -template void Cublas::dot( - int n, const double* x, int incx, const double* y, int incy, double* result); - -template void Cublas::gemv(bool transposed, - int m, - int n, - const float* alpha, - const float* A, - int lda, - const float* x, - int incx, - const float* beta, - float* y, - int incy); -template void Cublas::gemv(bool transposed, - int m, - int n, - const double* alpha, - const double* A, - int lda, - const double* x, - int incx, - const double* beta, - double* y, - int incy); - -template void Cublas::ger(int m, - int n, - const float* alpha, - const float* x, - int incx, - const float* y, - int incy, - float* A, - int lda); -template void Cublas::ger(int m, - int n, - const double* alpha, - const double* x, - int incx, - const double* y, - int incy, - double* A, - int lda); - -template void Cublas::gemv_ext(bool transposed, - const int m, - const int n, - const float* alpha, - const float* A, - const int lda, - const float* x, - const int incx, - const float* beta, - float* y, - const int incy, - const int offsetx, - const int offsety, - const int offseta); -template void Cublas::gemv_ext(bool transposed, - const int m, - const int n, - const double* alpha, - const double* A, - const int lda, - const double* x, - const int incx, - const double* beta, - double* y, - const int incy, - const int offsetx, - const int offsety, - const int offseta); - -template void Cublas::trsv_v2(cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float* A, - int lda, - float* x, - int incx, - int offseta); -template void Cublas::trsv_v2(cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double* A, - int lda, - double* x, - int incx, - int offseta); - -template double Cublas::nrm2(int n, const double* x, int incx); -template float Cublas::nrm2(int n, const float* x, int incx); - -template void Cublas::scal(int n, float alpha, float* x, int incx); -template void Cublas::scal(int n, double alpha, double* x, int incx); - -template void Cublas::gemm(bool transa, - bool transb, - int m, - int n, - int k, - const float* alpha, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc); -template void Cublas::gemm(bool transa, - bool transb, - int m, - int n, - int k, - const double* alpha, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc); - 
-template void Cublas::geam(bool transa, - bool transb, - int m, - int n, - const float* alpha, - const float* A, - int lda, - const float* beta, - const float* B, - int ldb, - float* C, - int ldc); -template void Cublas::geam(bool transa, - bool transb, - int m, - int n, - const double* alpha, - const double* A, - int lda, - const double* beta, - const double* B, - int ldb, - double* C, - int ldc); - -} // end namespace nvgraph diff --git a/cpp/src/nvgraph/nvgraph_cusparse.cpp b/cpp/src/nvgraph/nvgraph_cusparse.cpp deleted file mode 100644 index 51a06968455..00000000000 --- a/cpp/src/nvgraph/nvgraph_cusparse.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "include/nvgraph_cusparse.hxx" - -namespace nvgraph { -cusparseHandle_t Cusparse::m_handle = 0; - -namespace { -cusparseStatus_t cusparse_csrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const float* beta, - float* y) -{ - return cusparseScsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); -} - -cusparseStatus_t cusparse_csrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const double* beta, - double* y) -{ - return cusparseDcsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); -} - -cusparseStatus_t cusparse_csrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const int ldx, - const float* beta, - float* y, - const int ldy) -{ - return cusparseScsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); -} - -cusparseStatus_t cusparse_csrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const int ldx, - const double* beta, - double* y, - const int ldy) -{ - return cusparseDcsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); -} - -} // end anonymous namespace. 
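(Aside: the csrmv/csrmm overloads in the anonymous namespace above target the legacy cuSPARSE API; cusparseScsrmv and cusparseDcsrmv were deprecated and then removed in CUDA 11 in favor of the generic SpMV API. A hedged sketch of what a generic-API equivalent might look like is below — `spmv_example` is a hypothetical helper, only the float case is shown, error checking is elided, and the legacy CUSPARSE_MATRIX_TYPE_SYMMETRIC handling has no direct generic-API counterpart; `CUSPARSE_SPMV_ALG_DEFAULT` assumes a recent toolkit, older ones spell it `CUSPARSE_MV_ALG_DEFAULT`.)

#include <cuda_runtime.h>
#include <cusparse.h>

cusparseStatus_t spmv_example(cusparseHandle_t handle, int m, int n, int nnz,
                              const float* alpha, float* csrVal, int* csrRowPtr,
                              int* csrColInd, float* x, const float* beta, float* y)
{
  cusparseSpMatDescr_t matA;
  cusparseDnVecDescr_t vecX, vecY;

  // Describe the CSR matrix and the dense input/output vectors.
  cusparseCreateCsr(&matA, m, n, nnz, csrRowPtr, csrColInd, csrVal,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  cusparseCreateDnVec(&vecX, n, x, CUDA_R_32F);
  cusparseCreateDnVec(&vecY, m, y, CUDA_R_32F);

  // Unlike the legacy API, the generic API needs a caller-allocated workspace.
  size_t bufferSize = 0;
  void* dBuffer = nullptr;
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, matA,
                          vecX, beta, vecY, CUDA_R_32F,
                          CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
  cudaMalloc(&dBuffer, bufferSize);

  // y = alpha * A * x + beta * y
  cusparseStatus_t status =
    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, matA, vecX,
                 beta, vecY, CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);

  cudaFree(dBuffer);
  cusparseDestroyDnVec(vecX);
  cusparseDestroyDnVec(vecY);
  cusparseDestroySpMat(matA);
  return status;
}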
- -// Set pointer mode -void Cusparse::set_pointer_mode_device() -{ - cusparseHandle_t handle = Cusparse::get_handle(); - cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE); -} -void Cusparse::set_pointer_mode_host() -{ - cusparseHandle_t handle = Cusparse::get_handle(); - cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST); -} - -template -void Cusparse::csrmv(const bool transposed, - const bool sym, - const int m, - const int n, - const int nnz, - const ValueType_* alpha, - const ValueType_* csrVal, - const IndexType_* csrRowPtr, - const IndexType_* csrColInd, - const ValueType_* x, - const ValueType_* beta, - ValueType_* y) -{ - cusparseHandle_t handle = Cusparse::get_handle(); - cusparseOperation_t trans = - transposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseMatDescr_t descr = 0; - CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); // we should move that somewhere else - if (sym) { - CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); - } else { - CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - } - CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CHECK_CUSPARSE(cusparse_csrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y)); - CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); // we should move that somewhere else -} - -template void Cusparse::csrmv(const bool transposed, - const bool sym, - const int m, - const int n, - const int nnz, - const double* alpha, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const double* beta, - double* y); -template void Cusparse::csrmv(const bool transposed, - const bool sym, - const int m, - const int n, - const int nnz, - const float* alpha, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const float* beta, - float* y); -/* -template void Cusparse::csrmv( const bool transposed, - const bool sym, - const double* alpha, - const ValuedCsrGraph& G, - const Vector& x, - const double* beta, - Vector& y - ); - - -template void Cusparse::csrmv( const bool transposed, - const bool sym, - const float* alpha, - const ValuedCsrGraph& G, - const Vector& x, - const float* beta, - Vector& y - ); -*/ - -template -void Cusparse::csrmm(const bool transposed, - const bool sym, - const int m, - const int n, - const int k, - const int nnz, - const ValueType_* alpha, - const ValueType_* csrVal, - const IndexType_* csrRowPtr, - const IndexType_* csrColInd, - const ValueType_* x, - const int ldx, - const ValueType_* beta, - ValueType_* y, - const int ldy) -{ - cusparseHandle_t handle = Cusparse::get_handle(); - cusparseOperation_t trans = - transposed ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseMatDescr_t descr = 0; - CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); // we should move that somewhere else - if (sym) { - CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); - } else { - CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - } - CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CHECK_CUSPARSE(cusparse_csrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy)); - CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); // we should move that somewhere else -} - -template void Cusparse::csrmm(const bool transposed, - const bool sym, - const int m, - const int n, - const int k, - const int nnz, - const double* alpha, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const int ldx, - const double* beta, - double* y, - const int ldy); - -template void Cusparse::csrmm(const bool transposed, - const bool sym, - const int m, - const int n, - const int k, - const int nnz, - const float* alpha, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const int ldx, - const float* beta, - float* y, - const int ldy); - -// template -void Cusparse::csr2coo(const int n, const int nnz, const int* csrRowPtr, int* cooRowInd) -{ - cusparseHandle_t handle = Cusparse::get_handle(); - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - CHECK_CUSPARSE(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, idxBase)); -} - -} // end namespace nvgraph diff --git a/cpp/src/nvgraph/nvgraph_lapack.cu b/cpp/src/nvgraph/nvgraph_lapack.cu deleted file mode 100644 index a3f1786a1cd..00000000000 --- a/cpp/src/nvgraph/nvgraph_lapack.cu +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "include/nvgraph_lapack.hxx" - -//#include -//#include - -//#define NVGRAPH_USE_LAPACK 1 - -namespace nvgraph { - -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status << " had an illegal value."; \ - FatalError(ss.str(), NVGRAPH_ERR_UNKNOWN); \ - } else if (status > 0) \ - FatalError("Lapack error: internal error.", NVGRAPH_ERR_UNKNOWN); \ - } - -template -void Lapack::check_lapack_enabled() -{ -#ifndef NVGRAPH_USE_LAPACK - FatalError("Error: LAPACK not enabled.", NVGRAPH_ERR_UNKNOWN); -#endif -} - -typedef enum { - CUSOLVER_STATUS_SUCCESS = 0, - CUSOLVER_STATUS_NOT_INITIALIZED = 1, - CUSOLVER_STATUS_ALLOC_FAILED = 2, - CUSOLVER_STATUS_INVALID_VALUE = 3, - CUSOLVER_STATUS_ARCH_MISMATCH = 4, - CUSOLVER_STATUS_MAPPING_ERROR = 5, - CUSOLVER_STATUS_EXECUTION_FAILED = 6, - CUSOLVER_STATUS_INTERNAL_ERROR = 7, - CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, - CUSOLVER_STATUS_NOT_SUPPORTED = 9, - CUSOLVER_STATUS_ZERO_PIVOT = 10, - CUSOLVER_STATUS_INVALID_LICENSE = 11 -} cusolverStatus_t; - -typedef enum { CUBLAS_OP_N = 0, CUBLAS_OP_T = 1, CUBLAS_OP_C = 2 } cublasOperation_t; - -namespace { -// XGEMM -// extern "C" -// void sgemm_(const char *transa, const char *transb, -// const int *m, const int *n, const int *k, -// const float *alpha, const float *a, const int *lda, -// const float *b, const int *ldb, -// const float *beta, float *c, const int *ldc); -// extern "C" -// void dgemm_(const char *transa, const char *transb, -// const int *m, const int *n, const int *k, -// const double *alpha, const double *a, const int *lda, -// const double *b, const int *ldb, -// const double *beta, double *c, const int *ldc); - -extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, - float *C, - int ldc); - -void lapack_gemm(const char transa, - const char transb, - int m, - int n, - int k, - float alpha, - const float *a, - int lda, - const float *b, - int ldb, - float beta, - float *c, - int ldc) -{ - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost( - cublas_transa, cublas_transb, m, n, k, &alpha, (float *)a, lda, (float *)b, ldb, &beta, c, ldc); -} - -extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, - double *C, - int ldc); - -void lapack_gemm(const signed char transa, - const signed char transb, - int m, - int n, - int k, - double alpha, - const double *a, - int lda, - const double *b, - int ldb, - double beta, - double *c, - int ldc) -{ - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? 
CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, - cublas_transb, - m, - n, - k, - &alpha, - (double *)a, - lda, - (double *)b, - ldb, - &beta, - c, - ldc); -} - -// XSTERF -// extern "C" -// void ssterf_(const int *n, float *d, float *e, int *info); -// -// extern "C" -// void dsterf_(const int *n, double *d, double *e, int *info); -// - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, int *info); - -void lapack_sterf(int n, float *d, float *e, int *info) { cusolverDnSsterfHost(n, d, e, info); } - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, int *info); - -void lapack_sterf(int n, double *d, double *e, int *info) { cusolverDnDsterfHost(n, d, e, info); } - -// XSTEQR -// extern "C" -// void ssteqr_(const char *compz, const int *n, float *d, float *e, -// float *z, const int *ldz, float *work, int * info); -// extern "C" -// void dsteqr_(const char *compz, const int *n, double *d, double *e, -// double *z, const int *ldz, double *work, int *info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost( - const signed char *compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info); - -void lapack_steqr( - const signed char compz, int n, float *d, float *e, float *z, int ldz, float *work, int *info) -{ - cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); -} - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, - int n, - double *d, - double *e, - double *z, - int ldz, - double *work, - int *info); - -void lapack_steqr( - const signed char compz, int n, double *d, double *e, double *z, int ldz, double *work, int *info) -{ - cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); -} - -#ifdef NVGRAPH_USE_LAPACK - -extern "C" void sgeqrf_( - int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); -extern "C" void dgeqrf_( - int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); -// extern "C" -// void cgeqrf_(int *m, int *n, std::complex *a, int *lda, std::complex *tau, -// std::complex *work, int *lwork, int *info); extern "C" void zgeqrf_(int *m, int *n, -// std::complex *a, int *lda, std::complex *tau, std::complex *work, int -// *lwork, int *info); - -void lapack_geqrf(int m, int n, float *a, int lda, float *tau, float *work, int *lwork, int *info) -{ - sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); -} -void lapack_geqrf( - int m, int n, double *a, int lda, double *tau, double *work, int *lwork, int *info) -{ - dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); -} -// void lapack_geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, -// std::complex *work, int *lwork, int *info) -//{ -// cgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); -//} -// void lapack_geqrf(int m, int n, std::complex *a, int lda, std::complex *tau, -// std::complex *work, int *lwork, int *info) -//{ -// zgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); -//} - -extern "C" void sormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - float *a, - int *lda, - const float *tau, - float *c, - int *ldc, - float *work, - int *lwork, - int *info); -extern "C" void dormqr_(char *side, - char *trans, - int *m, - int *n, - int *k, - double *a, - int *lda, - const double *tau, - double *c, - int *ldc, - double *work, - int *lwork, - int *info); -// extern "C" -// void cunmqr_ (char* side, char* trans, int *m, int *n, int *k, std::complex *a, int *lda, -// const std::complex *tau, std::complex* c, int *ldc, std::complex 
*work, int -// *lwork, int *info); extern "C" void zunmqr_(char* side, char* trans, int *m, int *n, int *k, -// std::complex *a, int *lda, const std::complex *tau, std::complex* c, int -// *ldc, std::complex *work, int *lwork, int *info); - -void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - float *a, - int lda, - float *tau, - float *c, - int ldc, - float *work, - int *lwork, - int *info) -{ - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); -} -void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - double *a, - int lda, - double *tau, - double *c, - int ldc, - double *work, - int *lwork, - int *info) -{ - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); -} -// void lapack_unmqr(char side, char trans, int m, int n, int k, std::complex *a, int lda, -// std::complex *tau, std::complex* c, int ldc, std::complex *work, int *lwork, -// int *info) -//{ -// cunmqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); -//} -// void lapack_unmqr(char side, char trans, int m, int n, int k, std::complex *a, int lda, -// std::complex *tau, std::complex* c, int ldc, std::complex *work, int -// *lwork, int *info) -//{ -// zunmqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); -//} - -// extern "C" -// void sorgqr_ ( int* m, int* n, int* k, float* a, int* lda, const float* tau, float* work, int* -// lwork, int *info ); extern "C" void dorgqr_ ( int* m, int* n, int* k, double* a, int* lda, const -// double* tau, double* work, int* lwork, int *info ); -// -// void lapack_orgqr( int m, int n, int k, float* a, int lda, const float* tau, float* work, int -// *lwork, int *info) -// { -// sorgqr_(&m, &n, &k, a, &lda, tau, work, lwork, info); -// } -// void lapack_orgqr( int m, int n, int k, double* a, int lda, const double* tau, double* work, int* -// lwork, int *info ) -// { -// dorgqr_(&m, &n, &k, a, &lda, tau, work, lwork, info); -// } - -// int lapack_hseqr_dispatch(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, -// double *h, int* ldh, double *wr, double *wi, double *z, -// int*ldz, double *work, int *lwork, int *info) -//{ -// return dhseqr_(jobvl, jobvr, n, ilo, ihi, h, ldh, wr, wi, z, ldz, work, lwork, info); -//} -// -// int lapack_hseqr_dispatch(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, -// float *h, int* ldh, float *wr, float *wi, float *z, -// int*ldz, float *work, int *lwork, int *info) -//{ -// return shseqr_(jobvl, jobvr, n, ilo, ihi, h, ldh, wr, wi, z, ldz, work, lwork, info); -//} - -// XGEEV -extern "C" int dgeev_(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - int *ldvr, - double *work, - int *lwork, - int *info); - -extern "C" int sgeev_(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int *lwork, - int *info); - -// extern "C" -// int dhseqr_(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, -// double *h, int* ldh, double *wr, double *wi, double *z, -// int*ldz, double *work, int *lwork, int *info); -// extern "C" -// int shseqr_(char *jobvl, char *jobvr, int* n, int*ilo, int*ihi, -// float *h, int* ldh, float *wr, float *wi, float *z, -// int*ldz, float *work, int *lwork, int *info); -// -int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - double *a, - int *lda, - double *wr, - double *wi, - double *vl, - int *ldvl, - double *vr, - 
int *ldvr, - double *work, - int *lwork, - int *info) -{ - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); -} - -int lapack_geev_dispatch(char *jobvl, - char *jobvr, - int *n, - float *a, - int *lda, - float *wr, - float *wi, - float *vl, - int *ldvl, - float *vr, - int *ldvr, - float *work, - int *lwork, - int *info) -{ - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); -} - -// real eigenvalues -template -void lapack_geev(T *A, T *eigenvalues, int dim, int lda) -{ - char job = 'N'; - std::vector WI(dim); - int ldv = 1; - T *vl = 0; - int work_size = 6 * dim; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&job, - &job, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldv, - vl, - &ldv, - work.data(), - &work_size, - &info); - lapackCheckError(info); -} -// real eigenpairs -template -void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) -{ - char jobvl = 'N'; - char jobvr = 'V'; - std::vector WI(dim); - int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldvl, - eigenvectors, - &ldvr, - work.data(), - &work_size, - &info); - lapackCheckError(info); -} -// complex eigenpairs -template -void lapack_geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) -{ - char jobvl = 'N'; - char jobvr = 'V'; - int work_size = 8 * dim; - int ldvl = 1; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues_r, - eigenvalues_i, - 0, - &ldvl, - eigenvectors_r, - &ldvr, - work.data(), - &work_size, - &info); - lapackCheckError(info); -} - -// template -// void lapack_hseqr(T* Q, T* H, T* eigenvalues, int dim, int ldh, int ldq) -//{ -// char job = 'S'; // S compute eigenvalues and the Schur form T. On entry, the upper Hessenberg -// matrix H. -// // On exit H contains the upper quasi-triangular matrix T from the Schur -// decomposition -// char jobvr = 'V'; //Take Q on entry, and the product Q*Z is returned. -// //ILO and IHI are normally set by a previous call to DGEBAL, Otherwise ILO and IHI should be -// set to 1 and N int ilo = 1; int ihi = dim; T* WI = new T[dim]; int ldv = 1; T* vl = 0; int -// work_size = 11 * dim; //LWORK as large as 11*N may be required for optimal performance. It is -// CPU memory and the matrix is assumed to be small T* work = new T[work_size]; int info; -// lapack_hseqr_dispatch(&job, &jobvr, &dim, &ilo, &ihi, H, &ldh, eigenvalues, WI, Q, &ldq, work, -// &work_size, &info); lapackCheckError(info); delete [] WI; delete [] work; -//} - -#endif - -} // end anonymous namespace - -template -void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) -{ - // check_lapack_enabled(); - //#ifdef NVGRAPH_USE_LAPACK - const char transA_char = transa ? 'T' : 'N'; - const char transB_char = transb ? 
'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - //#endif -} - -template -void Lapack::sterf(int n, T *d, T *e) -{ - // check_lapack_enabled(); - //#ifdef NVGRAPH_USE_LAPACK - int info; - lapack_sterf(n, d, e, &info); - lapackCheckError(info); - //#endif -} - -template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) -{ - // check_lapack_enabled(); - //#ifdef NVGRAPH_USE_LAPACK - int info; - lapack_steqr(compz, n, d, e, z, ldz, work, &info); - lapackCheckError(info); - //#endif -} - -template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork) -{ - check_lapack_enabled(); -#ifdef NVGRAPH_USE_LAPACK - int info; - lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); - lapackCheckError(info); -#endif -} -template -void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T *a, - int lda, - T *tau, - T *c, - int ldc, - T *work, - int *lwork) -{ - check_lapack_enabled(); -#ifdef NVGRAPH_USE_LAPACK - char side = right_side ? 'R' : 'L'; - char trans = transq ? 'T' : 'N'; - int info; - lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); - lapackCheckError(info); -#endif -} - -// template -// void Lapack< T >::unmqr(bool right_side, bool transq, int m, int n, int k, T *a, int lda, T *tau, -// T *c, int ldc, T *work, int *lwork) -//{ -// check_lapack_enabled(); -// #ifdef NVGRAPH_USE_LAPACK -// char side = right_side ? 'R' : 'L'; -// char trans = transq ? 'T' : 'N'; -// int info; -// lapack_unmqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); -// lapackCheckError(info); -// #endif -//} - -// template -// void Lapack< T >::orgqr( int m, int n, int k, T* a, int lda, const T* tau, T* work, int* lwork) -//{ -// check_lapack_enabled(); -// #ifdef NVGRAPH_USE_LAPACK -// int info; -// lapack_orgqr(m, n, k, a, lda, tau, work, lwork, &info); -// lapackCheckError(info); -// #endif -//} -// template -// void Lapack< T >::qrf(int n, int k, T *H, T *C, T *Q, T *R) -//{ -// check_lapack_enabled(); -// #ifdef NVGRAPH_USE_LAPACK -// // int m = n, k = n, lda=n, lwork=2*n, info; -// // lapack_geqrf(m, n, H, lda, C, work, lwork, &info); -// // lapackCheckError(info); -// // lapack_ormqr(m, n, k, H, lda, tau, c, ldc, work, lwork, &info); -// // lapackCheckError(info); -// #endif -//} - -// real eigenvalues -template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) -{ - check_lapack_enabled(); -#ifdef NVGRAPH_USE_LAPACK - lapack_geev(A, eigenvalues, dim, lda); -#endif -} -// real eigenpairs -template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr) -{ - check_lapack_enabled(); -#ifdef NVGRAPH_USE_LAPACK - lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); -#endif -} -// complex eigenpairs -template -void Lapack::geev(T *A, - T *eigenvalues_r, - T *eigenvalues_i, - T *eigenvectors_r, - T *eigenvectors_i, - int dim, - int lda, - int ldvr) -{ - check_lapack_enabled(); -#ifdef NVGRAPH_USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); -#endif -} - -// template -// void Lapack< T >::hseqr(T* Q, T* H, T* eigenvalues,T* eigenvectors, int dim, int ldh, int ldq) -//{ -// check_lapack_enabled(); -//#ifdef NVGRAPH_USE_LAPACK -// lapack_hseqr(Q, H, eigenvalues, dim, ldh, ldq); -//#endif -//} - -// Explicit instantiation -template void Lapack::check_lapack_enabled(); -template void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - float 
alpha, - const float *A, - int lda, - const float *B, - int ldb, - float beta, - float *C, - int ldc); -template void Lapack::sterf(int n, float *d, float *e); -template void Lapack::geev( - float *A, float *eigenvalues, float *eigenvectors, int dim, int lda, int ldvr); -template void Lapack::geev(float *A, - float *eigenvalues_r, - float *eigenvalues_i, - float *eigenvectors_r, - float *eigenvectors_i, - int dim, - int lda, - int ldvr); -// template void Lapack::hseqr(float* Q, float* H, float* eigenvalues, float* eigenvectors, -// int dim, int ldh, int ldq); -template void Lapack::steqr( - char compz, int n, float *d, float *e, float *z, int ldz, float *work); -template void Lapack::geqrf( - int m, int n, float *a, int lda, float *tau, float *work, int *lwork); -template void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - float *a, - int lda, - float *tau, - float *c, - int ldc, - float *work, - int *lwork); -// template void Lapack::orgqr(int m, int n, int k, float* a, int lda, const float* tau, -// float* work, int* lwork); - -template void Lapack::check_lapack_enabled(); -template void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - double alpha, - const double *A, - int lda, - const double *B, - int ldb, - double beta, - double *C, - int ldc); -template void Lapack::sterf(int n, double *d, double *e); -template void Lapack::geev( - double *A, double *eigenvalues, double *eigenvectors, int dim, int lda, int ldvr); -template void Lapack::geev(double *A, - double *eigenvalues_r, - double *eigenvalues_i, - double *eigenvectors_r, - double *eigenvectors_i, - int dim, - int lda, - int ldvr); -// template void Lapack::hseqr(double* Q, double* H, double* eigenvalues, double* -// eigenvectors, int dim, int ldh, int ldq); -template void Lapack::steqr( - char compz, int n, double *d, double *e, double *z, int ldz, double *work); -template void Lapack::geqrf( - int m, int n, double *a, int lda, double *tau, double *work, int *lwork); -template void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - double *a, - int lda, - double *tau, - double *c, - int ldc, - double *work, - int *lwork); -// template void Lapack::orgqr(int m, int n, int k, double* a, int lda, const double* tau, -// double* work, int* lwork); - -// template void Lapack >::geqrf(int m, int n, std::complex *a, int lda, -// std::complex *tau, std::complex *work, int *lwork); template void -// Lapack >::geqrf(int m, int n, std::complex *a, int lda, -// std::complex *tau, std::complex *work, int *lwork); template void -// Lapack >::unmqr(bool right_side, bool transq, int m, int n, int k, -// std::complex *a, int lda, std::complex *tau, std::complex *c, int ldc, -// std::complex *work, int *lwork); template void Lapack >::unmqr(bool -// right_side, bool transq, int m, int n, int k, std::complex *a, int lda, -// std::complex *tau, std::complex *c, int ldc, std::complex *work, int -// *lwork); - -} // end namespace nvgraph diff --git a/cpp/src/nvgraph/nvgraph_vector_kernels.cu b/cpp/src/nvgraph/nvgraph_vector_kernels.cu deleted file mode 100644 index a2d8234f9e6..00000000000 --- a/cpp/src/nvgraph/nvgraph_vector_kernels.cu +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_vector_kernels.hxx" - -#include "include/debug_macros.h" - -namespace nvgraph { - -void check_size(size_t sz) -{ - if (sz > INT_MAX) FatalError("Vector larger than INT_MAX", NVGRAPH_ERR_BAD_PARAMETERS); -} -template -void nrm1_raw_vec(ValueType_* vec, size_t n, ValueType_* res, cudaStream_t stream) -{ - thrust::device_ptr dev_ptr(vec); - *res = thrust::reduce(dev_ptr, dev_ptr + n); - cudaCheckError(); -} - -template -void fill_raw_vec(ValueType_* vec, size_t n, ValueType_ value, cudaStream_t stream) -{ - thrust::device_ptr dev_ptr(vec); - thrust::fill(dev_ptr, dev_ptr + n, value); - cudaCheckError(); -} - -template -void dump_raw_vec(ValueType_* vec, size_t n, int offset, cudaStream_t stream) -{ -#ifdef DEBUG - thrust::device_ptr dev_ptr(vec); - COUT().precision(15); - COUT() << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy( - dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(COUT(), " ")); - cudaCheckError(); - COUT() << std::endl; -#endif -} - -template -__global__ void flag_zeroes_kernel(int num_vertices, ValueType_* vec, int* flags) -{ - int tidx = blockDim.x * blockIdx.x + threadIdx.x; - for (int r = tidx; r < num_vertices; r += blockDim.x * gridDim.x) { - if (vec[r] != 0.0) - flags[r] = 1; // NOTE 2 : alpha*0 + (1-alpha)*1 = (1-alpha) - else - flags[r] = 0; - } -} -template -__global__ void dmv0_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) -{ - // y=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] = D[i] * x[i]; -} -template -__global__ void dmv1_kernel(const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y, - int n) -{ - // y+=D*x - int tidx = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = tidx; i < n; i += blockDim.x * gridDim.x) y[i] += D[i] * x[i]; -} -template -void copy_vec(ValueType_* vec1, size_t n, ValueType_* res, cudaStream_t stream) -{ - thrust::device_ptr dev_ptr(vec1); - thrust::device_ptr res_ptr(res); -#ifdef DEBUG - // COUT() << "copy "<< n << " elements" << std::endl; -#endif - thrust::copy_n(dev_ptr, n, res_ptr); - cudaCheckError(); - // dump_raw_vec (res, n, 0); -} - -template -void flag_zeros_raw_vec(size_t num_vertices, ValueType_* vec, int* flags, cudaStream_t stream) -{ - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, (n / (items_per_thread * num_threads)) + 1); - flag_zeroes_kernel<<>>(num_vertices, vec, flags); - cudaCheckError(); -} - -template -void dmv(size_t num_vertices, - ValueType_ alpha, - ValueType_* D, - ValueType_* x, - ValueType_ beta, - ValueType_* y, - cudaStream_t stream) -{ - int items_per_thread = 4; - int num_threads = 128; - int max_grid_size = 4096; - check_size(num_vertices); - int n = static_cast(num_vertices); - int num_blocks = std::min(max_grid_size, 
(n / (items_per_thread * num_threads)) + 1); - if (alpha == 1.0 && beta == 0.0) - dmv0_kernel<<>>(D, x, y, n); - else if (alpha == 1.0 && beta == 1.0) - dmv1_kernel<<>>(D, x, y, n); - else - FatalError("Not implemented case of y = D*x", NVGRAPH_ERR_BAD_PARAMETERS); - - cudaCheckError(); -} - -template -void set_connectivity(size_t n, - IndexType_ root, - ValueType_ self_loop_val, - ValueType_ unreachable_val, - ValueType_* res, - cudaStream_t stream) -{ - fill_raw_vec(res, n, unreachable_val); - cudaMemcpy(&res[root], &self_loop_val, sizeof(self_loop_val), cudaMemcpyHostToDevice); - cudaCheckError(); -} - -template void nrm1_raw_vec(float* vec, size_t n, float* res, cudaStream_t stream); -template void nrm1_raw_vec(double* vec, size_t n, double* res, cudaStream_t stream); - -template void dmv( - size_t num_vertices, float alpha, float* D, float* x, float beta, float* y, cudaStream_t stream); -template void dmv(size_t num_vertices, - double alpha, - double* D, - double* x, - double beta, - double* y, - cudaStream_t stream); - -template void set_connectivity( - size_t n, int root, float self_loop_val, float unreachable_val, float* res, cudaStream_t stream); -template void set_connectivity(size_t n, - int root, - double self_loop_val, - double unreachable_val, - double* res, - cudaStream_t stream); - -template void flag_zeros_raw_vec(size_t num_vertices, - float* vec, - int* flags, - cudaStream_t stream); -template void flag_zeros_raw_vec(size_t num_vertices, - double* vec, - int* flags, - cudaStream_t stream); - -template void fill_raw_vec(float* vec, size_t n, float value, cudaStream_t stream); -template void fill_raw_vec(double* vec, size_t n, double value, cudaStream_t stream); -template void fill_raw_vec(int* vec, size_t n, int value, cudaStream_t stream); -template void fill_raw_vec(char* vec, size_t n, char value, cudaStream_t stream); - -template void copy_vec(float* vec1, size_t n, float* res, cudaStream_t stream); -template void copy_vec(double* vec1, size_t n, double* res, cudaStream_t stream); -template void copy_vec(int* vec1, size_t n, int* res, cudaStream_t stream); -template void copy_vec(char* vec1, size_t n, char* res, cudaStream_t stream); - -template void dump_raw_vec(float* vec, size_t n, int off, cudaStream_t stream); -template void dump_raw_vec(double* vec, size_t n, int off, cudaStream_t stream); -template void dump_raw_vec(int* vec, size_t n, int off, cudaStream_t stream); -template void dump_raw_vec(char* vec, size_t n, int off, cudaStream_t stream); -} // end namespace nvgraph diff --git a/cpp/src/nvgraph/partition.cu b/cpp/src/nvgraph/partition.cu deleted file mode 100644 index e4b9f507908..00000000000 --- a/cpp/src/nvgraph/partition.cu +++ /dev/null @@ -1,424 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "include/partition.hxx" - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nvgraph { - -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -template -static __global__ void scale_obs_kernel(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ i, j, k, index, mm; - ValueType_ alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - // printf("[%d,%d,%d,%d] n=%d, li=%d, mn=%d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y, n, - // li, mn); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = utils::shfl_up(alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = utils::shfl(alpha, blockDim.x - 1, blockDim.x); - alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -IndexType_ next_pow2(IndexType_ n) -{ - IndexType_ v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(IndexType_ m, IndexType_ n, ValueType_ *obs) -{ - IndexType_ p2m; - dim3 nthreads, nblocks; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - nthreads.x = max(2, min(p2m, 32)); - nthreads.y = 256 / nthreads.x; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = (n + nthreads.y - 1) / nthreads.y; - nblocks.z = 1; - // printf("m=%d(%d),n=%d,obs=%p, - // nthreads=(%d,%d,%d),nblocks=(%d,%d,%d)\n",m,p2m,n,obs,nthreads.x,nthreads.y,nthreads.z,nblocks.x,nblocks.y,nblocks.z); - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - cudaCheckError(); - - return cudaSuccess; -} - -// ========================================================= -// Spectral partitioner -// ========================================================= - -/// Compute spectral graph partition -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. 
- * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param parts (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return NVGRAPH error flag. - */ -template -NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - vertex_t nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - weight_t tol_lanczos, - int maxIter_kmeans, - weight_t tol_kmeans, - vertex_t *__restrict__ parts, - weight_t *eigVals, - weight_t *eigVecs) -{ - cudaStream_t stream = 0; - - const weight_t zero{0.0}; - const weight_t one{1.0}; - - int iters_lanczos; - int iters_kmeans; - - edge_t i; - edge_t n = graph.number_of_vertices; - - // k-means residual - weight_t residual_kmeans; - - // ------------------------------------------------------- - // Spectral partitioner - // ------------------------------------------------------- - - // Compute eigenvectors of Laplacian - - // Initialize Laplacian - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - LaplacianMatrix L(A); - - // Compute smallest eigenvalues and eigenvectors - CHECK_NVGRAPH(computeSmallestEigenvectors(L, - nEigVecs, - maxIter_lanczos, - restartIter_lanczos, - tol_lanczos, - false, - iters_lanczos, - eigVals, - eigVecs)); - - // Whiten eigenvector matrix - for (i = 0; i < nEigVecs; ++i) { - weight_t mean, std; - - mean = thrust::reduce(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - cudaCheckError(); - mean /= n; - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - cudaCheckError(); - std = Cublas::nrm2(n, eigVecs + IDX(0, i, n), 1) / std::sqrt(static_cast(n)); - thrust::transform(thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - cudaCheckError(); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - Vector work(nEigVecs * n, stream); - Cublas::set_pointer_mode_host(); - Cublas::geam(true, - false, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t *)NULL, - nEigVecs, - work.raw(), - nEigVecs); - CHECK_CUDA(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice)); - } - - // Clean up - - // eigVecs.dump(0, nEigVecs*n); - // Find partition with k-means clustering - CHECK_NVGRAPH(kmeans(n, - nEigVecs, - nParts, - tol_kmeans, - maxIter_kmeans, - eigVecs, - parts, - residual_kmeans, - iters_kmeans)); - - return NVGRAPH_OK; -} - -// ========================================================= -// Analysis of graph partition -// ========================================================= - -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template 
-struct equal_to_i_op { - const IndexType_ i; - - public: - equal_to_i_op(IndexType_ _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (ValueType_)1.0 : (ValueType_)0.0; - } -}; -} // namespace - -/// Compute cost function for partition -/** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nParts Number of partitions. - * @param parts (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return NVGRAPH error flag. - */ -template -NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - vertex_t nParts, - const vertex_t *__restrict__ parts, - weight_t &edgeCut, - weight_t &cost) -{ - cudaStream_t stream = 0; - - edge_t i; - edge_t n = graph.number_of_vertices; - - weight_t partEdgesCut, partSize; - - // Device memory - Vector part_i(n, stream); - Vector Lx(n, stream); - - // Initialize cuBLAS - Cublas::set_pointer_mode_host(); - - // Initialize Laplacian - CsrMatrix A(false, - false, - graph.number_of_vertices, - graph.number_of_vertices, - graph.number_of_edges, - 0, - graph.edge_data, - graph.offsets, - graph.indices); - LaplacianMatrix L(A); - - // Initialize output - cost = 0; - edgeCut = 0; - - // Iterate through partitions - for (i = 0; i < nParts; ++i) { - // Construct indicator vector for ith partition - thrust::for_each( - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(parts + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(i)); - cudaCheckError(); - - // Compute size of ith partition - Cublas::dot(n, part_i.raw(), 1, part_i.raw(), 1, &partSize); - partSize = round(partSize); - if (partSize < 0.5) { - WARNING("empty partition"); - continue; - } - - // Compute number of edges cut by ith partition - L.mv(1, part_i.raw(), 0, Lx.raw()); - Cublas::dot(n, Lx.raw(), 1, part_i.raw(), 1, &partEdgesCut); - - // Record results - cost += partEdgesCut / partSize; - edgeCut += partEdgesCut / 2; - } - - // Clean up and return - return NVGRAPH_OK; -} - -// ========================================================= -// Explicit instantiation -// ========================================================= -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - float tol_lanczos, - int maxIter_kmeans, - float tol_kmeans, - int *__restrict__ parts, - float *eigVals, - float *eigVecs); - -template NVGRAPH_ERROR partition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - int nEigVecs, - int maxIter_lanczos, - int restartIter_lanczos, - double tol_lanczos, - int maxIter_kmeans, - double tol_kmeans, - int *__restrict__ parts, - double *eigVals, - double *eigVecs); - -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int *__restrict__ parts, - float &edgeCut, - float &cost); -template NVGRAPH_ERROR analyzePartition( - cugraph::experimental::GraphCSRView const &graph, - int nParts, - const int 
*__restrict__ parts, - double &edgeCut, - double &cost); - -} // namespace nvgraph diff --git a/cpp/src/nvgraph/spectral_matrix.cu b/cpp/src/nvgraph/spectral_matrix.cu deleted file mode 100644 index 66c2160741e..00000000000 --- a/cpp/src/nvgraph/spectral_matrix.cu +++ /dev/null @@ -1,765 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -//#ifdef NVGRAPH_PARTITION -//#ifdef DEBUG - -#include "include/spectral_matrix.hxx" - -#include -#include - -#include "include/debug_macros.h" -#include "include/nvgraph_cublas.hxx" -#include "include/nvgraph_cusparse.hxx" -#include "include/nvgraph_error.hxx" -#include "include/nvgraph_vector.hxx" - -// ========================================================= -// Useful macros -// ========================================================= - -// CUDA block size -#define BLOCK_SIZE 1024 - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace nvgraph { - -// ============================================= -// CUDA kernels -// ============================================= - -namespace { - -/// Apply diagonal matrix to vector -template -static __global__ void diagmv(IndexType_ n, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ *__restrict__ y) -{ - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - y[i] += alpha * D[i] * x[i]; - i += blockDim.x * gridDim.x; - } -} - -/// Apply diagonal matrix to a set of dense vectors (tall matrix) -template -static __global__ void diagmm(IndexType_ n, - IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ D, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) -{ - IndexType_ i, j, index; - - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < k; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) { - index = i + j * n; - if (beta_is_zero) { - y[index] = alpha * D[i] * x[index]; - } else { - y[index] = alpha * D[i] * x[index] + beta * y[index]; - } - } - } -} -} // namespace - -// ============================================= -// Dense matrix class -// ============================================= - -/// Constructor for dense matrix class -/** @param _trans Whether to transpose matrix. - * @param _m Number of rows. - * @param _n Number of columns. - * @param _A (Input, device memory, _m*_n entries) Matrix - * entries, stored column-major. - * @param _lda Leading dimension of _A. 
- */ -template -DenseMatrix::DenseMatrix( - bool _trans, IndexType_ _m, IndexType_ _n, const ValueType_ *_A, IndexType_ _lda) - : Matrix(_m, _n), trans(_trans), A(_A), lda(_lda) -{ - Cublas::set_pointer_mode_host(); - if (_lda < _m) FatalError("invalid dense matrix parameter (lda<m)", NVGRAPH_ERR_BAD_PARAMETERS); -} - -/// Destructor for dense matrix class -template -DenseMatrix::~DenseMatrix() -{ -} - -/// Get and Set CUDA stream -template -void DenseMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("DenseMatrix setCUDAStream stream=%p\n",this->s); - Cublas::setStream(_s); -} -template -void DenseMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUBLAS(cublasGetStream(cublasHandle, _s)); -} - -/// Matrix-vector product for dense matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void DenseMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemv(this->trans, this->m, this->n, &alpha, this->A, this->lda, x, 1, &beta, y, 1); -} - -template -void DenseMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - Cublas::gemm( - this->trans, false, this->m, k, this->n, &alpha, A, lda, x, this->m, &beta, y, this->n); -} - -/// Color and Reorder -template -void DenseMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void DenseMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void DenseMatrix::prec_setup(Matrix *_M) -{ - printf("ERROR: DenseMatrix prec_setup dispatched\n"); - // exit(1); -} - -template -void DenseMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - printf("ERROR: DenseMatrix prec_solve dispatched\n"); - // exit(1); -} - -template -ValueType_ DenseMatrix::getEdgeSum() const -{ - return 0.0; -} - -// ============================================= -// CSR matrix class -// ============================================= - -/// Constructor for CSR matrix class -/** @param _transA Whether to transpose matrix. - * @param _m Number of rows. - * @param _n Number of columns. - * @param _nnz Number of non-zero entries. - * @param _descrA Matrix properties. - * @param _csrValA (Input, device memory, _nnz entries) Matrix - * entry values. - * @param _csrRowPtrA (Input, device memory, _m+1 entries) Pointer - * to first entry in each row. - * @param _csrColIndA (Input, device memory, _nnz entries) Column - * index of each matrix entry. 
- */ -template -CsrMatrix::CsrMatrix(bool _trans, - bool _sym, - IndexType_ _m, - IndexType_ _n, - IndexType_ _nnz, - const cusparseMatDescr_t _descrA, - /*const*/ ValueType_ *_csrValA, - const IndexType_ *_csrRowPtrA, - const IndexType_ *_csrColIndA) - : Matrix(_m, _n), - trans(_trans), - sym(_sym), - nnz(_nnz), - descrA(_descrA), - csrValA(_csrValA), - csrRowPtrA(_csrRowPtrA), - csrColIndA(_csrColIndA) -{ - if (nnz < 0) FatalError("invalid CSR matrix parameter (nnz<0)", NVGRAPH_ERR_BAD_PARAMETERS); - Cusparse::set_pointer_mode_host(); -} - -/// Destructor for CSR matrix class -template -CsrMatrix::~CsrMatrix() -{ -} - -/// Get and Set CUDA stream -template -void CsrMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("CsrMatrix setCUDAStream stream=%p\n",this->s); - Cusparse::setStream(_s); -} -template -void CsrMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // CHECK_CUSPARSE(cusparseGetStream(Cusparse::get_handle(), _s)); -} -template -void CsrMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // CHECK_CUSPARSE(cusparseXcsrmm(Cusparse::get_handle(), transA, this->m, k, this->n, nnz, &alpha, - // descrA, csrValA, csrRowPtrA, csrColIndA, x, this->n, &beta, y, this->m)); - Cusparse::csrmm(this->trans, - this->sym, - this->m, - k, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - this->n, - &beta, - y, - this->m); -} - -/// Color and Reorder -template -void CsrMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void CsrMatrix::reorder(IndexType_ *p) const -{ -} - -/// Incomplete Cholesky (setup, factor and solve) -template -void CsrMatrix::prec_setup(Matrix *_M) -{ - // printf("CsrMatrix prec_setup dispatched\n"); - if (!factored) { - // analyse lower triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_l)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // analyse upper triangular factor - CHECK_CUSPARSE(cusparseCreateSolveAnalysisInfo(&info_u)); - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_analysis(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - nnz, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u)); - // perform csrilu0 (should be slightly faster than csric0) - CHECK_CUSPARSE(cusparseXcsrilu0(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l)); - // set factored flag to true - factored = true; - } -} - -template -void CsrMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - // printf("CsrMatrix prec_solve dispatched (stream %p)\n",this->s); - - // preconditioning Mx=f (where M = L*U, therefore x=U\(L\f)) - // solve lower triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - 
this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_l, - fx, - this->m, - t, - this->m)); - // solve upper triangular factor - CHECK_CUSPARSE(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER)); - CHECK_CUSPARSE(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT)); - CHECK_CUSPARSE(cusparseXcsrsm_solve(Cusparse::get_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->m, - k, - alpha, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - info_u, - t, - this->m, - fx, - this->m)); -} - -/// Matrix-vector product for CSR matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void CsrMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // TODO: consider using merge-path csrmv - Cusparse::csrmv(this->trans, - this->sym, - this->m, - this->n, - this->nnz, - &alpha, - this->csrValA, - this->csrRowPtrA, - this->csrColIndA, - x, - &beta, - y); -} - -template -ValueType_ CsrMatrix::getEdgeSum() const -{ - return 0.0; -} - -// ============================================= -// Laplacian matrix class -// ============================================= - -/// Constructor for Laplacian matrix class -/** @param A Adjacency matrix - */ -template -LaplacianMatrix::LaplacianMatrix( - /*const*/ Matrix &_A) - : Matrix(_A.m, _A.n), A(&_A) -{ - // Check that adjacency matrix is square - if (_A.m != _A.n) - FatalError("cannot construct Laplacian matrix from non-square adjacency matrix", - NVGRAPH_ERR_BAD_PARAMETERS); - // set CUDA stream - this->s = NULL; - // Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - - // Set preconditioning matrix pointer to NULL - M = NULL; -} - -/// Destructor for Laplacian matrix class -template -LaplacianMatrix::~LaplacianMatrix() -{ -} - -/// Get and Set CUDA stream -template -void LaplacianMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("LaplacianMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != NULL) { M->setCUDAStream(_s); } -} -template -void LaplacianMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. 
- */ -template -void LaplacianMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - if (beta == 0) - CHECK_CUDA(cudaMemset(y, 0, (this->n) * sizeof(ValueType_))) - else if (beta != 1) - thrust::transform(thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y + this->n), - thrust::make_constant_iterator(beta), - thrust::device_pointer_cast(y), - thrust::multiplies()); - - // Apply diagonal matrix - dim3 gridDim, blockDim; - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = 1; - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - diagmv<<s>>>(this->n, alpha, D.raw(), x, y); - cudaCheckError(); - - // Apply adjacency matrix - A->mv(-alpha, x, 1, y); -} -/// Matrix-vector product for Laplacian matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. - */ -template -void LaplacianMatrix::mm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Apply diagonal matrix - ValueType_ one = (ValueType_)1.0; - this->dm(k, alpha, x, beta, y); - - // Apply adjacency matrix - A->mm(k, -alpha, x, one, y); -} - -template -void LaplacianMatrix::dm(IndexType_ k, - ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - IndexType_ t = k * (this->n); - dim3 gridDim, blockDim; - - // setup launch parameters - gridDim.x = min(((this->n) + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535); - gridDim.y = min(k, 65535); - gridDim.z = 1; - blockDim.x = BLOCK_SIZE; - blockDim.y = 1; - blockDim.z = 1; - - // Apply diagonal matrix - if (beta == 0.0) { - // set vectors to 0 (WARNING: notice that you need to set, not scale, because of NaNs corner - // case) - CHECK_CUDA(cudaMemset(y, 0, t * sizeof(ValueType_))); - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } else { - diagmm - <<s>>>(this->n, k, alpha, D.raw(), x, beta, y); - } - cudaCheckError(); -} - -/// Color and Reorder -template -void LaplacianMatrix::color(IndexType_ *c, IndexType_ *p) const -{ -} - -template -void LaplacianMatrix::reorder(IndexType_ *p) const -{ -} - -/// Solve preconditioned system M x = f for a set of k vectors -template -void LaplacianMatrix::prec_setup(Matrix *_M) -{ - // save the pointer to preconditioner M - M = _M; - if (M != NULL) { - // setup the preconditioning matrix M - M->prec_setup(NULL); - } -} - -template -void LaplacianMatrix::prec_solve(IndexType_ k, - ValueType_ alpha, - ValueType_ *__restrict__ fx, - ValueType_ *__restrict__ t) const -{ - if (M != NULL) { - // preconditioning - M->prec_solve(k, alpha, fx, t); - } -} - -template -ValueType_ LaplacianMatrix::getEdgeSum() const -{ - return 0.0; -} -// ============================================= -// Modularity matrix class -// ============================================= - -/// Constructor for Modularity matrix class -/** @param A Adjacency matrix - */ -template -ModularityMatrix::ModularityMatrix( - /*const*/ Matrix &_A, IndexType_ _nnz) - : Matrix(_A.m, _A.n), A(&_A), nnz(_nnz) -{ - // Check that adjacency matrix is square - if (_A.m != _A.n) - FatalError("cannot construct Modularity matrix from non-square adjacency matrix", - NVGRAPH_ERR_BAD_PARAMETERS); - - // set CUDA stream - 
this->s = NULL; - // Construct degree matrix - D.allocate(_A.m, this->s); - Vector ones(this->n, this->s); - ones.fill(1.0); - _A.mv(1, ones.raw(), 0, D.raw()); - // D.dump(0,this->n); - edge_sum = D.nrm1(); - - // Set preconditioning matrix pointer to NULL - M = NULL; -} - -/// Destructor for Modularity matrix class -template -ModularityMatrix::~ModularityMatrix() -{ -} - -/// Get and Set CUDA stream -template -void ModularityMatrix::setCUDAStream(cudaStream_t _s) -{ - this->s = _s; - // printf("ModularityMatrix setCUDAStream stream=%p\n",this->s); - A->setCUDAStream(_s); - if (M != NULL) { M->setCUDAStream(_s); } -} - -template -void ModularityMatrix::getCUDAStream(cudaStream_t *_s) -{ - *_s = this->s; - // A->getCUDAStream(_s); -} - -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n entries) Vector. - * @param beta Scalar. - * @param y (Input/output, device memory, m entries) Output vector. - */ -template -void ModularityMatrix::mv(ValueType_ alpha, - const ValueType_ *__restrict__ x, - ValueType_ beta, - ValueType_ *__restrict__ y) const -{ - // Scale result vector - if (alpha != 1 || beta != 0) - FatalError("This isn't implemented for Modularity Matrix currently", - NVGRAPH_ERR_NOT_IMPLEMENTED); - - // CHECK_CUBLAS(cublasXdot(handle, this->n, const double *x, int incx, const double *y, int incy, - // double *result)); - // y = A*x - A->mv(alpha, x, 0, y); - ValueType_ dot_res; - // gamma = d'*x - Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // y = y -(gamma/edge_sum)*d - Cublas::axpy(this->n, -(dot_res / this->edge_sum), D.raw(), 1, y, 1); -} -/// Matrix-vector product for Modularity matrix class -/** y is overwritten with alpha*A*x+beta*y. - * - * @param alpha Scalar. - * @param x (Input, device memory, n*k entries) nxk dense matrix. - * @param beta Scalar. - * @param y (Input/output, device memory, m*k entries) Output mxk dense matrix. 
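
// ModularityMatrix::mv() above applies the modularity operator
// B = A - d * d^T / sum(d) matrix-free (and only for alpha == 1, beta == 0):
// first y = A*x, then gamma = d . x via Cublas::dot, then the rank-one
// correction y -= (gamma / edge_sum) * d via Cublas::axpy, where
// edge_sum = nrm1(d). The two BLAS-1 steps on the host (illustrative sketch):
template <typename ValueType>
void modularity_correction(int n, const ValueType* d, const ValueType* x,
                           ValueType edge_sum, ValueType* y)
{
  ValueType gamma = 0;                        // gamma = d' * x   (Cublas::dot)
  for (int i = 0; i < n; ++i) gamma += d[i] * x[i];
  for (int i = 0; i < n; ++i)                 // y -= (gamma / edge_sum) * d   (Cublas::axpy)
    y[i] -= (gamma / edge_sum) * d[i];
}
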
- */
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::mm(IndexType_ k,
-                                                  ValueType_ alpha,
-                                                  const ValueType_ *__restrict__ x,
-                                                  ValueType_ beta,
-                                                  ValueType_ *__restrict__ y) const
-{
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
-}
-
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::dm(IndexType_ k,
-                                                  ValueType_ alpha,
-                                                  const ValueType_ *__restrict__ x,
-                                                  ValueType_ beta,
-                                                  ValueType_ *__restrict__ y) const
-{
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
-}
-
-/// Color and Reorder
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::color(IndexType_ *c, IndexType_ *p) const
-{
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
-}
-
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::reorder(IndexType_ *p) const
-{
-  FatalError("This isn't implemented for Modularity Matrix currently", NVGRAPH_ERR_NOT_IMPLEMENTED);
-}
-
-/// Solve preconditioned system M x = f for a set of k vectors
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::prec_setup(Matrix<IndexType_, ValueType_> *_M)
-{
-  // save the pointer to preconditioner M
-  M = _M;
-  if (M != NULL) {
-    // setup the preconditioning matrix M
-    M->prec_setup(NULL);
-  }
-}
-
-template <typename IndexType_, typename ValueType_>
-void ModularityMatrix<IndexType_, ValueType_>::prec_solve(IndexType_ k,
-                                                          ValueType_ alpha,
-                                                          ValueType_ *__restrict__ fx,
-                                                          ValueType_ *__restrict__ t) const
-{
-  if (M != NULL) {
-    FatalError("This isn't implemented for Modularity Matrix currently",
-               NVGRAPH_ERR_NOT_IMPLEMENTED);
-  }
-}
-
-template <typename IndexType_, typename ValueType_>
-ValueType_ ModularityMatrix<IndexType_, ValueType_>::getEdgeSum() const
-{
-  return edge_sum;
-}
-// Explicit instantiation
-template class Matrix<int, float>;
-template class Matrix<int, double>;
-template class DenseMatrix<int, float>;
-template class DenseMatrix<int, double>;
-template class CsrMatrix<int, float>;
-template class CsrMatrix<int, double>;
-template class LaplacianMatrix<int, float>;
-template class LaplacianMatrix<int, double>;
-template class ModularityMatrix<int, float>;
-template class ModularityMatrix<int, double>;
-
-}  // namespace nvgraph
-//#endif
diff --git a/cpp/src/sort/bitonic.cuh b/cpp/src/sort/bitonic.cuh
index 38249aa3973..e2922a58d39 100644
--- a/cpp/src/sort/bitonic.cuh
+++ b/cpp/src/sort/bitonic.cuh
@@ -1,7 +1,7 @@
 // -*-c++-*-
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ #include #include -#include +#include namespace cugraph { namespace sort { diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu index 059651e80d2..63ef725c3b7 100644 --- a/cpp/src/structure/graph.cu +++ b/cpp/src/structure/graph.cu @@ -15,10 +15,11 @@ */ #include -#include "utilities/cuda_utils.cuh" -#include "utilities/error_utils.h" +#include "utilities/error.hpp" #include "utilities/graph_utils.cuh" +#include + namespace { template @@ -36,25 +37,26 @@ void degree_from_offsets(vertex_t number_of_vertices, } template -void degree_from_vertex_ids(const cugraph::experimental::Comm &comm, +void degree_from_vertex_ids(const raft::handle_t *handle, vertex_t number_of_vertices, edge_t number_of_edges, vertex_t const *indices, edge_t *degree, cudaStream_t stream) { - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(number_of_edges), - [indices, degree] __device__(edge_t e) { cugraph::atomicAdd(degree + indices[e], 1); }); - comm.allreduce(number_of_vertices, degree, degree, cugraph::experimental::ReduceOp::SUM); + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(number_of_edges), + [indices, degree] __device__(edge_t e) { atomicAdd(degree + indices[e], 1); }); + if ((handle != nullptr) && (handle->comms_initialized())) { + auto &comm = handle->get_comms(); + comm.allreduce(degree, degree, number_of_vertices, raft::comms::op_t::SUM, stream); + } } } // namespace namespace cugraph { -namespace experimental { template void GraphViewBase::get_vertex_identifiers(VT *identifiers) const @@ -82,10 +84,14 @@ void GraphCOOView::degree(ET *degree, DegreeDirection direction) con cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphViewBase::comm.get_p()) // FIXME retrieve global source - // indexing for the allreduce work - CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); - degree_from_vertex_ids(GraphViewBase::comm, + if ((GraphViewBase::handle != nullptr) && + (GraphViewBase::handle + ->comms_initialized())) // FIXME retrieve global source + // indexing for the allreduce work + { + CUGRAPH_FAIL("MG degree not implemented for OUT degree"); + } + degree_from_vertex_ids(GraphViewBase::handle, GraphViewBase::number_of_vertices, GraphViewBase::number_of_edges, src_indices, @@ -94,7 +100,7 @@ void GraphCOOView::degree(ET *degree, DegreeDirection direction) con } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphViewBase::comm, + degree_from_vertex_ids(GraphViewBase::handle, GraphViewBase::number_of_vertices, GraphViewBase::number_of_edges, dst_indices, @@ -115,15 +121,17 @@ void GraphCompressedSparseBaseView::degree(ET *degree, DegreeDirecti cudaStream_t stream{nullptr}; if (direction != DegreeDirection::IN) { - if (GraphViewBase::comm.get_p()) - CUGRAPH_FAIL("OPG degree not implemented for OUT degree"); // FIXME retrieve global - // source indexing for - // the allreduce to work + if ((GraphViewBase::handle != nullptr) && + (GraphViewBase::handle->comms_initialized())) { + CUGRAPH_FAIL("MG degree not implemented for OUT degree"); // FIXME retrieve global + // source indexing for + // the allreduce to work + } degree_from_offsets(GraphViewBase::number_of_vertices, offsets, degree, stream); } if (direction != DegreeDirection::OUT) { - degree_from_vertex_ids(GraphViewBase::comm, + degree_from_vertex_ids(GraphViewBase::handle, GraphViewBase::number_of_vertices, 
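
// The rewritten degree_from_vertex_ids above follows a count-then-combine
// pattern: each rank accumulates degrees for its own edges with device
// atomics, then a SUM allreduce folds the per-rank partial counts into the
// global degree array on every rank. Condensed sketch of the same flow
// (assumes extended device lambdas and an initialized handle):
void mg_degree_sketch(raft::handle_t const& handle, int32_t n_vertices,
                      int32_t n_edges, int32_t const* indices, int32_t* degree)
{
  cudaStream_t stream = handle.get_stream();
  thrust::for_each(rmm::exec_policy(stream)->on(stream),
                   thrust::make_counting_iterator<int32_t>(0),
                   thrust::make_counting_iterator<int32_t>(n_edges),
                   [indices, degree] __device__(int32_t e) {
                     atomicAdd(degree + indices[e], 1);  // rank-local partials
                   });
  if (handle.comms_initialized()) {
    handle.get_comms().allreduce(degree, degree, n_vertices,
                                 raft::comms::op_t::SUM, stream);  // global sum
  }
}
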
GraphViewBase::number_of_edges, indices, @@ -139,5 +147,4 @@ template class GraphCOOView; template class GraphCOOView; template class GraphCompressedSparseBaseView; template class GraphCompressedSparseBaseView; -} // namespace experimental } // namespace cugraph diff --git a/cpp/src/topology/topology.cuh b/cpp/src/topology/topology.cuh index 15fbf588c23..82b0e72c705 100644 --- a/cpp/src/topology/topology.cuh +++ b/cpp/src/topology/topology.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index cbe741424ea..dfb7a32499d 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -16,8 +16,10 @@ #include "graph.hpp" -#include +#include #include "bfs_kernels.cuh" +#include "mg/bfs.cuh" +#include "mg/common_utils.cuh" #include "traversal_common.cuh" #include "utilities/graph_utils.cuh" @@ -265,7 +267,6 @@ void BFS::traverse(IndexType source_vertex) bool can_use_bottom_up = (!sp_counters && !directed && distances); while (nf > 0) { - // Each vertices can appear only once in the frontierer array - we know it will fit new_frontier = frontier + nf; IndexType old_nf = nf; resetDevicePointers(); @@ -356,7 +357,7 @@ void BFS::traverse(IndexType source_vertex) mu -= mf; cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); // We need nf cudaStreamSynchronize(stream); @@ -413,7 +414,7 @@ void BFS::traverse(IndexType source_vertex) sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - CUDA_CHECK_LAST() + CHECK_CUDA(stream); // We need last_left_unvisited_size cudaStreamSynchronize(stream); bfs_kernels::bottom_up_large(left_unvisited_queue, @@ -431,7 +432,7 @@ void BFS::traverse(IndexType source_vertex) deterministic); } cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - CUDA_CHECK_LAST() + CHECK_CUDA(stream); // We will need nf cudaStreamSynchronize(stream); @@ -461,50 +462,111 @@ void BFS::clean() // the vectors have a destructor that takes care of cleaning } +// Explicit Instantiation +template class BFS; template class BFS; +template class BFS; + } // namespace detail // NOTE: SP counter increase extremely fast on large graph // It can easily reach 1e40~1e70 on GAP-road.mtx template -void bfs(experimental::GraphCSRView const &graph, +void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, VT *distances, VT *predecessors, double *sp_counters, const VT start_vertex, - bool directed) + bool directed, + bool mg_batch) { - CUGRAPH_EXPECTS(typeid(VT) == typeid(int), "Unsupported vertex id data type, please use int"); - CUGRAPH_EXPECTS(typeid(ET) == typeid(int), "Unsupported edge id data type, please use int"); - CUGRAPH_EXPECTS((typeid(WT) == typeid(float)) || (typeid(WT) == typeid(double)), - "Unsupported weight data type, please use float or double"); - - VT number_of_vertices = graph.number_of_vertices; - ET number_of_edges = graph.number_of_edges; - - const VT *indices_ptr = graph.indices; - const ET *offsets_ptr = graph.offsets; - - int alpha = 15; - int beta = 18; - // FIXME: Use VT and ET in the BFS detail - cugraph::detail::BFS bfs( - number_of_vertices, number_of_edges, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances, predecessors, sp_counters, nullptr); - 
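
// alpha and beta in the block above are the switching heuristics of
// direction-optimizing BFS (Beamer et al.): run top-down while the frontier is
// cheap to expand, switch to bottom-up once the frontier's outgoing edge count
// mf exceeds the unexplored edge count mu / alpha, and return to top-down when
// the frontier shrinks below n / beta vertices. In outline (a sketch, not the
// exact control flow of BFS::traverse):
bool use_bottom_up(long long mf, long long mu, long long nf, long long n,
                   bool frontier_growing, int alpha = 15, int beta = 18)
{
  if (frontier_growing) return mf > mu / alpha;  // edge-count test while growing
  return nf > n / beta;                          // stay bottom-up while the frontier is large
}
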
bfs.traverse(start_vertex); + static_assert(std::is_integral::value && sizeof(VT) >= sizeof(int32_t), + "Unsupported vertex id data type. Use integral types of size >= sizeof(int32_t)"); + static_assert(std::is_same::value, + "VT and ET should be the same time for the current BFS implementation"); + static_assert(std::is_floating_point::value, + "Unsupported edge weight type. Use floating point types"); // actually, this is + // unnecessary for BFS + if (handle.comms_initialized() && !mg_batch) { + CUGRAPH_EXPECTS(sp_counters == nullptr, + "BFS Traversal shortest path is not supported in MG path"); + mg::bfs(handle, graph, distances, predecessors, start_vertex); + } else { + VT number_of_vertices = graph.number_of_vertices; + ET number_of_edges = graph.number_of_edges; + + const VT *indices_ptr = graph.indices; + const ET *offsets_ptr = graph.offsets; + + int alpha = 15; + int beta = 18; + // FIXME: Use VT and ET in the BFS detail + cugraph::detail::BFS bfs( + number_of_vertices, number_of_edges, offsets_ptr, indices_ptr, directed, alpha, beta); + bfs.configure(distances, predecessors, sp_counters, nullptr); + bfs.traverse(start_vertex); + } } -template void bfs(experimental::GraphCSRView const &graph, - int *distances, - int *predecessors, - double *sp_counters, - const int source_vertex, - bool directed); -template void bfs(experimental::GraphCSRView const &graph, - int *distances, - int *predecessors, - double *sp_counters, - const int source_vertex, - bool directed); +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + uint32_t *distances, + uint32_t *predecessors, + double *sp_counters, + const uint32_t source_vertex, + bool directed, + bool mg_batch); + +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + uint32_t *distances, + uint32_t *predecessors, + double *sp_counters, + const uint32_t source_vertex, + bool directed, + bool mg_batch); + +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + int32_t *distances, + int32_t *predecessors, + double *sp_counters, + const int32_t source_vertex, + bool directed, + bool mg_batch); + +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + int32_t *distances, + int32_t *predecessors, + double *sp_counters, + const int32_t source_vertex, + bool directed, + bool mg_batch); + +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + int64_t *distances, + int64_t *predecessors, + double *sp_counters, + const int64_t source_vertex, + bool directed, + bool mg_batch); + +// Explicit Instantiation +template void bfs(raft::handle_t const &handle, + GraphCSRView const &graph, + int64_t *distances, + int64_t *predecessors, + double *sp_counters, + const int64_t source_vertex, + bool directed, + bool mg_batch); } // namespace cugraph diff --git a/cpp/src/traversal/bfs_kernels.cuh b/cpp/src/traversal/bfs_kernels.cuh index ceac8e5a1fa..bf2ec2fc6ee 100644 --- a/cpp/src/traversal/bfs_kernels.cuh +++ b/cpp/src/traversal/bfs_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
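
// A single-GPU caller of the new bfs() entry point above passes a
// default-constructed raft::handle_t and leaves mg_batch false; the MG branch
// is taken only when the handle's comms are initialized. Hypothetical usage
// sketch (make_graph stands in for whatever builds the CSR view; it is not a
// real helper in this codebase):
raft::handle_t handle;
cugraph::GraphCSRView<int32_t, int32_t, float> graph = make_graph();
rmm::device_vector<int32_t> distances(graph.number_of_vertices);
rmm::device_vector<int32_t> predecessors(graph.number_of_vertices);
cugraph::bfs<int32_t, int32_t, float>(handle, graph,
    distances.data().get(), predecessors.data().get(),
    /*sp_counters=*/nullptr, /*start_vertex=*/0,
    /*directed=*/true, /*mg_batch=*/false);
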
@@ -15,8 +15,10 @@ */ #include -#include +#include #include + +#include "graph.hpp" #include "traversal_common.cuh" namespace cugraph { @@ -92,7 +94,7 @@ __global__ void fill_unvisited_queue_kernel(int *visited_bmap, // saving the common offset if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + unvisited_common_block_offset = traversal::atomicAdd(unvisited_cnt, total); } // syncthreads for two reasons : @@ -161,11 +163,12 @@ void fill_unvisited_queue(int *visited_bmap, dim3 grid, block; block.x = FILL_UNVISITED_QUEUE_DIMX; - grid.x = min((IndexType)MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + grid.x = std::min(static_cast(MAXBLOCKS), + (static_cast(visited_bmap_nints) + block.x - 1) / block.x); fill_unvisited_queue_kernel<<>>( visited_bmap, visited_bmap_nints, n, unvisited, unvisited_cnt); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } // @@ -206,7 +209,7 @@ __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisi BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); // block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) atomicAdd(mu, block_unvisited_edges_count); + if (threadIdx.x == 0) traversal::atomicAdd(mu, block_unvisited_edges_count); } // Wrapper @@ -220,11 +223,12 @@ void count_unvisited_edges(const IndexType *potentially_unvisited, { dim3 grid, block; block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = min((IndexType)MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + grid.x = std::min(static_cast(MAXBLOCKS), + (static_cast(potentially_unvisited_size) + block.x - 1) / block.x); count_unvisited_edges_kernel<<>>( potentially_unvisited, potentially_unvisited_size, visited_bmap, node_degree, mu); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } // @@ -285,6 +289,11 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, const int warpid = threadIdx.x / WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE; + // When this kernel is converted to support different VT and ET, this + // will likely split into invalid_vid and invalid_eid + // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) + constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + // we will call __syncthreads inside the loop // we need to keep complete block active for (IndexType block_off = blockIdx.x * blockDim.x; block_off < unvisited_size; @@ -299,8 +308,9 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, // by different in in visited_bmap) IndexType visited_bmap_index[1]; // this is an array of size 1 because CUB // needs one - visited_bmap_index[0] = -1; - IndexType unvisited_vertex = -1; + + visited_bmap_index[0] = invalid_idx; + IndexType unvisited_vertex = invalid_idx; // local_visited_bmap gives info on the visited bit of unvisited_vertex // @@ -329,7 +339,9 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, IndexType degree = edge_end - edge_begin; - for (IndexType edge = edge_begin; edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); + for (IndexType edge = edge_begin; + edge < min(static_cast(edge_end), + static_cast(edge_begin) + MAIN_BOTTOMUP_MAX_EDGES); ++edge) { if (edge_mask && !edge_mask[edge]) continue; @@ -353,7 +365,7 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, // If we haven't found a parent and there's more edge to check if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) { - 
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType)1); + left_unvisited_off = traversal::atomicAdd(left_unvisited_cnt, static_cast(1)); more_to_visit = 1; } } @@ -393,7 +405,7 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, // broadcasting local_visited_bmap_warp_head __syncthreads(); - int head_ballot = cugraph::detail::utils::ballot(is_head); + int head_ballot = __ballot_sync(raft::warp_full_mask(), is_head); // As long as idx < unvisited_size, we know there's at least one head per // warp @@ -438,9 +450,8 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, // the destination thread of the __shfl is active int laneid_max = - min((IndexType)(WARP_SIZE - 1), (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = - cugraph::detail::utils::shfl(unvisited_vertex, laneid_max, WARP_SIZE, __activemask()); + min(static_cast(WARP_SIZE - 1), (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = __shfl_sync(__activemask(), unvisited_vertex, laneid_max, WARP_SIZE); if (is_last_head_in_warp) { int ilast_v = last_v % INT_SIZE + 1; @@ -462,7 +473,7 @@ __global__ void main_bottomup_kernel(const IndexType *unvisited, BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); IndexType inclusive_sum = thread_frontier_offset + found; if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) { - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); } // 1) Broadcasting frontier_common_block_offset @@ -495,7 +506,8 @@ void bottom_up_main(IndexType *unvisited, dim3 grid, block; block.x = MAIN_BOTTOMUP_DIMX; - grid.x = min((IndexType)MAXBLOCKS, ((unvisited_size + block.x)) / block.x); + grid.x = std::min(static_cast(MAXBLOCKS), + (static_cast(unvisited_size) + block.x) / block.x); main_bottomup_kernel<<>>(unvisited, unvisited_size, @@ -510,7 +522,7 @@ void bottom_up_main(IndexType *unvisited, distances, predecessors, edge_mask); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } // @@ -535,6 +547,11 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + // When this kernel is converted to support different VT and ET, this + // will likely split into invalid_vid and invalid_eid + // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) + constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + // Inactive threads are not a pb for __ballot (known behaviour) for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; idx < left_unvisited_size; @@ -555,7 +572,7 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, // is know with inactive threads for (IndexType i_edge = first_i_edge + logical_lane_id; i_edge < end_i_edge; i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - IndexType valid_parent = -1; + IndexType valid_parent = invalid_idx; if (!edge_mask || edge_mask[i_edge]) { IndexType u = col_ind[i_edge]; @@ -564,7 +581,8 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, if (lvl_u == (lvl - 1)) { valid_parent = u; } } - unsigned int warp_valid_p_ballot = cugraph::detail::utils::ballot((valid_parent != -1)); + unsigned int warp_valid_p_ballot = + __ballot_sync(raft::warp_full_mask(), valid_parent != invalid_idx); int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / 
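
// The edits above replace the pre-Volta warp helpers with the explicit-mask
// CUDA 9+ intrinsics (__ballot_sync / __shfl_sync); raft::warp_full_mask() is
// just the all-lanes mask 0xffffffff. The two idioms in isolation (sketch;
// assumes one 32-thread warp per block):
__global__ void warp_sync_demo(const int* flags, int* first_voter)
{
  int gid   = blockIdx.x * blockDim.x + threadIdx.x;
  bool pred = flags[gid] != 0;
  // bit i of ballot is set iff lane i's predicate was true
  unsigned ballot = __ballot_sync(0xffffffffu, pred);
  // lowest voting lane (-1 if nobody voted), broadcast from lane 0
  int first = __shfl_sync(0xffffffffu, __ffs(ballot) - 1, 0);
  if (threadIdx.x == 0) first_voter[blockIdx.x] = first;
}
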
BOTTOM_UP_LOGICAL_WARP_SIZE; unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; @@ -576,7 +594,7 @@ __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, if (chosen_thread == logical_lane_id) { // Using only one valid parent (reduce bw) - IndexType off = atomicAdd(new_frontier_cnt, (IndexType)1); + IndexType off = traversal::atomicAdd(new_frontier_cnt, static_cast(1)); int m = 1 << (v % INT_SIZE); atomicOr(&visited[v / INT_SIZE], m); distances[v] = lvl; @@ -608,8 +626,10 @@ void bottom_up_large(IndexType *left_unvisited, { dim3 grid, block; block.x = LARGE_BOTTOMUP_DIMX; - grid.x = min((IndexType)MAXBLOCKS, - ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + grid.x = std::min( + static_cast(MAXBLOCKS), + ((static_cast(left_unvisited_size) + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / + block.x); bottom_up_large_degree_kernel<<>>(left_unvisited, left_unvisited_size, @@ -622,7 +642,7 @@ void bottom_up_large(IndexType *left_unvisited, distances, predecessors, edge_mask); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } // @@ -704,18 +724,27 @@ __global__ void topdown_expand_kernel( __shared__ IndexType block_n_frontier_candidates; IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + + // When this kernel is converted to support different VT and ET, this + // will likely split into invalid_vid and invalid_eid + // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) + constexpr IndexType invalid_idx = cugraph::invalid_idx::value; + IndexType n_items_per_thread_left = - (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) / TOP_DOWN_EXPAND_DIMX; + (totaldegree > block_offset) + ? (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) / TOP_DOWN_EXPAND_DIMX + : 0; n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); for (; (n_items_per_thread_left > 0) && (block_offset < totaldegree); block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + n_items_per_thread_left -= min( + n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD))) { // In this loop, we will process batch_set_size batches IndexType nitems_per_thread = - min(n_items_per_thread_left, (IndexType)MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + min(n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD)); // Loading buckets offset (see compute_bucket_offsets_kernel) @@ -803,8 +832,9 @@ __global__ void topdown_expand_kernel( // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction // parallism) Reduces latency - IndexType current_max_edge_index = - min(block_offset + (left + nitems_per_thread_for_this_load) * blockDim.x, totaldegree); + IndexType current_max_edge_index = min( + static_cast(block_offset) + (left + nitems_per_thread_for_this_load) * blockDim.x, + static_cast(totaldegree)); // We will need vec_u (source of the edge) until the end if we need to // save the predecessors For others informations, we will reuse pointers @@ -834,8 +864,8 @@ __global__ void topdown_expand_kernel( vec_u[iv] = frontier[k]; // origin of this edge vec_frontier_degrees_exclusive_sum_index[iv] = frontier_degrees_exclusive_sum[k]; } else { - vec_u[iv] = -1; - vec_frontier_degrees_exclusive_sum_index[iv] = -1; + vec_u[iv] = invalid_idx; + vec_frontier_degrees_exclusive_sum_index[iv] = invalid_idx; } } @@ -844,7 +874,7 @@ __global__ void topdown_expand_kernel( for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { 
IndexType u = vec_u[iv]; // row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != -1) ? row_ptr[u] : -1; + vec_row_ptr_u[iv] = (u != invalid_idx) ? row_ptr[u] : invalid_idx; } // We won't need row_ptr after that, reusing pointer @@ -856,12 +886,18 @@ __global__ void topdown_expand_kernel( IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; IndexType row_ptr_u = vec_row_ptr_u[iv]; - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) row_ptr_u = -1; // disabling edge - - // Destination of this edge - vec_dest_v[iv] = (row_ptr_u != -1) ? col_ind[edge] : -1; + // Need this check so that we don't use invalid values of edge to index + if (row_ptr_u != invalid_idx) { + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + + if (edge_mask && !edge_mask[edge]) { + // Disabling edge + row_ptr_u = invalid_idx; + } else { + // Destination of this edge + vec_dest_v[iv] = col_ind[edge]; + } + } } // We don't need vec_frontier_degrees_exclusive_sum_index anymore @@ -874,7 +910,7 @@ __global__ void topdown_expand_kernel( for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { IndexType v = vec_dest_v[iv]; vec_v_visited_bmap[iv] = - (v != -1) ? previous_bmap[v / INT_SIZE] : (~0); // will look visited + (v != invalid_idx) ? previous_bmap[v / INT_SIZE] : (~int(0)); // will look visited } // From now on we will consider v as a frontier candidate @@ -889,7 +925,7 @@ __global__ void topdown_expand_kernel( int is_visited = vec_v_visited_bmap[iv] & m; - if (is_visited) vec_frontier_candidate[iv] = -1; + if (is_visited) vec_frontier_candidate[iv] = invalid_idx; } // Each source should update the destination shortest path counter @@ -898,7 +934,7 @@ __global__ void topdown_expand_kernel( #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { IndexType dst = vec_frontier_candidate[iv]; - if (dst != -1) { + if (dst != invalid_idx) { IndexType src = vec_u[iv]; atomicAdd(&sp_counters[dst], sp_counters[src]); } @@ -912,7 +948,7 @@ __global__ void topdown_expand_kernel( #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != -1) ? isolated_bmap[v / INT_SIZE] : -1; + vec_is_isolated_bmap[iv] = (v != invalid_idx) ? isolated_bmap[v / INT_SIZE] : ~int(0); } #pragma unroll @@ -928,7 +964,7 @@ __global__ void topdown_expand_kernel( // visited, and save distance and predecessor here. 
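
// The -1 sentinels are replaced throughout by cugraph::invalid_idx<T>::value,
// a value with every bit set, so the comparisons also work for unsigned vertex
// types (static_cast<T>(-1) == ~T(0)). A standalone equivalent (sketch; the
// real trait lives in the cugraph headers):
#include <cstdint>
#include <type_traits>
template <typename T>
struct invalid_idx_sketch {
  static_assert(std::is_integral<T>::value, "index type must be integral");
  static constexpr T value = static_cast<T>(-1);  // all bits set
};
static_assert(invalid_idx_sketch<uint32_t>::value == 0xffffffffu, "unsigned: max value");
static_assert(invalid_idx_sketch<int32_t>::value == int32_t{-1}, "signed: -1");
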
Not need to // check return value of atomicOr - if (is_isolated && v != -1) { + if (is_isolated && v != invalid_idx) { int m = 1 << (v % INT_SIZE); atomicOr(&bmap[v / INT_SIZE], m); if (distances) distances[v] = lvl; @@ -936,7 +972,7 @@ __global__ void topdown_expand_kernel( if (predecessors) predecessors[v] = vec_u[iv]; // This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = -1; + vec_frontier_candidate[iv] = invalid_idx; } } } @@ -947,7 +983,7 @@ __global__ void topdown_expand_kernel( #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { IndexType v = vec_frontier_candidate[iv]; - if (v != -1) ++thread_n_frontier_candidates; + if (v != invalid_idx) ++thread_n_frontier_candidates; } // We need to have all nfrontier_candidates to be ready before doing the @@ -965,7 +1001,7 @@ __global__ void topdown_expand_kernel( // May have bank conflicts IndexType frontier_candidate = vec_frontier_candidate[iv]; - if (frontier_candidate != -1) { + if (frontier_candidate != invalid_idx) { shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = frontier_candidate; shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = vec_u[iv]; @@ -990,7 +1026,7 @@ __global__ void topdown_expand_kernel( #pragma unroll for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = -1; + vec_frontier_accepted_vertex[iv] = invalid_idx; if (idx_shared < block_n_frontier_candidates) { IndexType v = shared_local_new_frontier_candidates[idx_shared]; // popping @@ -1024,7 +1060,7 @@ __global__ void topdown_expand_kernel( // for this thread, thread_new_frontier_offset + has_successor // (exclusive sum) if (inclusive_sum) - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); } // Broadcasting frontier_common_block_offset @@ -1036,7 +1072,7 @@ __global__ void topdown_expand_kernel( if (idx_shared < block_n_frontier_candidates) { IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - if (new_frontier_vertex != -1) { + if (new_frontier_vertex != invalid_idx) { IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; new_frontier[off] = new_frontier_vertex; } @@ -1084,12 +1120,14 @@ void frontier_expand(const IndexType *row_ptr, dim3 block; block.x = TOP_DOWN_EXPAND_DIMX; - IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) / (MAXBLOCKS * block.x); + IndexType max_items_per_thread = + (static_cast(totaldegree) + MAXBLOCKS * block.x - 1) / (MAXBLOCKS * block.x); dim3 grid; - grid.x = - min((totaldegree + max_items_per_thread * block.x - 1) / (max_items_per_thread * block.x), - (IndexType)MAXBLOCKS); + grid.x = std::min((static_cast(totaldegree) + max_items_per_thread * block.x - 1) / + (max_items_per_thread * block.x), + static_cast(MAXBLOCKS)); + // Shortest Path counting (Betweenness Centrality) // We need to keep track of the previously visited bmap @@ -1117,123 +1155,7 @@ void frontier_expand(const IndexType *row_ptr, edge_mask, isolated_bmap, directed); - CUDA_CHECK_LAST(); -} - -template -__global__ void flag_isolated_vertices_kernel(IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated) -{ - typedef cub::BlockLoad - BlockLoad; - typedef cub::BlockStore - BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ 
typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage - warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - IndexType thread_off = block_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load(row_ptr + block_off, thread_row_ptr, block_valid_items, -1); - - // To compute 4 degrees, we need 5 values of row_ptr - // Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; } - - // If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n - thread_off); - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - // We need local_nisolated and local_isolated_bmap to be ready for next - // steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { atomicAdd(nisolated, total_nisolated); } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - // Building int for bmap - int int_aggregate_isolated_bmap = WarpReduce(warp_reduce_temp_storage[logicalwarpid]) - .Reduce(local_isolated_bmap, traversal::BitwiseOr()); - - int is_head_of_visited_int = ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); - } -} - -template -void flag_isolated_vertices(IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min((IndexType)MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); 
- - flag_isolated_vertices_kernel<<>>( - n, isolated_bmap, row_ptr, degrees, nisolated); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } } // namespace bfs_kernels diff --git a/cpp/src/traversal/mg/bfs.cuh b/cpp/src/traversal/mg/bfs.cuh new file mode 100644 index 00000000000..b053a6ff75a --- /dev/null +++ b/cpp/src/traversal/mg/bfs.cuh @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "../traversal_common.cuh" +#include "common_utils.cuh" +#include "frontier_expand.cuh" + +namespace cugraph { + +namespace mg { + +namespace detail { + +template +void bfs_traverse(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + const vertex_t start_vertex, + rmm::device_vector &visited_bmap, + rmm::device_vector &output_frontier_bmap, + operator_t &bfs_op) +{ + // Frontiers required for BFS + rmm::device_vector input_frontier(graph.number_of_vertices); + rmm::device_vector output_frontier(graph.number_of_vertices); + + // Bitmaps required for BFS + size_t word_count = detail::number_of_words(graph.number_of_vertices); + rmm::device_vector isolated_bmap(word_count, 0); + rmm::device_vector unique_bmap(word_count, 0); + rmm::device_vector temp_buffer_len(handle.get_comms().get_size()); + + // Reusing buffers to create isolated bitmap + { + rmm::device_vector &local_isolated_ids = input_frontier; + rmm::device_vector &global_isolated_ids = output_frontier; + detail::create_isolated_bitmap( + handle, graph, local_isolated_ids, global_isolated_ids, temp_buffer_len, isolated_bmap); + } + + if (is_vertex_isolated(isolated_bmap, start_vertex)) { return; } + + // Frontier Expand for calls to bfs functors + detail::FrontierExpand fexp(handle, graph); + + cudaStream_t stream = handle.get_stream(); + + // Initialize input frontier + input_frontier[0] = start_vertex; + vertex_t input_frontier_len = 1; + + do { + // Mark all input frontier vertices as visited + detail::add_to_bitmap(handle, visited_bmap, input_frontier, input_frontier_len); + + bfs_op.increment_level(); + + // Remove duplicates,isolated and out of partition vertices + // from input_frontier and store it to output_frontier + input_frontier_len = detail::preprocess_input_frontier(handle, + graph, + unique_bmap, + isolated_bmap, + input_frontier, + input_frontier_len, + output_frontier); + // Swap input and output frontier + input_frontier.swap(output_frontier); + + // Clear output frontier bitmap + thrust::fill(rmm::exec_policy(stream)->on(stream), + output_frontier_bmap.begin(), + output_frontier_bmap.end(), + static_cast(0)); + + // Generate output frontier bitmap from input frontier + vertex_t output_frontier_len = + fexp(bfs_op, input_frontier, input_frontier_len, output_frontier); + + // Collect output_frontier from all ranks to input_frontier + // If not empty then we proceed to next iteration. 
+ // Note that its an error to remove duplicates and non local + // start vertices here since it is possible that doing so will + // result in input_frontier_len to be 0. That would cause some + // ranks to go ahead with the iteration and some to terminate. + // This would further cause a nccl communication error since + // not every rank participates in broadcast/allgather in + // subsequent calls + input_frontier_len = detail::collect_vectors( + handle, temp_buffer_len, output_frontier, output_frontier_len, input_frontier); + + } while (input_frontier_len != 0); +} + +} // namespace detail + +template +void bfs(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + vertex_t *distances, + vertex_t *predecessors, + const vertex_t start_vertex) +{ + CUGRAPH_EXPECTS(handle.comms_initialized(), + "cugraph::mg::bfs() expected to work only in multi gpu case."); + + // Distances and predecessors are of the size global_number_of_vertices + vertex_t global_number_of_vertices = detail::get_global_vertex_count(handle, graph); + + size_t word_count = detail::number_of_words(global_number_of_vertices); + rmm::device_vector visited_bmap(word_count, 0); + rmm::device_vector output_frontier_bmap(word_count, 0); + + cudaStream_t stream = handle.get_stream(); + + // Set all predecessors to be invalid vertex ids + thrust::fill(rmm::exec_policy(stream)->on(stream), + predecessors, + predecessors + global_number_of_vertices, + cugraph::invalid_idx::value); + + if (distances == nullptr) { + detail::BFSStepNoDist bfs_op( + output_frontier_bmap.data().get(), visited_bmap.data().get(), predecessors); + + detail::bfs_traverse(handle, graph, start_vertex, visited_bmap, output_frontier_bmap, bfs_op); + + } else { + // Update distances to max distances everywhere except start_vertex + // where it is set to 0 + detail::fill_max_dist(handle, graph, start_vertex, global_number_of_vertices, distances); + + detail::BFSStep bfs_op( + output_frontier_bmap.data().get(), visited_bmap.data().get(), predecessors, distances); + + detail::bfs_traverse(handle, graph, start_vertex, visited_bmap, output_frontier_bmap, bfs_op); + + // In place reduce to collect distances + if (handle.comms_initialized()) { + handle.get_comms().allreduce( + distances, distances, global_number_of_vertices, raft::comms::op_t::MIN, stream); + } + } + + // In place reduce to collect predecessors + if (handle.comms_initialized()) { + auto op = raft::comms::op_t::MIN; + if (std::is_signed::value) { op = raft::comms::op_t::MAX; } + handle.get_comms().allreduce(predecessors, predecessors, global_number_of_vertices, op, stream); + } +} + +} // namespace mg + +} // namespace cugraph diff --git a/cpp/src/traversal/mg/common_utils.cuh b/cpp/src/traversal/mg/common_utils.cuh new file mode 100644 index 00000000000..6199730c28f --- /dev/null +++ b/cpp/src/traversal/mg/common_utils.cuh @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
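
// mg::bfs above merges rank-local results element-wise: distances with a MIN
// allreduce (unvisited entries hold the maximum distance), and predecessors
// with MIN when invalid_idx (~0) is the largest representable value (unsigned
// vertex_t) but MAX when it is -1 and therefore the smallest (signed
// vertex_t). The selection rule in isolation (sketch):
#include <type_traits>
template <typename vertex_t>
raft::comms::op_t predecessor_reduce_op()
{
  return std::is_signed<vertex_t>::value ? raft::comms::op_t::MAX
                                         : raft::comms::op_t::MIN;
}
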
+ */ + +#pragma once + +#include +#include +#include +#include "../traversal_common.cuh" + +namespace cugraph { + +namespace mg { + +namespace detail { + +template +constexpr int BitsPWrd = sizeof(degree_t) * 8; + +template +constexpr int NumberBins = sizeof(degree_t) * 8 + 1; + +template +constexpr inline return_t number_of_words(return_t number_of_bits) +{ + return raft::div_rounding_up_safe(number_of_bits, static_cast(BitsPWrd)); +} + +template +struct isDegreeZero { + edge_t const *offset_; + isDegreeZero(edge_t const *offset) : offset_(offset) {} + + __device__ bool operator()(const edge_t &id) const { return (offset_[id + 1] == offset_[id]); } +}; + +struct set_nth_bit { + uint32_t *bmap_; + set_nth_bit(uint32_t *bmap) : bmap_(bmap) {} + + template + __device__ void operator()(const return_t &id) + { + atomicOr(bmap_ + (id / BitsPWrd), (uint32_t{1} << (id % BitsPWrd))); + } +}; + +template +bool is_vertex_isolated(rmm::device_vector &bmap, vertex_t id) +{ + uint32_t word = bmap[id / BitsPWrd]; + uint32_t active_bit = static_cast(1) << (id % BitsPWrd); + // If idth bit of bmap is set to 1 then return true + return ((active_bit & word) != 0); +} + +template +struct BFSStepNoDist { + uint32_t *output_frontier_; + uint32_t *visited_; + vertex_t *predecessors_; + + BFSStepNoDist(uint32_t *output_frontier, uint32_t *visited, vertex_t *predecessors) + : output_frontier_(output_frontier), visited_(visited), predecessors_(predecessors) + { + } + + __device__ bool operator()(vertex_t src, vertex_t dst) + { + uint32_t active_bit = static_cast(1) << (dst % BitsPWrd); + uint32_t prev_word = atomicOr(output_frontier_ + (dst / BitsPWrd), active_bit); + bool dst_not_visited_earlier = !(active_bit & visited_[dst / BitsPWrd]); + bool dst_not_visited_current = !(prev_word & active_bit); + // If this thread activates the frontier bitmap for a destination + // then the source is the predecessor of that destination + if (dst_not_visited_earlier && dst_not_visited_current) { + predecessors_[dst] = src; + return true; + } else { + return false; + } + } + + // No-op + void increment_level(void) {} +}; + +template +struct BFSStep { + uint32_t *output_frontier_; + uint32_t *visited_; + vertex_t *predecessors_; + vertex_t *distances_; + vertex_t level_; + + BFSStep(uint32_t *output_frontier, uint32_t *visited, vertex_t *predecessors, vertex_t *distances) + : output_frontier_(output_frontier), + visited_(visited), + predecessors_(predecessors), + distances_(distances), + level_(0) + { + } + + __device__ bool operator()(vertex_t src, vertex_t dst) + { + uint32_t active_bit = static_cast(1) << (dst % BitsPWrd); + uint32_t prev_word = atomicOr(output_frontier_ + (dst / BitsPWrd), active_bit); + bool dst_not_visited_earlier = !(active_bit & visited_[dst / BitsPWrd]); + bool dst_not_visited_current = !(prev_word & active_bit); + // If this thread activates the frontier bitmap for a destination + // then the source is the predecessor of that destination + if (dst_not_visited_earlier && dst_not_visited_current) { + distances_[dst] = level_; + predecessors_[dst] = src; + return true; + } else { + return false; + } + } + + void increment_level(void) { ++level_; } +}; + +template +vertex_t populate_isolated_vertices(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + rmm::device_vector &isolated_vertex_ids) +{ + bool is_mg = (handle.comms_initialized() && (graph.local_vertices != nullptr) && + (graph.local_offsets != nullptr)); + cudaStream_t stream = handle.get_stream(); + + edge_t vertex_begin_, 
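
// Every bitmap helper in this file uses the same addressing: bit (id % 32) of
// word (id / 32), with BitsPWrd<uint32_t> == 32. The set-and-test idiom that
// set_nth_bit and the BFSStep functors rely on, in isolation (sketch):
__device__ inline bool set_bit_once(uint32_t* bmap, uint32_t id)
{
  uint32_t bit  = uint32_t{1} << (id % 32);
  uint32_t prev = atomicOr(bmap + (id / 32), bit);  // atomicOr returns the old word
  return (prev & bit) == 0;  // true iff this thread was the first to set the bit
}
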
vertex_end_; + if (is_mg) { + vertex_begin_ = graph.local_offsets[handle.get_comms().get_rank()]; + vertex_end_ = graph.local_offsets[handle.get_comms().get_rank()] + + graph.local_vertices[handle.get_comms().get_rank()]; + } else { + vertex_begin_ = 0; + vertex_end_ = graph.number_of_vertices; + } + auto count = thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(vertex_begin_), + thrust::make_counting_iterator(vertex_end_), + thrust::make_counting_iterator(0), + isolated_vertex_ids.begin(), + isDegreeZero(graph.offsets)) - + isolated_vertex_ids.begin(); + return static_cast(count); +} + +template +return_t collect_vectors(raft::handle_t const &handle, + rmm::device_vector &buffer_len, + rmm::device_vector &local, + return_t local_count, + rmm::device_vector &global) +{ + CHECK_CUDA(handle.get_stream()); + buffer_len.resize(handle.get_comms().get_size()); + auto my_rank = handle.get_comms().get_rank(); + buffer_len[my_rank] = static_cast(local_count); + handle.get_comms().allgather( + buffer_len.data().get() + my_rank, buffer_len.data().get(), 1, handle.get_stream()); + CHECK_CUDA(handle.get_stream()); + // buffer_len now contains the lengths of all local buffers + // for all ranks + + thrust::host_vector h_buffer_len = buffer_len; + // h_buffer_offsets has to be int because raft allgatherv expects + // int array for displacement vector. This should be changed in + // raft so that the displacement is templated + thrust::host_vector h_buffer_offsets(h_buffer_len.size()); + + thrust::exclusive_scan( + thrust::host, h_buffer_len.begin(), h_buffer_len.end(), h_buffer_offsets.begin()); + return_t global_buffer_len = h_buffer_len.back() + h_buffer_offsets.back(); + + handle.get_comms().allgatherv(local.data().get(), + global.data().get(), + h_buffer_len.data(), + h_buffer_offsets.data(), + handle.get_stream()); + CHECK_CUDA(handle.get_stream()); + return global_buffer_len; +} + +template +void add_to_bitmap(raft::handle_t const &handle, + rmm::device_vector &bmap, + rmm::device_vector &id, + return_t count) +{ + cudaStream_t stream = handle.get_stream(); + thrust::for_each(rmm::exec_policy(stream)->on(stream), + id.begin(), + id.begin() + count, + set_nth_bit(bmap.data().get())); + CHECK_CUDA(stream); +} + +// For all vertex ids i which are isolated (out degree is 0), set +// ith bit of isolated_bmap to 1 +template +void create_isolated_bitmap(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + rmm::device_vector &local_isolated_ids, + rmm::device_vector &global_isolated_ids, + rmm::device_vector &temp_buffer_len, + rmm::device_vector &isolated_bmap) +{ + size_t word_count = detail::number_of_words(graph.number_of_vertices); + local_isolated_ids.resize(graph.number_of_vertices); + global_isolated_ids.resize(graph.number_of_vertices); + temp_buffer_len.resize(handle.get_comms().get_size()); + isolated_bmap.resize(word_count); + + vertex_t local_isolated_count = populate_isolated_vertices(handle, graph, local_isolated_ids); + vertex_t global_isolated_count = collect_vectors( + handle, temp_buffer_len, local_isolated_ids, local_isolated_count, global_isolated_ids); + add_to_bitmap(handle, isolated_bmap, global_isolated_ids, global_isolated_count); +} + +template +return_t remove_duplicates(raft::handle_t const &handle, + rmm::device_vector &data, + return_t data_len) +{ + cudaStream_t stream = handle.get_stream(); + thrust::sort(rmm::exec_policy(stream)->on(stream), data.begin(), data.begin() + data_len); + auto unique_count = + 
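
// collect_vectors above assembles a variable-length allgather the standard
// way: allgather one length per rank, exclusive-scan the lengths into
// displacements, then allgatherv the payloads. The host-side index math with
// the standard library (sketch; the file itself uses thrust::exclusive_scan):
#include <numeric>
#include <vector>
std::vector<int> displacements(const std::vector<int>& lengths)
{
  std::vector<int> offs(lengths.size());
  std::exclusive_scan(lengths.begin(), lengths.end(), offs.begin(), 0);
  return offs;  // total received = offs.back() + lengths.back()
}
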
thrust::unique(rmm::exec_policy(stream)->on(stream), data.begin(), data.begin() + data_len) - + data.begin(); + return static_cast(unique_count); +} + +// Use the fact that any value in id array can only be in +// the range [id_begin, id_end) to create a unique set of +// ids. bmap is expected to be of the length +// id_end/BitsPWrd and is set to 0 initially +template +__global__ void remove_duplicates_kernel(uint32_t *bmap, + return_t *in_id, + return_t id_begin, + return_t id_end, + return_t count, + return_t *out_id, + return_t *out_count) +{ + return_t tid = blockIdx.x * blockDim.x + threadIdx.x; + return_t id; + if (tid < count) { + id = in_id[tid]; + } else { + // Invalid vertex id to avoid partial thread block execution + id = id_end; + } + + int acceptable_vertex = 0; + // If id is not in the acceptable range then set it to + // an invalid vertex id + if ((id >= id_begin) && (id < id_end)) { + uint32_t active_bit = static_cast(1) << (id % BitsPWrd); + uint32_t prev_word = atomicOr(bmap + (id / BitsPWrd), active_bit); + // If bit was set by this thread then the id is unique + if (!(prev_word & active_bit)) { acceptable_vertex = 1; } + } + + __shared__ return_t block_offset; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + int thread_write_offset; + int block_acceptable_vertex_count; + BlockScan(temp_storage) + .ExclusiveSum(acceptable_vertex, thread_write_offset, block_acceptable_vertex_count); + + // If the block is not going to write unique ids then return + if (block_acceptable_vertex_count == 0) { return; } + + if (threadIdx.x == 0) { + block_offset = cugraph::detail::traversal::atomicAdd( + out_count, static_cast(block_acceptable_vertex_count)); + } + __syncthreads(); + + if (acceptable_vertex) { out_id[block_offset + thread_write_offset] = id; } +} + +template +__global__ void remove_duplicates_kernel(uint32_t *bmap, + uint32_t *isolated_bmap, + return_t *in_id, + return_t id_begin, + return_t id_end, + return_t count, + return_t *out_id, + return_t *out_count) +{ + return_t tid = blockIdx.x * blockDim.x + threadIdx.x; + return_t id; + if (tid < count) { + id = in_id[tid]; + } else { + // Invalid vertex id to avoid partial thread block execution + id = id_end; + } + + int acceptable_vertex = 0; + // If id is not in the acceptable range then set it to + // an invalid vertex id + if ((id >= id_begin) && (id < id_end)) { + uint32_t active_bit = static_cast(1) << (id % BitsPWrd); + uint32_t prev_word = atomicOr(bmap + (id / BitsPWrd), active_bit); + // If bit was set by this thread then the id is unique + if (!(prev_word & active_bit)) { + // If id is isolated (out-degree == 0) then mark it as unacceptable + bool is_dst_isolated = active_bit & isolated_bmap[id / BitsPWrd]; + acceptable_vertex = !is_dst_isolated; + } + } + + __shared__ return_t block_offset; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + int thread_write_offset; + int block_acceptable_vertex_count; + BlockScan(temp_storage) + .ExclusiveSum(acceptable_vertex, thread_write_offset, block_acceptable_vertex_count); + + // If the block is not going to write unique ids then return + if (block_acceptable_vertex_count == 0) { return; } + + if (threadIdx.x == 0) { + block_offset = cugraph::detail::traversal::atomicAdd( + out_count, static_cast(block_acceptable_vertex_count)); + } + __syncthreads(); + + if (acceptable_vertex) { out_id[block_offset + thread_write_offset] = id; } +} + +template +return_t 
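
// Both remove_duplicates_kernel overloads compact the accepted ids with the
// standard block-scan pattern: exclusive-sum the per-thread 0/1 flags, reserve
// a contiguous output range for the whole block with one atomicAdd, then
// scatter each accepted id to base + its scan offset. The pattern in isolation
// (sketch; assumes 256 threads per block):
#include <cub/cub.cuh>
__global__ void compact_flags(const int* flags, const int* ids, int n,
                              int* out, int* out_count)
{
  typedef cub::BlockScan<int, 256> BlockScan;
  __shared__ typename BlockScan::TempStorage temp;
  __shared__ int base;
  int tid  = blockIdx.x * 256 + threadIdx.x;
  int flag = (tid < n) ? flags[tid] : 0;
  int offset, total;
  BlockScan(temp).ExclusiveSum(flag, offset, total);         // offset within block
  if (threadIdx.x == 0) base = atomicAdd(out_count, total);  // reserve output range
  __syncthreads();
  if (flag) out[base + offset] = ids[tid];
}
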
remove_duplicates(raft::handle_t const &handle, + rmm::device_vector &bmap, + rmm::device_vector &data, + return_t data_len, + return_t data_begin, + return_t data_end, + rmm::device_vector &out_data) +{ + cudaStream_t stream = handle.get_stream(); + + rmm::device_vector unique_count(1, 0); + + thrust::fill( + rmm::exec_policy(stream)->on(stream), bmap.begin(), bmap.end(), static_cast(0)); + constexpr return_t threads = 256; + return_t blocks = raft::div_rounding_up_safe(data_len, threads); + remove_duplicates_kernel<<>>(bmap.data().get(), + data.data().get(), + data_begin, + data_end, + data_len, + out_data.data().get(), + unique_count.data().get()); + CHECK_CUDA(stream); + return static_cast(unique_count[0]); +} + +template +vertex_t preprocess_input_frontier(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + rmm::device_vector &bmap, + rmm::device_vector &isolated_bmap, + rmm::device_vector &input_frontier, + vertex_t input_frontier_len, + rmm::device_vector &output_frontier) +{ + cudaStream_t stream = handle.get_stream(); + + vertex_t vertex_begin = graph.local_offsets[handle.get_comms().get_rank()]; + vertex_t vertex_end = graph.local_offsets[handle.get_comms().get_rank()] + + graph.local_vertices[handle.get_comms().get_rank()]; + rmm::device_vector unique_count(1, 0); + + thrust::fill( + rmm::exec_policy(stream)->on(stream), bmap.begin(), bmap.end(), static_cast(0)); + constexpr vertex_t threads = 256; + vertex_t blocks = raft::div_rounding_up_safe(input_frontier_len, threads); + remove_duplicates_kernel<<>>(bmap.data().get(), + isolated_bmap.data().get(), + input_frontier.data().get(), + vertex_begin, + vertex_end, + input_frontier_len, + output_frontier.data().get(), + unique_count.data().get()); + CHECK_CUDA(stream); + return static_cast(unique_count[0]); +} + +template +vertex_t preprocess_input_frontier(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + rmm::device_vector &bmap, + rmm::device_vector &input_frontier, + vertex_t input_frontier_len, + rmm::device_vector &output_frontier) +{ + cudaStream_t stream = handle.get_stream(); + + vertex_t vertex_begin = graph.local_offsets[handle.get_comms().get_rank()]; + vertex_t vertex_end = graph.local_offsets[handle.get_comms().get_rank()] + + graph.local_vertices[handle.get_comms().get_rank()]; + rmm::device_vector unique_count(1, 0); + + thrust::fill( + rmm::exec_policy(stream)->on(stream), bmap.begin(), bmap.end(), static_cast(0)); + constexpr vertex_t threads = 256; + vertex_t blocks = raft::div_rounding_up_safe(input_frontier_len, threads); + remove_duplicates_kernel<<>>(bmap.data().get(), + input_frontier.data().get(), + vertex_begin, + vertex_end, + input_frontier_len, + output_frontier.data().get(), + unique_count.data().get()); + CHECK_CUDA(stream); + return static_cast(unique_count[0]); +} + +template +__global__ void fill_kernel(vertex_t *distances, vertex_t count, vertex_t start_vertex) +{ + vertex_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) { return; } + if (tid == start_vertex) { + distances[tid] = vertex_t{0}; + } else { + distances[tid] = cugraph::detail::traversal::vec_t::max; + } +} + +template +void fill_max_dist(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph, + vertex_t start_vertex, + vertex_t global_number_of_vertices, + vertex_t *distances) +{ + if (distances == nullptr) { return; } + vertex_t array_size = global_number_of_vertices; + constexpr vertex_t threads = 256; + vertex_t blocks = raft::div_rounding_up_safe(array_size, 
threads); + fill_kernel<<>>(distances, array_size, start_vertex); +} + +template +vertex_t get_global_vertex_count(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph) +{ + rmm::device_vector id(1); + id[0] = *thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + graph.indices, + graph.indices + graph.number_of_edges); + handle.get_comms().allreduce( + id.data().get(), id.data().get(), 1, raft::comms::op_t::MAX, handle.get_stream()); + vertex_t max_vertex_id = id[0]; + + if ((graph.number_of_vertices - 1) > max_vertex_id) { + max_vertex_id = graph.number_of_vertices - 1; + } + + return max_vertex_id + 1; +} + +} // namespace detail + +} // namespace mg + +} // namespace cugraph diff --git a/cpp/src/traversal/mg/frontier_expand.cuh b/cpp/src/traversal/mg/frontier_expand.cuh new file mode 100644 index 00000000000..2733c319087 --- /dev/null +++ b/cpp/src/traversal/mg/frontier_expand.cuh @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "frontier_expand_kernels.cuh" +#include "vertex_binning.cuh" + +namespace cugraph { + +namespace mg { + +namespace detail { + +template +class FrontierExpand { + raft::handle_t const &handle_; + cugraph::GraphCSRView const &graph_; + VertexBinner dist_; + rmm::device_vector reorganized_vertices_; + edge_t vertex_begin_; + edge_t vertex_end_; + rmm::device_vector output_vertex_count_; + + public: + FrontierExpand(raft::handle_t const &handle, + cugraph::GraphCSRView const &graph) + : handle_(handle), graph_(graph) + { + bool is_mg = (handle.comms_initialized() && (graph.local_vertices != nullptr) && + (graph.local_offsets != nullptr)); + if (is_mg) { + reorganized_vertices_.resize(graph.local_vertices[handle_.get_comms().get_rank()]); + vertex_begin_ = graph.local_offsets[handle_.get_comms().get_rank()]; + vertex_end_ = graph.local_offsets[handle_.get_comms().get_rank()] + + graph.local_vertices[handle_.get_comms().get_rank()]; + } else { + reorganized_vertices_.resize(graph.number_of_vertices); + vertex_begin_ = 0; + vertex_end_ = graph.number_of_vertices; + } + output_vertex_count_.resize(1); + } + + // Return the size of the output_frontier + template + vertex_t operator()(operator_t op, + rmm::device_vector &input_frontier, + vertex_t input_frontier_len, + rmm::device_vector &output_frontier) + { + if (input_frontier_len == 0) { return static_cast(0); } + cudaStream_t stream = handle_.get_stream(); + output_vertex_count_[0] = 0; + dist_.setup(graph_.offsets, nullptr, vertex_begin_, vertex_end_); + auto distribution = + dist_.run(input_frontier, input_frontier_len, reorganized_vertices_, stream); + + DegreeBucket large_bucket = distribution.degreeRange(16); + // TODO : Use other streams from handle_ + large_vertex_lb(graph_, + large_bucket, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + + DegreeBucket medium_bucket = 
distribution.degreeRange(12, 16); + medium_vertex_lb(graph_, + medium_bucket, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + + DegreeBucket small_bucket_0 = distribution.degreeRange(10, 12); + DegreeBucket small_bucket_1 = distribution.degreeRange(8, 10); + DegreeBucket small_bucket_2 = distribution.degreeRange(6, 8); + DegreeBucket small_bucket_3 = distribution.degreeRange(0, 6); + + small_vertex_lb(graph_, + small_bucket_0, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + small_vertex_lb(graph_, + small_bucket_1, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + small_vertex_lb(graph_, + small_bucket_2, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + small_vertex_lb(graph_, + small_bucket_3, + op, + vertex_begin_, + output_frontier.data().get(), + output_vertex_count_.data().get(), + stream); + return output_vertex_count_[0]; + } +}; + +} // namespace detail + +} // namespace mg + +} // namespace cugraph diff --git a/cpp/src/traversal/mg/frontier_expand_kernels.cuh b/cpp/src/traversal/mg/frontier_expand_kernels.cuh new file mode 100644 index 00000000000..625ec0d956f --- /dev/null +++ b/cpp/src/traversal/mg/frontier_expand_kernels.cuh @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include "vertex_binning.cuh" + +namespace cugraph { + +namespace mg { + +namespace detail { + +template +__device__ void write_to_frontier(vertex_t const *thread_frontier, + int thread_frontier_count, + vertex_t *block_frontier, + int *block_frontier_count, + vertex_t *output_frontier, + edge_t *block_write_offset, + edge_t *output_frontier_count) +{ + // Set frontier count for block to 0 + if (threadIdx.x == 0) { *block_frontier_count = 0; } + __syncthreads(); + + // Find out where to write the thread frontier to shared memory + int thread_write_offset = atomicAdd(block_frontier_count, thread_frontier_count); + for (int i = 0; i < thread_frontier_count; ++i) { + block_frontier[i + thread_write_offset] = thread_frontier[i]; + } + __syncthreads(); + + // If the total number of frontiers for this block is 0 then return + if (*block_frontier_count == 0) { return; } + + // Find out where to write the block frontier to global memory + if (threadIdx.x == 0) { + *block_write_offset = cugraph::detail::traversal::atomicAdd( + output_frontier_count, static_cast(*block_frontier_count)); + } + __syncthreads(); + + // Write block frontier to global memory + for (int i = threadIdx.x; i < (*block_frontier_count); i += blockDim.x) { + output_frontier[(*block_write_offset) + i] = block_frontier[i]; + } +} + +template +__global__ void block_per_vertex(edge_t const *offsets, + vertex_t const *indices, + vertex_t const *input_frontier, + vertex_t input_frontier_count, + vertex_t vertex_begin, + vertex_t *output_frontier, + edge_t *output_frontier_count, + operator_t op) +{ + if (blockIdx.x >= input_frontier_count) { return; } + + __shared__ edge_t block_write_offset; + __shared__ vertex_t block_frontier[BlockSize * EdgesPerThread]; + __shared__ int block_frontier_count; + vertex_t thread_frontier[EdgesPerThread]; + + vertex_t source = input_frontier[blockIdx.x]; + edge_t beg_edge_offset = offsets[source]; + edge_t end_edge_offset = offsets[source + 1]; + + edge_t edge_offset = threadIdx.x + beg_edge_offset; + int num_iter = (end_edge_offset - beg_edge_offset + BlockSize - 1) / BlockSize; + + int thread_frontier_count = 0; + for (int i = 0; i < num_iter; ++i) { + if (edge_offset < end_edge_offset) { + vertex_t destination = indices[edge_offset]; + // If operator returns true then add to local frontier + if (op(source + vertex_begin, destination)) { + thread_frontier[thread_frontier_count++] = destination; + } + } + bool is_last_iter = (i == (num_iter - 1)); + bool is_nth_iter = (i % EdgesPerThread == 0); + // Write to frontier every EdgesPerThread iterations + // Or if it is the last iteration of the for loop + if (is_nth_iter || is_last_iter) { + write_to_frontier(thread_frontier, + thread_frontier_count, + block_frontier, + &block_frontier_count, + output_frontier, + &block_write_offset, + output_frontier_count); + thread_frontier_count = 0; + } + edge_offset += blockDim.x; + } +} + +template +__global__ void kernel_per_vertex(edge_t const *offsets, + vertex_t const *indices, + vertex_t const *input_frontier, + vertex_t input_frontier_count, + vertex_t vertex_begin, + vertex_t *output_frontier, + edge_t *output_frontier_count, + operator_t op) +{ + vertex_t current_vertex_index = 0; + __shared__ edge_t block_write_offset; + __shared__ vertex_t block_frontier[BlockSize * EdgesPerThread]; + __shared__ int block_frontier_count; + + edge_t stride = blockDim.x * gridDim.x; + vertex_t thread_frontier[EdgesPerThread]; + + while (current_vertex_index < input_frontier_count) { + 
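      // All blocks of the grid walk the input frontier together here: for each
      // source vertex every block starts at its own offset into the adjacency
      // list and the whole grid strides across it, so a single very high
      // degree vertex is spread over the entire GPU rather than being
      // serialized on one block.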
vertex_t source = input_frontier[current_vertex_index]; + edge_t beg_block_offset = offsets[source] + (blockIdx.x * blockDim.x); + edge_t end_block_offset = offsets[source + 1]; + int i = 0; + int thread_frontier_count = 0; + for (edge_t block_offset = beg_block_offset; block_offset < end_block_offset; + block_offset += stride) { + if (block_offset + threadIdx.x < end_block_offset) { + vertex_t destination = indices[block_offset + threadIdx.x]; + if (op(source + vertex_begin, destination)) { + thread_frontier[thread_frontier_count++] = destination; + } + } + bool is_last_iter = (block_offset + blockDim.x >= end_block_offset); + bool is_nth_iter = (i % EdgesPerThread == 0); + if (is_nth_iter || is_last_iter) { + write_to_frontier(thread_frontier, + thread_frontier_count, + block_frontier, + &block_frontier_count, + output_frontier, + &block_write_offset, + output_frontier_count); + thread_frontier_count = 0; + } + ++i; + } + ++current_vertex_index; + } +} + +template +void large_vertex_lb(cugraph::GraphCSRView const &graph, + DegreeBucket &bucket, + operator_t op, + vertex_t vertex_begin, + vertex_t *output_vertex_ids, + edge_t *output_vertex_ids_offset, + cudaStream_t stream) +{ + if (bucket.numberOfVertices != 0) { + const int block_size = 1024; + int block_count = (1 << (bucket.ceilLogDegreeStart - 8)); + kernel_per_vertex + <<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + CHECK_CUDA(stream); + } +} + +template +void medium_vertex_lb(cugraph::GraphCSRView const &graph, + DegreeBucket &bucket, + operator_t op, + vertex_t vertex_begin, + vertex_t *output_vertex_ids, + edge_t *output_vertex_ids_offset, + cudaStream_t stream) +{ + // Vertices with degrees 2^12 <= d < 2^16 are handled by this kernel + // Block size of 1024 is chosen to reduce wasted threads for a vertex + const int block_size = 1024; + int block_count = bucket.numberOfVertices; + if (block_count != 0) { + block_per_vertex + <<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + CHECK_CUDA(stream); + } +} + +template +void small_vertex_lb(cugraph::GraphCSRView const &graph, + DegreeBucket &bucket, + operator_t op, + vertex_t vertex_begin, + vertex_t *output_vertex_ids, + edge_t *output_vertex_ids_offset, + cudaStream_t stream) +{ + int block_count = bucket.numberOfVertices; + if (block_count == 0) { return; } + // For vertices with degree <= 32 block size of 32 is chosen + // For all vertices with degree d such that 2^x <= d < 2^x+1 + // the block size is chosen to be 2^x. 
This is done so that + // vertices with degrees 1.5*2^x are also handled in a load + // balanced way + int block_size = 512; + if (bucket.ceilLogDegreeEnd < 6) { + block_size = 32; + block_per_vertex<32, 8><<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + } else if (bucket.ceilLogDegreeEnd < 8) { + block_size = 64; + block_per_vertex<64, 8><<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + } else if (bucket.ceilLogDegreeEnd < 10) { + block_size = 128; + block_per_vertex<128, 8><<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + } else if (bucket.ceilLogDegreeEnd < 12) { + block_size = 512; + block_per_vertex<512, 4><<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + } else { + block_size = 512; + block_per_vertex<512, 4><<>>(graph.offsets, + graph.indices, + bucket.vertexIds, + bucket.numberOfVertices, + vertex_begin, + output_vertex_ids, + output_vertex_ids_offset, + op); + } + CHECK_CUDA(stream); +} + +} // namespace detail + +} // namespace mg + +} // namespace cugraph diff --git a/cpp/src/traversal/mg/vertex_binning.cuh b/cpp/src/traversal/mg/vertex_binning.cuh new file mode 100644 index 00000000000..3d8c963c466 --- /dev/null +++ b/cpp/src/traversal/mg/vertex_binning.cuh @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "common_utils.cuh"
+#include "vertex_binning_kernels.cuh"
+
+namespace cugraph {
+
+namespace mg {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t>
+struct DegreeBucket {
+  vertex_t* vertexIds;
+  vertex_t numberOfVertices;
+  edge_t ceilLogDegreeStart;
+  edge_t ceilLogDegreeEnd;
+};
+
+template <typename vertex_t, typename edge_t>
+class LogDistribution {
+  vertex_t* vertex_id_begin_;
+  thrust::host_vector<edge_t> bin_offsets_;
+
+ public:
+  LogDistribution(rmm::device_vector<vertex_t>& vertex_id, rmm::device_vector<edge_t>& bin_offsets)
+    : vertex_id_begin_(vertex_id.data().get()), bin_offsets_(bin_offsets)
+  {
+  }
+
+  DegreeBucket<vertex_t, edge_t> degreeRange(
+    edge_t ceilLogDegreeStart, edge_t ceilLogDegreeEnd = std::numeric_limits<edge_t>::max())
+  {
+    ceilLogDegreeStart = std::max(ceilLogDegreeStart, edge_t{0});
+    if (ceilLogDegreeEnd > static_cast<edge_t>(bin_offsets_.size()) - 2) {
+      ceilLogDegreeEnd = bin_offsets_.size() - 2;
+    }
+    return DegreeBucket<vertex_t, edge_t>{
+      vertex_id_begin_ + bin_offsets_[ceilLogDegreeStart + 1],
+      bin_offsets_[ceilLogDegreeEnd + 1] - bin_offsets_[ceilLogDegreeStart + 1],
+      ceilLogDegreeStart,
+      ceilLogDegreeEnd};
+  }
+};
+
+template <typename vertex_t, typename edge_t>
+class VertexBinner {
+  edge_t* offsets_;
+  uint32_t* active_bitmap_;
+  vertex_t vertex_begin_;
+  vertex_t vertex_end_;
+
+  rmm::device_vector<edge_t> tempBins_;
+  rmm::device_vector<edge_t> bin_offsets_;
+
+ public:
+  VertexBinner(void) : tempBins_(NumberBins<edge_t>), bin_offsets_(NumberBins<edge_t>) {}
+
+  void setup(edge_t* offsets, uint32_t* active_bitmap, vertex_t vertex_begin, vertex_t vertex_end)
+  {
+    offsets_       = offsets;
+    active_bitmap_ = active_bitmap;
+    vertex_begin_  = vertex_begin;
+    vertex_end_    = vertex_end;
+  }
+
+  LogDistribution<vertex_t, edge_t> run(rmm::device_vector<vertex_t>& reorganized_vertices,
+                                        cudaStream_t stream);
+
+  LogDistribution<vertex_t, edge_t> run(rmm::device_vector<vertex_t>& input_vertices,
+                                        vertex_t input_vertices_len,
+                                        rmm::device_vector<vertex_t>& reorganized_vertices,
+                                        cudaStream_t stream);
+};
+
+template <typename vertex_t, typename edge_t>
+LogDistribution<vertex_t, edge_t> VertexBinner<vertex_t, edge_t>::run(
+  rmm::device_vector<vertex_t>& reorganized_vertices, cudaStream_t stream)
+{
+  thrust::fill(
+    rmm::exec_policy(stream)->on(stream), bin_offsets_.begin(), bin_offsets_.end(), edge_t{0});
+  thrust::fill(rmm::exec_policy(stream)->on(stream), tempBins_.begin(), tempBins_.end(), edge_t{0});
+  bin_vertices(reorganized_vertices,
+               bin_offsets_,
+               tempBins_,
+               active_bitmap_,
+               offsets_,
+               vertex_begin_,
+               vertex_end_,
+               stream);
+
+  return LogDistribution<vertex_t, edge_t>(reorganized_vertices, bin_offsets_);
+}
+
+template <typename vertex_t, typename edge_t>
+LogDistribution<vertex_t, edge_t> VertexBinner<vertex_t, edge_t>::run(
+  rmm::device_vector<vertex_t>& input_vertices,
+  vertex_t input_vertices_len,
+  rmm::device_vector<vertex_t>& reorganized_vertices,
+  cudaStream_t stream)
+{
+  bin_vertices(input_vertices,
+               input_vertices_len,
+               reorganized_vertices,
+               bin_offsets_,
+               tempBins_,
+               offsets_,
+               vertex_begin_,
+               vertex_end_,
+               stream);
+
+  return LogDistribution<vertex_t, edge_t>(reorganized_vertices, bin_offsets_);
+}
+
+} // namespace detail
+
+} // namespace mg
+
+} // namespace cugraph
diff --git a/cpp/src/traversal/mg/vertex_binning_kernels.cuh b/cpp/src/traversal/mg/vertex_binning_kernels.cuh
new file mode 100644
index 00000000000..dbb339fea05
--- /dev/null
+++ b/cpp/src/traversal/mg/vertex_binning_kernels.cuh
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include "../traversal_common.cuh" + +namespace cugraph { + +namespace mg { + +namespace detail { + +template +__device__ inline typename std::enable_if<(sizeof(degree_t) == 4), int>::type ceilLog2_p1( + degree_t val) +{ + return BitsPWrd - __clz(val) + (__popc(val) > 1); +} + +template +__device__ inline typename std::enable_if<(sizeof(degree_t) == 8), int>::type ceilLog2_p1( + degree_t val) +{ + return BitsPWrd - __clzll(val) + (__popcll(val) > 1); +} + +template +__global__ void simple_fill(return_t *bin0, return_t *bin1, return_t count) +{ + for (return_t i = 0; i < count; i++) { + bin0[i] = 0; + bin1[i] = 0; + } +} + +template +__global__ void exclusive_scan(return_t *data, return_t *out) +{ + constexpr int BinCount = NumberBins; + return_t lData[BinCount]; + thrust::exclusive_scan(thrust::seq, data, data + BinCount, lData); + for (int i = 0; i < BinCount; ++i) { + out[i] = lData[i]; + data[i] = lData[i]; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Queue enabled kernels +//////////////////////////////////////////////////////////////////////////////// + +// Given the CSR offsets of vertices and the related active bit map +// count the number of vertices that belong to a particular bin where +// vertex with degree d such that 2^x < d <= 2^x+1 belong to bin (x+1) +// Vertices with degree 0 are counted in bin 0 +// In this function, any id in vertex_ids array is only acceptable as long +// as its value is between vertex_begin and vertex_end +template +__global__ void count_bin_sizes(edge_t *bins, + edge_t const *offsets, + vertex_t const *vertex_ids, + edge_t const vertex_id_count, + vertex_t vertex_begin, + vertex_t vertex_end) +{ + using cugraph::detail::traversal::atomicAdd; + constexpr int BinCount = NumberBins; + __shared__ edge_t lBin[BinCount]; + for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { lBin[i] = 0; } + __syncthreads(); + + for (vertex_t i = threadIdx.x + (blockIdx.x * blockDim.x); i < vertex_id_count; + i += gridDim.x * blockDim.x) { + auto source = vertex_ids[i]; + if ((source >= vertex_begin) && (source < vertex_end)) { + // Take care of OPG partitioning + // source logical vertex resides from offsets[source - vertex_begin] + // to offsets[source - vertex_begin + 1] + source -= vertex_begin; + auto degree = offsets[source + 1] - offsets[source]; + atomicAdd(lBin + ceilLog2_p1(degree), edge_t{1}); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < BinCount; i += blockDim.x) { atomicAdd(bins + i, lBin[i]); } +} + +// Bin vertices to the appropriate bins by taking into account +// the starting offsets calculated by count_bin_sizes +template +__global__ void create_vertex_bins(vertex_t *out_vertex_ids, + edge_t *bin_offsets, + edge_t const *offsets, + vertex_t *in_vertex_ids, + edge_t const vertex_id_count, + vertex_t vertex_begin, + vertex_t vertex_end) +{ + using cugraph::detail::traversal::atomicAdd; + constexpr int BinCount = NumberBins; + __shared__ edge_t lBin[BinCount]; + __shared__ int lPos[BinCount]; + if (threadIdx.x < BinCount) { + 
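    // Two-phase binning: the block first histograms its vertices into the
    // shared-memory bins (lBin), remembering each thread's slot within its
    // bin, then one atomicAdd per bin on the global bin_offsets reserves a
    // contiguous output range, and each vertex is finally written to its slot.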
lBin[threadIdx.x] = 0; + lPos[threadIdx.x] = 0; + } + __syncthreads(); + + vertex_t vertex_index = (threadIdx.x + blockIdx.x * blockDim.x); + bool is_valid_vertex = (vertex_index < vertex_id_count); + vertex_t source; + + if (is_valid_vertex) { + source = in_vertex_ids[vertex_index]; + is_valid_vertex = ((source >= vertex_begin) && (source < vertex_end)); + source -= vertex_begin; + } + + int threadBin; + edge_t threadPos; + if (is_valid_vertex) { + threadBin = ceilLog2_p1(offsets[source + 1] - offsets[source]); + threadPos = atomicAdd(lBin + threadBin, edge_t{1}); + } + __syncthreads(); + + if (threadIdx.x < BinCount) { + lPos[threadIdx.x] = atomicAdd(bin_offsets + threadIdx.x, lBin[threadIdx.x]); + } + __syncthreads(); + + if (is_valid_vertex) { out_vertex_ids[lPos[threadBin] + threadPos] = source; } +} + +template +void bin_vertices(rmm::device_vector &input_vertex_ids, + vertex_t input_vertex_ids_len, + rmm::device_vector &reorganized_vertex_ids, + rmm::device_vector &bin_count_offsets, + rmm::device_vector &bin_count, + edge_t *offsets, + vertex_t vertex_begin, + vertex_t vertex_end, + cudaStream_t stream) +{ + simple_fill<<<1, 1, 0, stream>>>( + bin_count_offsets.data().get(), bin_count.data().get(), static_cast(bin_count.size())); + + const uint32_t BLOCK_SIZE = 512; + uint32_t blocks = ((input_vertex_ids_len) + BLOCK_SIZE - 1) / BLOCK_SIZE; + count_bin_sizes + <<>>(bin_count.data().get(), + offsets, + input_vertex_ids.data().get(), + static_cast(input_vertex_ids_len), + vertex_begin, + vertex_end); + + exclusive_scan<<<1, 1, 0, stream>>>(bin_count.data().get(), bin_count_offsets.data().get()); + + create_vertex_bins + <<>>(reorganized_vertex_ids.data().get(), + bin_count.data().get(), + offsets, + input_vertex_ids.data().get(), + static_cast(input_vertex_ids_len), + vertex_begin, + vertex_end); +} + +} // namespace detail + +} // namespace mg + +} // namespace cugraph diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index f47583fdc9a..4018c9d9878 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ // Author: Prasun Gera pgera@nvidia.com -#include #include +#include #include "graph.hpp" @@ -211,10 +211,8 @@ void SSSP::traverse(IndexType source_vertex) cudaMemcpyAsync( distances, next_distances, n * sizeof(DistType), cudaMemcpyDeviceToDevice, stream); - CUDA_CHECK_LAST(); - // We need nf for the loop - cudaStreamSynchronize(stream); + CUDA_TRY(cudaStreamSynchronize(stream)); // Swap frontiers // IndexType *tmp = frontier; @@ -244,7 +242,7 @@ void SSSP::clean() * @file sssp.cu * --------------------------------------------------------------------------*/ template -void sssp(experimental::GraphCSRView const &graph, +void sssp(GraphCSRView const &graph, WT *distances, VT *predecessors, const VT source_vertex) @@ -283,7 +281,7 @@ void sssp(experimental::GraphCSRView const &graph, } else { // SSSP is not defined for graphs with negative weight cycles // Warn user about any negative edges - if (graph.prop.has_negative_edges == experimental::PropType::PROP_TRUE) + if (graph.prop.has_negative_edges == PropType::PROP_TRUE) std::cerr << "WARN: The graph has negative weight edges. 
SSSP will not " "converge if the graph has negative weight cycles\n"; edge_weights_ptr = graph.edge_data; @@ -295,11 +293,11 @@ void sssp(experimental::GraphCSRView const &graph, } // explicit instantiation -template void sssp(experimental::GraphCSRView const &graph, +template void sssp(GraphCSRView const &graph, float *distances, int *predecessors, const int source_vertex); -template void sssp(experimental::GraphCSRView const &graph, +template void sssp(GraphCSRView const &graph, double *distances, int *predecessors, const int source_vertex); diff --git a/cpp/src/traversal/sssp.cuh b/cpp/src/traversal/sssp.cuh index 16dcecf33de..fac66e3d47e 100644 --- a/cpp/src/traversal/sssp.cuh +++ b/cpp/src/traversal/sssp.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/traversal/sssp_kernels.cuh b/cpp/src/traversal/sssp_kernels.cuh index d778372af41..d96540b22b9 100644 --- a/cpp/src/traversal/sssp_kernels.cuh +++ b/cpp/src/traversal/sssp_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,9 @@ #include -#include #include #include "traversal_common.cuh" -#include "utilities/error_utils.h" +#include "utilities/error.hpp" namespace cugraph { namespace detail { namespace sssp_kernels { @@ -548,7 +547,7 @@ void frontier_expand(const IndexType* row_ptr, predecessors, edge_mask); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } } // namespace sssp_kernels } // namespace detail diff --git a/cpp/src/traversal/traversal_common.cuh b/cpp/src/traversal/traversal_common.cuh index ca36d7edb79..2802fb94be8 100644 --- a/cpp/src/traversal/traversal_common.cuh +++ b/cpp/src/traversal/traversal_common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ #pragma once #include -#include "utilities/error_utils.h" +#include "utilities/error.hpp" #define MAXBLOCKS 65535 #define WARP_SIZE 32 @@ -107,6 +107,20 @@ struct vec_t { static const int max = std::numeric_limits::max(); }; +template <> +struct vec_t { + typedef long4 vec4; + typedef long2 vec2; + static const long max = std::numeric_limits::max(); +}; + +template <> +struct vec_t { + typedef uint4 vec4; + typedef uint2 vec2; + static const unsigned max = std::numeric_limits::max(); +}; + template <> struct vec_t { typedef longlong4 vec4; @@ -184,7 +198,7 @@ void fill_vec(ValueType* vec, SizeType n, ValueType val, cudaStream_t stream) grid.x = (n + block.x - 1) / block.x; fill_vec_kernel<<>>(vec, n, val); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -204,6 +218,24 @@ binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexT } } +// FIXME: The atomicAdd wrappers should be moved to RAFT + +template +__device__ static __forceinline__ T atomicAdd(T* addr, T val) +{ + return ::atomicAdd(addr, val); +} + +template <> +__device__ __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) +{ + static_assert(sizeof(int64_t) == sizeof(unsigned long long), + "sizeof(int64_t) != sizeof(unsigned long long). Can't use atomicAdd"); + + return ::atomicAdd(reinterpret_cast(addr), + static_cast(val)); +} + __device__ static __forceinline__ float atomicMin(float* addr, float val) { int* addr_as_int = (int*)addr; @@ -286,7 +318,7 @@ __global__ void flag_isolated_vertices_kernel(IndexType n, int local_isolated_bmap = 0; - IndexType imax = (n - thread_off); + IndexType imax = (n > thread_off) ? (n - thread_off) : 0; IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; @@ -314,7 +346,7 @@ __global__ void flag_isolated_vertices_kernel(IndexType n, IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - if (threadIdx.x == 0 && total_nisolated) { atomicAdd(nisolated, total_nisolated); } + if (threadIdx.x == 0 && total_nisolated) { traversal::atomicAdd(nisolated, total_nisolated); } int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; @@ -347,7 +379,7 @@ void flag_isolated_vertices(IndexType n, flag_isolated_vertices_kernel<<>>( n, isolated_bmap, row_ptr, degrees, nisolated); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } template @@ -374,7 +406,7 @@ void set_frontier_degree(IndexType* frontier_degree, block.x = 256; grid.x = min((n + block.x - 1) / block.x, (IndexType)MAXBLOCKS); set_frontier_degree_kernel<<>>(frontier_degree, frontier, degree, n); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } template @@ -439,7 +471,7 @@ void compute_bucket_offsets(IndexType* cumul, compute_bucket_offsets_kernel<<>>( cumul, bucket_offsets, frontier_size, total_degree); - CUDA_CHECK_LAST(); + CHECK_CUDA(m_stream); } } // namespace traversal } // namespace detail diff --git a/cpp/src/traversal/two_hop_neighbors.cu b/cpp/src/traversal/two_hop_neighbors.cu index dc46d56910c..fb984dae0ad 100644 --- a/cpp/src/traversal/two_hop_neighbors.cu +++ b/cpp/src/traversal/two_hop_neighbors.cu @@ -20,9 +20,9 @@ * ---------------------------------------------------------------------------**/ #include -#include #include #include +#include #include "two_hop_neighbors.cuh" #include @@ -32,8 +32,7 @@ namespace cugraph { template -std::unique_ptr> get_two_hop_neighbors( - experimental::GraphCSRView const &graph) +std::unique_ptr> get_two_hop_neighbors(GraphCSRView const &graph) { cudaStream_t stream{nullptr}; @@ -109,8 +108,7 
@@ std::unique_ptr> get_two_hop_neighbo // Get things ready to return ET outputSize = tuple_end - tuple_start; - auto result = std::make_unique>( - graph.number_of_vertices, outputSize, false); + auto result = std::make_unique>(graph.number_of_vertices, outputSize, false); cudaMemcpy(result->src_indices(), d_first_pair, sizeof(VT) * outputSize, cudaMemcpyDefault); cudaMemcpy(result->dst_indices(), d_second_pair, sizeof(VT) * outputSize, cudaMemcpyDefault); @@ -118,10 +116,10 @@ std::unique_ptr> get_two_hop_neighbo return result; } -template std::unique_ptr> get_two_hop_neighbors( - experimental::GraphCSRView const &); +template std::unique_ptr> get_two_hop_neighbors( + GraphCSRView const &); -template std::unique_ptr> get_two_hop_neighbors( - experimental::GraphCSRView const &); +template std::unique_ptr> get_two_hop_neighbors( + GraphCSRView const &); } // namespace cugraph diff --git a/cpp/src/traversal/two_hop_neighbors.cuh b/cpp/src/traversal/two_hop_neighbors.cuh index fd29b3e5140..87d3b36b861 100644 --- a/cpp/src/traversal/two_hop_neighbors.cuh +++ b/cpp/src/traversal/two_hop_neighbors.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/utilities/cuda_utils.cuh b/cpp/src/utilities/cuda_utils.cuh deleted file mode 100644 index dfb407aa35d..00000000000 --- a/cpp/src/utilities/cuda_utils.cuh +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace cugraph { -// -// This should go into RAFT... 
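// Aside: the removed helpers below emulate 64-bit atomics with a
// compare-and-swap loop, the usual fallback when no native instruction
// exists. A minimal sketch of the pattern (cas_add64 is an illustrative
// name, not part of this codebase):
//
//   __device__ int64_t cas_add64(int64_t *addr, int64_t val)
//   {
//     auto *a = reinterpret_cast<unsigned long long *>(addr);
//     unsigned long long old = *a, expected;
//     do {
//       expected = old;
//       // two's-complement addition is sign agnostic, so unsigned math is safe
//       old = atomicCAS(a, expected, expected + static_cast<unsigned long long>(val));
//     } while (expected != old);
//     return static_cast<int64_t>(old);
//   }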
-// -__device__ static __forceinline__ int64_t atomicMin(int64_t *addr, int64_t val) -{ - unsigned long long *addr_as_ull{reinterpret_cast(addr)}; - unsigned long long *val_addr_as_ull{reinterpret_cast(&val)}; - unsigned long long old = *addr_as_ull; - unsigned long long val_as_ull = *val_addr_as_ull; - int64_t *p_old{reinterpret_cast(&old)}; - unsigned long long expected; - - do { - expected = old; - old = ::atomicCAS(addr_as_ull, expected, thrust::min(val_as_ull, expected)); - } while (expected != old); - return *p_old; -} - -__device__ static __forceinline__ int32_t atomicMin(int32_t *addr, int32_t val) -{ - return ::atomicMin(addr, val); -} - -__device__ static __forceinline__ int64_t atomicAdd(int64_t *addr, int64_t val) -{ - unsigned long long *addr_as_ull{reinterpret_cast(addr)}; - unsigned long long *val_addr_as_ull{reinterpret_cast(&val)}; - unsigned long long old = *addr_as_ull; - unsigned long long val_as_ull = *val_addr_as_ull; - int64_t *p_old{reinterpret_cast(&old)}; - unsigned long long expected; - - do { - expected = old; - old = ::atomicCAS(addr_as_ull, expected, (expected + val_as_ull)); - } while (expected != old); - return *p_old; -} - -__device__ static __forceinline__ int32_t atomicAdd(int32_t *addr, int32_t val) -{ - return ::atomicAdd(addr, val); -} - -__device__ static __forceinline__ int32_t atomicAdd(int32_t volatile *addr, int32_t val) -{ - return ::atomicAdd(const_cast(addr), val); -} - -__device__ static __forceinline__ double atomicAdd(double volatile *addr, double val) -{ - return ::atomicAdd(const_cast(addr), val); -} - -__device__ static __forceinline__ float atomicAdd(float volatile *addr, float val) -{ - return ::atomicAdd(const_cast(addr), val); -} - -__device__ static __forceinline__ int32_t atomicCAS(int32_t volatile *addr, - int32_t expected, - int32_t val) -{ - return ::atomicCAS(const_cast(addr), expected, val); -} - -} // namespace cugraph diff --git a/cpp/src/utilities/cusparse_helper.cu b/cpp/src/utilities/cusparse_helper.cu deleted file mode 100644 index 43d19f74547..00000000000 --- a/cpp/src/utilities/cusparse_helper.cu +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include "cusparse_helper.h" - -namespace cugraph { -namespace detail { -cusparseHandle_t Cusparse::m_handle = 0; - -template -CusparseCsrMV::CusparseCsrMV() -{ - if (sizeof(ValueType) == 4) - cuda_type = CUDA_R_32F; - else - cuda_type = CUDA_R_64F; - CHECK_CUSPARSE(cusparseCreateMatDescr(&descrA)); - CHECK_CUSPARSE(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); - CHECK_CUSPARSE(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - // alg = CUSPARSE_ALG_MERGE_PATH; - alg = CUSPARSE_ALG_NAIVE; - stream = nullptr; -} - -template -CusparseCsrMV::~CusparseCsrMV() -{ -} - -template -void CusparseCsrMV::setup(int m, - int n, - int nnz, - const ValueType* alpha, - const ValueType* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const ValueType* x, - const ValueType* beta, - ValueType* y) -{ - CHECK_CUSPARSE(cusparseCsrmvEx_bufferSize(Cusparse::get_handle(), - alg, - CUSPARSE_OPERATION_NON_TRANSPOSE, - m, - n, - nnz, - alpha, - cuda_type, - descrA, - csrValA, - cuda_type, - csrRowPtrA, - csrColIndA, - x, - cuda_type, - beta, - cuda_type, - y, - cuda_type, - cuda_type, - &spmv_temp_storage_bytes)); - spmv_temp_storage.resize(spmv_temp_storage_bytes, stream); - spmv_d_temp_storage = spmv_temp_storage.data(); -} -template -void CusparseCsrMV::run(int m, - int n, - int nnz, - const ValueType* alpha, - const ValueType* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const ValueType* x, - const ValueType* beta, - ValueType* y) -{ - CHECK_CUSPARSE(cusparseCsrmvEx(Cusparse::get_handle(), - alg, - CUSPARSE_OPERATION_NON_TRANSPOSE, - m, - n, - nnz, - alpha, - cuda_type, - descrA, - csrValA, - cuda_type, - csrRowPtrA, - csrColIndA, - x, - cuda_type, - beta, - cuda_type, - y, - cuda_type, - cuda_type, - spmv_d_temp_storage)); -} - -template class CusparseCsrMV; -template class CusparseCsrMV; - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/cusparse_helper.h b/cpp/src/utilities/cusparse_helper.h deleted file mode 100644 index d206c824bb6..00000000000 --- a/cpp/src/utilities/cusparse_helper.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include -#include "utilities/graph_utils.cuh" - -namespace cugraph { -namespace detail { - -#define CHECK_CUSPARSE(call) \ - { \ - cusparseStatus_t _e = (call); \ - if (_e != CUSPARSE_STATUS_SUCCESS) { CUGRAPH_FAIL("CUSPARSE ERROR"); } \ - } - -class Cusparse { - private: - // global CUSPARSE handle for nvgraph - static cusparseHandle_t m_handle; // Constructor. - Cusparse(); - // Destructor. - ~Cusparse(); - - public: - // Get the handle. 
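// get_handle() lazily creates a single process-wide cuSPARSE handle on first
// use; destroy_handle() below destroys it and resets the singleton so a later
// call can recreate it.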
- static cusparseHandle_t get_handle() - { - if (m_handle == 0) CHECK_CUSPARSE(cusparseCreate(&m_handle)); - return m_handle; - } - // Destroy handle - static void destroy_handle() - { - if (m_handle != 0) CHECK_CUSPARSE(cusparseDestroy(m_handle)); - m_handle = 0; - } -}; - -template -class CusparseCsrMV { - private: - cusparseMatDescr_t descrA; - cudaDataType cuda_type; - cusparseAlgMode_t alg; - rmm::device_buffer spmv_temp_storage; - void* spmv_d_temp_storage; - size_t spmv_temp_storage_bytes; - cudaStream_t stream; - - public: - CusparseCsrMV(); - - ~CusparseCsrMV(); - void setup(int m, - int n, - int nnz, - const ValueType* alpha, - const ValueType* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const ValueType* x, - const ValueType* beta, - ValueType* y); - void run(int m, - int n, - int nnz, - const ValueType* alpha, - const ValueType* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const ValueType* x, - const ValueType* beta, - ValueType* y); -}; - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h deleted file mode 100644 index 25179dd201b..00000000000 --- a/cpp/src/utilities/error_utils.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef ERRORUTILS_HPP -#define ERRORUTILS_HPP - -#include -#include -#include -#include - -#include - -namespace cugraph { -/**---------------------------------------------------------------------------* - * @brief Exception thrown when logical precondition is violated. - * - * This exception should not be thrown directly and is instead thrown by the - * CUGRAPH_EXPECTS macro. - * - *---------------------------------------------------------------------------**/ -struct logic_error : public std::logic_error { - logic_error(char const* const message) : std::logic_error(message) {} - - logic_error(std::string const& message) : std::logic_error(message) {} - - // TODO Add an error code member? This would be useful for translating an - // exception to an error code in a pure-C API -}; -/**---------------------------------------------------------------------------* - * @brief Exception thrown when a CUDA error is encountered. - * - *---------------------------------------------------------------------------**/ -struct cuda_error : public std::runtime_error { - cuda_error(std::string const& message) : std::runtime_error(message) {} -}; -} // namespace cugraph - -#define STRINGIFY_DETAIL(x) #x -#define CUGRAPH_STRINGIFY(x) STRINGIFY_DETAIL(x) - -/**---------------------------------------------------------------------------* - * @brief Macro for checking (pre-)conditions that throws an exception when - * a condition is violated. 
- * - * Example usage: - * - * @code - * CUGRAPH_EXPECTS(lhs->dtype == rhs->dtype, "Column type mismatch"); - * @endcode - * - * @param[in] cond Expression that evaluates to true or false - * @param[in] reason String literal description of the reason that cond is - * expected to be true - * @throw cugraph::logic_error if the condition evaluates to false. - *---------------------------------------------------------------------------**/ -#define CUGRAPH_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw cugraph::logic_error("CUGRAPH failure at: " __FILE__ \ - ":" CUGRAPH_STRINGIFY(__LINE__) ": " reason) - -/**---------------------------------------------------------------------------* - * @brief Try evaluation an expression with a gdf_error type, - * and throw an appropriate exception if it fails. - *---------------------------------------------------------------------------**/ -#define CUGRAPH_TRY(_gdf_error_expression) \ - do { \ - auto _evaluated = _gdf_error_expression; \ - if (_evaluated == GDF_SUCCESS) { break; } \ - throw cugraph::logic_error( \ - ("CUGRAPH error " + std::string(gdf_error_get_name(_evaluated)) + \ - " at " __FILE__ \ - ":" CUGRAPH_STRINGIFY(__LINE__) " evaluating " CUGRAPH_STRINGIFY(#_gdf_error_expression)) \ - .c_str()); \ - } while (0) - -/**---------------------------------------------------------------------------* - * @brief Indicates that an erroneous code path has been taken. - * - * In host code, throws a `cugraph::logic_error`. - * - * - * Example usage: - * ``` - * CUGRAPH_FAIL("Non-arithmetic operation is not supported"); - * ``` - * - * @param[in] reason String literal description of the reason - *---------------------------------------------------------------------------**/ -#define CUGRAPH_FAIL(reason) \ - throw cugraph::logic_error("cuGraph failure at: " __FILE__ \ - ":" CUGRAPH_STRINGIFY(__LINE__) ": " reason) - -namespace cugraph { -namespace detail { - -inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int line) -{ - throw cugraph::cuda_error(std::string{"CUDA error encountered at: " + std::string{file} + ":" + - std::to_string(line) + ": " + std::to_string(error) + " " + - cudaGetErrorName(error) + " " + cudaGetErrorString(error)}); -} - -inline void check_stream(cudaStream_t stream, const char* file, unsigned int line) -{ - cudaError_t error{cudaSuccess}; - error = cudaStreamSynchronize(stream); - if (cudaSuccess != error) { throw_cuda_error(error, file, line); } - - error = cudaGetLastError(); - if (cudaSuccess != error) { throw_cuda_error(error, file, line); } -} -} // namespace detail -} // namespace cugraph - -/**---------------------------------------------------------------------------* - * @brief Error checking macro for CUDA runtime API functions. - * - * Invokes a CUDA runtime API function call, if the call does not return - * cudaSuccess, throws an exception detailing the CUDA error that occurred. - * - * This macro supersedes GDF_REQUIRE and should be preferred in all instances. - * GDF_REQUIRE should be considered deprecated. 
- * - *---------------------------------------------------------------------------**/ -#ifndef CUDA_TRY -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = (call); \ - if (cudaSuccess != status) { cugraph::detail::throw_cuda_error(status, __FILE__, __LINE__); } \ - } while (0); -#endif -#endif - -#define CUDA_CHECK_LAST() \ - { \ - cudaError_t const status = cudaGetLastError(); \ - if (status != cudaSuccess) { cugraph::detail::throw_cuda_error(status, __FILE__, __LINE__); } \ - } - -/**---------------------------------------------------------------------------* - * @brief Debug macro to synchronize a stream and check for CUDA errors - * - * In a non-release build, this macro will synchronize the specified stream, and - * check for any CUDA errors returned from cudaGetLastError. If an error is - * reported, an exception is thrown detailing the CUDA error that occurred. - * - * The intent of this macro is to provide a mechanism for synchronous and - * deterministic execution for debugging asynchronous CUDA execution. It should - * be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an - * asynchronous kernel launch. - * - * Similar to assert(), it is only present in non-Release builds. - * - *---------------------------------------------------------------------------**/ -#ifndef NDEBUG -#define CHECK_STREAM(stream) cugraph::detail::check_stream((stream), __FILE__, __LINE__) -#else -#define CHECK_STREAM(stream) static_cast(0) -#endif - -/**---------------------------------------------------------------------------* - * @brief Macro for checking graph object that throws an exception when - * a condition is violated. - * - * Example usage: - * - * @code - * CHECK_GRAPH(graph); - * @endcode - * - * @param[in] the Graph class - * @throw cugraph::logic_error if the condition evaluates to false. - *---------------------------------------------------------------------------**/ -#define CHECK_GRAPH(graph) \ - CUGRAPH_EXPECTS(graph != nullptr, "Invalid API parameter: graph is NULL"); \ - CUGRAPH_EXPECTS(graph->adjList != nullptr || graph->edgeList != nullptr, \ - "Invalid API parameter: graph is empty"); diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh index efad365aa96..6b7e8558e86 100644 --- a/cpp/src/utilities/graph_utils.cuh +++ b/cpp/src/utilities/graph_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -13,58 +13,28 @@ // Author: Alex Fender afender@nvidia.com #pragma once +#include + +#include +#include +#include + #include #include -//#include -//#include #include #include #include #include #include -#include -#include - namespace cugraph { namespace detail { -#define USE_CG 1 //#define DEBUG 1 #define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 // kernefgdfl will launch at most 256 threads per block -#define DEFAULT_MASK 0xffffffff +#define CUDA_MAX_KERNEL_THREADS 256 // kernel will launch at most 256 threads per block #define US -template -static __device__ __forceinline__ T -shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_up_sync(mask, r, offset, bound); -#else - return __shfl_up(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -template -static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0.0f; -#endif -} - template __inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind, value_t const *w) { @@ -90,14 +60,14 @@ __inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind, // iterations it is the value at the last thread of the previous iterations. // get the value of the last thread - last = shfl(sum, blockDim.x - 1, blockDim.x); + last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); // if you are valid read the value from memory, otherwise set your value to 0 sum = (valid) ? 
w[ind[i]] : 0.0; // do prefix sum (of size warpSize=blockDim.x =< 32) for (j = 1; j < blockDim.x; j *= 2) { - v = shfl_up(sum, j, blockDim.x); + v = __shfl_up_sync(raft::warp_full_mask(), sum, j, blockDim.x); if (threadIdx.x >= j) sum += v; } // shift by last @@ -105,7 +75,7 @@ __inline__ __device__ value_t parallel_prefix_sum(count_t n, index_t const *ind, // notice that no __threadfence or __syncthreads are needed in this implementation } // get the value of the last thread (to all threads) - last = shfl(sum, blockDim.x - 1, blockDim.x); + last = __shfl_sync(raft::warp_full_mask(), sum, blockDim.x - 1, blockDim.x); return last; } @@ -120,7 +90,7 @@ T dot(size_t n, T *x, T *y) thrust::device_pointer_cast(x + n), thrust::device_pointer_cast(y), 0.0f); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return result; } @@ -142,7 +112,7 @@ void axpy(size_t n, T a, T *x, T *y) thrust::device_pointer_cast(y), thrust::device_pointer_cast(y), axpy_functor(a)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // norm @@ -162,7 +132,7 @@ T nrm2(size_t n, T *x) square(), init, thrust::plus())); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return result; } @@ -173,7 +143,7 @@ T nrm1(size_t n, T *x) T result = thrust::reduce(rmm::exec_policy(stream)->on(stream), thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return result; } @@ -187,7 +157,7 @@ void scal(size_t n, T val, T *x) thrust::make_constant_iterator(val), thrust::device_pointer_cast(x), thrust::multiplies()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -200,7 +170,7 @@ void addv(size_t n, T val, T *x) thrust::make_constant_iterator(val), thrust::device_pointer_cast(x), thrust::plus()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -211,7 +181,7 @@ void fill(size_t n, T *x, T value) thrust::device_pointer_cast(x), thrust::device_pointer_cast(x + n), value); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -223,7 +193,7 @@ void scatter(size_t n, T *src, T *dst, M *map) thrust::device_pointer_cast(src + n), thrust::device_pointer_cast(map), thrust::device_pointer_cast(dst)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -237,7 +207,7 @@ void printv(size_t n, T *vec, int offset) dev_ptr + offset + n, std::ostream_iterator( std::cout, " ")); // Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) 
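// Aside: the shuffle-based scan used by parallel_prefix_sum above, shown in
// isolation. A hypothetical warp-wide inclusive sum (warp_inclusive_scan is
// not a name from this patch); each lane adds the value j lanes below it,
// doubling j, so after log2(32) steps lane i holds the sum of lanes 0..i.
__device__ float warp_inclusive_scan(float x)
{
  for (int j = 1; j < 32; j *= 2) {
    float v = __shfl_up_sync(raft::warp_full_mask(), x, j);  // value from lane (lane_id - j)
    if ((threadIdx.x % 32) >= j) { x += v; }
  }
  return x;
}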
- CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); std::cout << std::endl; } @@ -248,7 +218,7 @@ void copy(size_t n, T *x, T *res) thrust::device_ptr res_ptr(res); cudaStream_t stream{nullptr}; thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -273,36 +243,39 @@ void update_dangling_nodes(size_t n, T *dangling_nodes, T damping_factor) thrust::device_pointer_cast(dangling_nodes), dangling_functor(1.0 - damping_factor), is_zero()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } // google matrix kernels template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_coo(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) +__global__ void degree_coo(const IndexType n, + const IndexType e, + const IndexType *ind, + ValueType *degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) atomicAdd(°ree[ind[i]], (ValueType)1.0); } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark) +__global__ void flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark) { for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) if (degree[i] == 0) bookmark[i] = 1.0; } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_offsets(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) +__global__ void degree_offsets(const IndexType n, + const IndexType e, + const IndexType *ind, + ValueType *degree) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) degree[i] += ind[i + 1] - ind[i]; } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) type_convert(FromType *array, int n) +__global__ void type_convert(FromType *array, int n) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) { ToType val = array[i]; @@ -312,12 +285,12 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) type_convert(FromType } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) equi_prob3(const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) +__global__ void equi_prob3(const IndexType n, + const IndexType e, + const IndexType *csrPtr, + const IndexType *csrInd, + ValueType *val, + IndexType *degree) { int j, row, col; for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { @@ -331,12 +304,12 @@ __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) equi_prob3(const Inde } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) equi_prob2(const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) +__global__ void equi_prob2(const IndexType n, + const IndexType e, + const IndexType *csrPtr, + const IndexType *csrInd, + ValueType *val, + IndexType *degree) { int row = blockIdx.x * blockDim.x + threadIdx.x; if (row < n) { @@ -371,7 +344,7 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.z = 1; degree_coo <<>>(n, e, csrInd, degree.data().get()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); int y = 4; nthreads.x = 32 / y; @@ -382,11 +355,11 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); // 1; equi_prob3 <<>>(n, e, 
csrPtr, csrInd, val, degree.data().get()); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); ValueType a = 0.0; fill(n, bookmark, a); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); nthreads.y = 1; @@ -396,12 +369,14 @@ void HT_matrix_csc_coo(const IndexType n, nblocks.z = 1; flag_leafs_kernel <<>>(n, degree.data().get(), bookmark); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - permute_vals_kernel(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) +__global__ void permute_vals_kernel(const IndexType e, + IndexType *perm, + ValueType *in, + ValueType *out) { for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) out[i] = in[perm[i]]; @@ -486,8 +461,7 @@ void remove_duplicate( } template -__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - offsets_to_indices_kernel(const IndexType *offsets, IndexType v, IndexType *indices) +__global__ void offsets_to_indices_kernel(const IndexType *offsets, IndexType v, IndexType *indices) { int tid, ctaStart; tid = threadIdx.x; @@ -511,7 +485,7 @@ void offsets_to_indices(const IndexType *offsets, IndexType v, IndexType *indice IndexType nthreads = min(v, (IndexType)CUDA_MAX_KERNEL_THREADS); IndexType nblocks = min((v + nthreads - 1) / nthreads, (IndexType)CUDA_MAX_BLOCKS); offsets_to_indices_kernel<<>>(offsets, v, indices); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); } template @@ -519,7 +493,7 @@ void sequence(IndexType n, IndexType *vec, IndexType init = 0) { thrust::sequence( thrust::device, thrust::device_pointer_cast(vec), thrust::device_pointer_cast(vec + n), init); - CUDA_CHECK_LAST(); + CHECK_CUDA(nullptr); } template @@ -532,7 +506,7 @@ bool has_negative_val(DistType *arr, size_t n) thrust::device_pointer_cast(arr), thrust::device_pointer_cast(arr + n)); - CUDA_CHECK_LAST(); + CHECK_CUDA(stream); return (result < 0); } diff --git a/cpp/src/utilities/heap.cuh b/cpp/src/utilities/heap.cuh index e290337c22d..0747a658324 100644 --- a/cpp/src/utilities/heap.cuh +++ b/cpp/src/utilities/heap.cuh @@ -1,7 +1,7 @@ // -*-c++-*- /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/utilities/sm_utils.h b/cpp/src/utilities/sm_utils.h deleted file mode 100644 index 57e149e7f99..00000000000 --- a/cpp/src/utilities/sm_utils.h +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#ifdef _MSC_VER -#include -#else -#include -#endif - -#define DEFAULT_MASK 0xffffffff - -#define USE_CG 1 -//(__CUDACC_VER__ >= 80500) - -namespace cugraph { -namespace detail { -namespace utils { -static __device__ __forceinline__ int lane_id() -{ - int id; - asm("mov.u32 %0, %%laneid;" : "=r"(id)); - return id; -} - -static __device__ __forceinline__ int lane_mask_lt() -{ - int mask; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); - return mask; -} - -static __device__ __forceinline__ int lane_mask_le() -{ - int mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); - return mask; -} - -static __device__ __forceinline__ int warp_id() { return threadIdx.x >> 5; } - -static __device__ __forceinline__ unsigned int ballot(int p, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __ballot_sync(mask, p); -#else - return __ballot(p); -#endif -#else - return 0; -#endif -} - -static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0; -#endif -} - -static __device__ __forceinline__ float shfl(float r, - int lane, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound); -#else - return __shfl(r, lane, bound); -#endif -#else - return 0.0f; -#endif -} - -/// Warp shuffle down function -/** Warp shuffle functions on 64-bit floating point values are not - * natively implemented as of Compute Capability 5.0. This - * implementation has been copied from - * (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler). - * Once this is natively implemented, this function can be replaced - * by __shfl_down. 
- * - */ -static __device__ __forceinline__ double shfl(double r, - int lane, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_sync(mask, a.x, lane, bound); - a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl(a.x, lane, bound); - a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ long long shfl(long long r, - int lane, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_sync(mask, a.x, lane, bound); - a.y = __shfl_sync(mask, a.y, lane, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl(a.x, lane, bound); - a.y = __shfl(a.y, lane, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ int shfl_down(int r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - return __shfl_down_sync(mask, r, offset, bound); -#else - return __shfl_down(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -static __device__ __forceinline__ float shfl_down(float r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - return __shfl_down_sync(mask, r, offset, bound); -#else - return __shfl_down(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -static __device__ __forceinline__ double shfl_down(double r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down_sync(mask, a.x, offset, bound); - a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down(a.x, offset, bound); - a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ long long shfl_down(long long r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down_sync(mask, a.x, offset, bound); - a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down(a.x, offset, bound); - a.y = __shfl_down(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -// specifically for triangles counting -static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down_sync(mask, a.x, offset, bound); - a.y = __shfl_down_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_down(mask, a.x, offset, bound); - a.y = __shfl_down(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ int shfl_up(int r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - return __shfl_up_sync(mask, r, offset, bound); -#else - return __shfl_up(r, offset, bound); -#endif -#else - return 0.0f; 
-#endif -} - -static __device__ __forceinline__ float shfl_up(float r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - return __shfl_up_sync(mask, r, offset, bound); -#else - return __shfl_up(r, offset, bound); -#endif -#else - return 0.0f; -#endif -} - -static __device__ __forceinline__ double shfl_up(double r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up_sync(mask, a.x, offset, bound); - a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up(a.x, offset, bound); - a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} - -static __device__ __forceinline__ long long shfl_up(long long r, - int offset, - int bound = 32, - int mask = DEFAULT_MASK) -{ -#if __CUDA_ARCH__ >= 300 -#ifdef USE_CG - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up_sync(mask, a.x, offset, bound); - a.y = __shfl_up_sync(mask, a.y, offset, bound); - return *reinterpret_cast(&a); -#else - int2 a = *reinterpret_cast(&r); - a.x = __shfl_up(a.x, offset, bound); - a.y = __shfl_up(a.y, offset, bound); - return *reinterpret_cast(&a); -#endif -#else - return 0.0; -#endif -} -} // namespace utils -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/utilities/spmv_1D.cu b/cpp/src/utilities/spmv_1D.cu new file mode 100644 index 00000000000..4aec86919c9 --- /dev/null +++ b/cpp/src/utilities/spmv_1D.cu @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include
+#include "spmv_1D.cuh"
+
+namespace cugraph {
+namespace mg {
+template <typename vertex_t, typename edge_t, typename weight_t>
+MGcsrmv<vertex_t, edge_t, weight_t>::MGcsrmv(raft::handle_t const &handle,
+                                             vertex_t *local_vertices,
+                                             vertex_t *part_off,
+                                             edge_t *off,
+                                             vertex_t *ind,
+                                             weight_t *val,
+                                             weight_t *x)
+  : handle_(handle),
+    local_vertices_(local_vertices),
+    part_off_(part_off),
+    off_(off),
+    ind_(ind),
+    val_(val)
+{
+  i_      = handle_.get_comms().get_rank();
+  p_      = handle_.get_comms().get_size();
+  v_glob_ = part_off_[p_ - 1] + local_vertices_[p_ - 1];
+  v_loc_  = local_vertices_[i_];
+  vertex_t tmp;
+  CUDA_TRY(cudaMemcpy(&tmp, &off_[v_loc_], sizeof(vertex_t), cudaMemcpyDeviceToHost));
+  e_loc_ = tmp;
+  y_loc_.resize(v_loc_);
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+MGcsrmv<vertex_t, edge_t, weight_t>::~MGcsrmv()
+{
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+void MGcsrmv<vertex_t, edge_t, weight_t>::run(weight_t *x)
+{
+  using namespace raft::matrix;
+
+  weight_t h_one  = 1.0;
+  weight_t h_zero = 0.0;
+
+  sparse_matrix_t<vertex_t, weight_t> mat{handle_,                         // raft handle
+                                          off_,                            // CSR row_offsets
+                                          ind_,                            // CSR col_indices
+                                          val_,                            // CSR values
+                                          static_cast<vertex_t>(v_loc_),   // n_rows
+                                          static_cast<vertex_t>(v_glob_),  // n_cols
+                                          static_cast<edge_t>(e_loc_)};    // nnz
+
+  mat.mv(h_one,                             // alpha
+         x,                                 // x
+         h_zero,                            // beta
+         y_loc_.data().get(),               // y
+         sparse_mv_alg_t::SPARSE_MV_ALG2);  // SpMV algorithm
+
+  auto stream = handle_.get_stream();
+
+  auto const &comm{handle_.get_comms()};  // local
+
+  std::vector<size_t> recvbuf(comm.get_size());
+  std::copy(local_vertices_, local_vertices_ + comm.get_size(), recvbuf.begin());
+  comm.allgatherv(y_loc_.data().get(), x, recvbuf.data(), part_off_, stream);
+}
+
+template class MGcsrmv<int32_t, int32_t, double>;
+template class MGcsrmv<int32_t, int32_t, float>;
+
+} // namespace mg
+} // namespace cugraph
diff --git a/cpp/src/utilities/spmv_1D.cuh b/cpp/src/utilities/spmv_1D.cuh
new file mode 100644
index 00000000000..81466595c19
--- /dev/null
+++ b/cpp/src/utilities/spmv_1D.cuh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include
+#include
+#include "utilities/error.hpp"
+
+namespace cugraph {
+namespace mg {
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+class MGcsrmv {
+ private:
+  size_t v_glob_;
+  size_t v_loc_;
+  size_t e_loc_;
+
+  raft::handle_t const& handle_;  // raft handle propagation for SpMV, etc.
+
+  vertex_t* part_off_;
+  vertex_t* local_vertices_;
+  int i_;
+  int p_;
+  edge_t* off_;
+  vertex_t* ind_;
+  weight_t* val_;
+  rmm::device_vector<weight_t> y_loc_;
+  std::vector<vertex_t> v_locs_h_;
+  std::vector<vertex_t> displs_h_;
+
+ public:
+  MGcsrmv(raft::handle_t const& r_handle,
+          vertex_t* local_vertices,
+          vertex_t* part_off,
+          edge_t* row_off,
+          vertex_t* col_ind,
+          weight_t* vals,
+          weight_t* x);
+
+  ~MGcsrmv();
+
+  void run(weight_t* x);
+};
+
+} // namespace mg
+} // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 0b8bec887fb..e0f945639ca 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 #=============================================================================
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,26 +29,24 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS) target_include_directories(${CMAKE_TEST_NAME} PRIVATE + "${CUB_INCLUDE_DIR}" + "${THRUST_INCLUDE_DIR}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" "${GTEST_INCLUDE_DIR}" "${RMM_INCLUDE}" "${CUDF_INCLUDE}" "${CUDF_INCLUDE}/libcudf/libcudacxx" - "${CUB_INCLUDE_DIR}" + "${NCCL_INCLUDE_DIRS}" "${CMAKE_SOURCE_DIR}/../thirdparty/mmio" "${CMAKE_SOURCE_DIR}/include" "${CMAKE_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}" + "${RAFT_DIR}/cpp/include" ) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE - gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${RMM_LIBRARY} ${CMAKE_EXTRA_LIBS} cudart cuda) - if (BUILD_MPI) - include_directories(include ${MPI_CXX_INCLUDE_PATH} ${NCCL_INCLUDE_DIRS}) - target_link_libraries(${CMAKE_TEST_NAME} PRIVATE ${MPI_C_LIBRARIES} ${NCCL_LIBRARIES} ) - target_compile_options(${CMAKE_TEST_NAME} PUBLIC ${MPI_C_COMPILE_FLAGS}) - endif(BUILD_MPI) + gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${RMM_LIBRARY} ${CMAKE_EXTRA_LIBS} ${NCCL_LIBRARIES} cudart cuda cublas cusparse cusolver curand) if(OpenMP_CXX_FOUND) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE @@ -138,12 +136,18 @@ set(BETWEENNESS_TEST_SRC ConfigureTest(BETWEENNESS_TEST "${BETWEENNESS_TEST_SRC}" "") +set(EDGE_BETWEENNESS_TEST_SRC + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/centrality/edge_betweenness_centrality_test.cu") + + ConfigureTest(EDGE_BETWEENNESS_TEST "${EDGE_BETWEENNESS_TEST_SRC}" "") + ################################################################################################### # - pagerank tests -------------------------------------------------------------------------------- set(PAGERANK_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/pagerank/pagerank_test.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/pagerank/pagerank_test.cpp") ConfigureTest(PAGERANK_TEST "${PAGERANK_TEST_SRC}" "") @@ -172,6 +176,15 @@ set(LOUVAIN_TEST_SRC ConfigureTest(LOUVAIN_TEST "${LOUVAIN_TEST_SRC}" "") +################################################################################################### +# - LEIDEN tests --------------------------------------------------------------------------------- + +set(LEIDEN_TEST_SRC + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/community/leiden_test.cpp") + +ConfigureTest(LEIDEN_TEST "${LEIDEN_TEST_SRC}" "") + ################################################################################################### # - ECG tests --------------------------------------------------------------------------------- @@ -203,7 +216,7 @@ set(RENUMBERING_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" "${CMAKE_CURRENT_SOURCE_DIR}/renumber/renumber_test.cu") -ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}" "${NVSTRINGS_LIBRARY}") +ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}" "") ################################################################################################### #-FORCE ATLAS 2 tests ------------------------------------------------------------------------------ @@ -221,7 +234,7 @@ set(CONNECT_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" "${CMAKE_CURRENT_SOURCE_DIR}/components/con_comp_test.cu") - ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}" "") +ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}" "") 
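// A note on the new MGcsrmv added in spmv_1D.cu above, as a hedged sketch
// rather than anything this patch defines: the 1-D decomposition gives each
// of the p ranks a contiguous block of rows, every rank multiplies its block
// against the full x, and allgatherv then reassembles the complete result
// vector on all ranks. The partition arithmetic implied by the constructor
// (part_off_[r] is the first global vertex owned by rank r) can be
// illustrated with a hypothetical helper:
inline int example_owner_of_vertex(int v, const int *part_off, int p)
{
  // The owner is the last rank whose starting offset does not exceed v.
  int r = 0;
  while (r + 1 < p && part_off[r + 1] <= v) { ++r; }
  return r;
}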
################################################################################################### #-STRONGLY CONNECTED COMPONENTS tests --------------------------------------------------------------------- diff --git a/cpp/tests/centrality/betweenness_centrality_test.cu b/cpp/tests/centrality/betweenness_centrality_test.cu index 153e0bc876c..d680574e10b 100644 --- a/cpp/tests/centrality/betweenness_centrality_test.cu +++ b/cpp/tests/centrality/betweenness_centrality_test.cu @@ -14,23 +14,24 @@ * limitations under the License. */ -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include -#include -#include "test_utils.h" +#include +#include +#include #include #include -#include -#include +#include +#include -#include +#include + +#include -#include -#include "traversal/bfs_ref.h" +#include +#include +#include +#include #ifndef TEST_EPSILON #define TEST_EPSILON 0.0001 @@ -47,73 +48,122 @@ // ============================================================================ // C++ Reference Implementation // ============================================================================ -template +template void ref_accumulation(result_t *result, - VT const number_of_vertices, - std::stack &S, - std::vector> &pred, + vertex_t const number_of_vertices, + std::stack &S, + std::vector> &pred, std::vector &sigmas, std::vector &deltas, - VT source) + vertex_t source) +{ + for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + while (!S.empty()) { + vertex_t w = S.top(); + S.pop(); + for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + if (w != source) { result[w] += deltas[w]; } + } +} + +template +void ref_endpoints_accumulation(result_t *result, + vertex_t const number_of_vertices, + std::stack &S, + std::vector> &pred, + std::vector &sigmas, + std::vector &deltas, + vertex_t source) +{ + result[source] += S.size() - 1; + for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + while (!S.empty()) { + vertex_t w = S.top(); + S.pop(); + for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + if (w != source) { result[w] += deltas[w] + 1; } + } +} + +template +void ref_edge_accumulation(result_t *result, + vertex_t const number_of_vertices, + std::stack &S, + std::vector> &pred, + std::vector &sigmas, + std::vector &deltas, + vertex_t source) { - for (VT v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } + for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; } while (!S.empty()) { - VT w = S.top(); + vertex_t w = S.top(); S.pop(); - for (VT v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } + for (vertex_t v : pred[w]) { deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); } if (w != source) { result[w] += deltas[w]; } } } // Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_betweenness_centrality_impl(VT *indices, - ET *offsets, - VT const number_of_vertices, +template +void reference_betweenness_centrality_impl(vertex_t *indices, + edge_t *offsets, + vertex_t const number_of_vertices, result_t *result, - VT const *sources, - VT const number_of_sources) + bool endpoints, + vertex_t const *sources, + vertex_t const number_of_sources) { - std::queue Q; - std::stack S; - // NOTE: dist is of type VT not WT - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); + std::queue Q; + std::stack S; + // NOTE: dist is of type vertex_t not weight_t + std::vector dist(number_of_vertices); + 
std::vector> pred(number_of_vertices); std::vector sigmas(number_of_vertices); std::vector deltas(number_of_vertices); - std::vector neighbors; + std::vector neighbors; if (sources) { - for (VT source_idx = 0; source_idx < number_of_sources; ++source_idx) { - VT s = sources[source_idx]; + for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { + vertex_t s = sources[source_idx]; // Step 1: Single-source shortest-paths problem // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); + ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); // Step 2: Accumulation // Back propagation of dependencies - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); + if (endpoints) { + ref_endpoints_accumulation( + result, number_of_vertices, S, pred, sigmas, deltas, s); + } else { + ref_accumulation( + result, number_of_vertices, S, pred, sigmas, deltas, s); + } } } else { - for (VT s = 0; s < number_of_vertices; ++s) { + for (vertex_t s = 0; s < number_of_vertices; ++s) { // Step 1: Single-source shortest-paths problem // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); + ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); // Step 2: Accumulation // Back propagation of dependencies - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); + if (endpoints) { + ref_endpoints_accumulation( + result, number_of_vertices, S, pred, sigmas, deltas, s); + } else { + ref_accumulation( + result, number_of_vertices, S, pred, sigmas, deltas, s); + } } } } -template +template void reference_rescale(result_t *result, - bool normalize, bool directed, - VT const number_of_vertices, - VT const number_of_sources) + bool normalize, + bool endpoints, + vertex_t const number_of_vertices, + vertex_t const number_of_sources) { bool modified = false; result_t rescale_factor = static_cast(1); @@ -121,7 +171,11 @@ void reference_rescale(result_t *result, result_t casted_number_of_vertices = static_cast(number_of_vertices); if (normalize) { if (number_of_vertices > 2) { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); + if (endpoints) { + rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); + } else { + rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); + } modified = true; } } else { @@ -138,47 +192,55 @@ void reference_rescale(result_t *result, for (auto idx = 0; idx < number_of_vertices; ++idx) { result[idx] *= rescale_factor; } } -template -void reference_betweenness_centrality(cugraph::experimental::GraphCSRView const &graph, - result_t *result, - bool normalize, - bool endpoints, // This is not yet implemented - VT const number_of_sources, - VT const *sources) +template +void reference_betweenness_centrality( + cugraph::GraphCSRView const &graph, + result_t *result, + bool normalize, + bool endpoints, // This is not yet implemented + vertex_t const number_of_sources, + vertex_t const *sources) { - VT number_of_vertices = graph.number_of_vertices; - ET number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); + vertex_t number_of_vertices = graph.number_of_vertices; + edge_t number_of_edges = graph.number_of_edges; + thrust::host_vector h_indices(number_of_edges); + thrust::host_vector h_offsets(number_of_vertices + 1); - 
thrust::device_ptr d_indices((VT *)&graph.indices[0]); - thrust::device_ptr d_offsets((ET *)&graph.offsets[0]); + thrust::device_ptr d_indices((vertex_t *)&graph.indices[0]); + thrust::device_ptr d_offsets((edge_t *)&graph.offsets[0]); thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); cudaDeviceSynchronize(); - reference_betweenness_centrality_impl( - &h_indices[0], &h_offsets[0], number_of_vertices, result, sources, number_of_sources); - reference_rescale( - result, normalize, graph.prop.directed, number_of_vertices, number_of_sources); + reference_betweenness_centrality_impl(&h_indices[0], + &h_offsets[0], + number_of_vertices, + result, + endpoints, + sources, + number_of_sources); + reference_rescale( + result, graph.prop.directed, normalize, endpoints, number_of_vertices, number_of_sources); } -// Explicit declaration +// Explicit instantiation +/* FIXME!!! template void reference_betweenness_centrality( - cugraph::experimental::GraphCSRView const &, + cugraph::GraphCSRView const &, float *, bool, bool, const int, int const *); template void reference_betweenness_centrality( - cugraph::experimental::GraphCSRView const &, + cugraph::GraphCSRView const &, double *, bool, bool, const int, int const *); +*/ // ============================================================================= // Utility functions @@ -198,7 +260,6 @@ bool compare_close(const T &a, const T &b, const precision_t epsilon, precision_ // Defines Betweenness Centrality UseCase // SSSP's test suite code uses type of Graph parameter that could be used // (MTX / RMAT) -// FIXME: Use VT for number_of_sources? typedef struct BC_Usecase_t { std::string config_; // Path to graph file std::string file_path_; // Complete path to graph using dataset_root_dir @@ -208,7 +269,7 @@ typedef struct BC_Usecase_t { { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string &rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -218,6 +279,8 @@ typedef struct BC_Usecase_t { } BC_Usecase; class Tests_BC : public ::testing::TestWithParam { + raft::handle_t handle; + public: Tests_BC() {} static void SetupTestCase() {} @@ -225,16 +288,15 @@ class Tests_BC : public ::testing::TestWithParam { virtual void SetUp() {} virtual void TearDown() {} - // FIXME: Should normalize be part of the configuration instead? 
-  // VT            vertex identifier data type
-  // ET            edge identifier data type
-  // WT            edge weight data type
+  // vertex_t      vertex identifier data type
+  // edge_t        edge identifier data type
+  // weight_t      edge weight data type
   // result_t      result data type
   // normalize     should the result be normalized
-  // endpoints     should the endpoints be included (Not Implemented Yet)
-  template <typename VT, typename ET, typename WT, typename result_t, bool normalize, bool endpoints>
@@ -242,11 +304,12 @@ class Tests_BC : public ::testing::TestWithParam<BC_Usecase> {
   {
     // Step 1: Construction of the graph based on configuration
    bool is_directed = false;
-    auto csr = generate_graph_csr_from_mm<VT, ET, WT>(is_directed, configuration.file_path_);
+    auto csr = cugraph::test::generate_graph_csr_from_mm<vertex_t, edge_t, weight_t>(
+      is_directed, configuration.file_path_);
     cudaDeviceSynchronize();
-    cugraph::experimental::GraphCSRView<VT, ET, WT> G = csr->view();
-    G.prop.directed = is_directed;
-    CUDA_CHECK_LAST();
+    cugraph::GraphCSRView<vertex_t, edge_t, weight_t> G = csr->view();
+    G.prop.directed = is_directed;
+    CUDA_TRY(cudaGetLastError());
     std::vector<result_t> result(G.number_of_vertices, 0);
     std::vector<result_t> expected(G.number_of_vertices, 0);
@@ -257,44 +320,27 @@ class Tests_BC : public ::testing::TestWithParam<BC_Usecase> {
                 configuration.number_of_sources_ <= G.number_of_vertices)
      << "Number of sources should be >= 0 and"
      << " at most the number of vertices in the graph";
-    std::vector<VT> sources(configuration.number_of_sources_);
+    std::vector<vertex_t> sources(configuration.number_of_sources_);
     thrust::sequence(thrust::host, sources.begin(), sources.end(), 0);
-    VT *sources_ptr = nullptr;
+    vertex_t *sources_ptr = nullptr;
     if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); }
-    reference_betweenness_centrality(G,
-                                     expected.data(),
-                                     normalize,
-                                     endpoints,
-                                     // FIXME: weights
-                                     configuration.number_of_sources_,
-                                     sources_ptr);
+    reference_betweenness_centrality(
+      G, expected.data(), normalize, endpoints, configuration.number_of_sources_, sources_ptr);
     sources_ptr = nullptr;
     if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); }
-    thrust::device_vector<result_t> d_result(G.number_of_vertices);
-    // FIXME: Remove this once endpoints in handled
-    if (endpoints) {
-      ASSERT_THROW(cugraph::betweenness_centrality(G,
-                                                   d_result.data().get(),
-                                                   normalize,
-                                                   endpoints,
-                                                   static_cast<WT *>(nullptr),
-                                                   configuration.number_of_sources_,
-                                                   sources_ptr),
-                   cugraph::logic_error);
-      return;
-    } else {
-      cugraph::betweenness_centrality(G,
-                                      d_result.data().get(),
-                                      normalize,
-                                      endpoints,
-                                      static_cast<WT *>(nullptr),
-                                      configuration.number_of_sources_,
-                                      sources_ptr);
-    }
+    rmm::device_vector<result_t> d_result(G.number_of_vertices);
+    cugraph::betweenness_centrality(handle,
+                                    G,
+                                    d_result.data().get(),
+                                    normalize,
+                                    endpoints,
+                                    static_cast<weight_t *>(nullptr),
+                                    configuration.number_of_sources_,
+                                    sources_ptr);
     cudaDeviceSynchronize();
     CUDA_TRY(cudaMemcpy(result.data(),
                         d_result.data().get(),
@@ -312,7 +358,6 @@ class Tests_BC : public ::testing::TestWithParam<BC_Usecase> {
 // Tests
 // ============================================================================
 // Verify Un-Normalized results
-// Endpoint parameter is currently not usefull, is for later use
 TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_NO_ENDPOINTS)
 {
   run_current_test<int, int, float, float, false, false>(GetParam());
 }
@@ -323,7 +368,6 @@ TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_NO_ENDPOINTS)
 {
   run_current_test<int, int, double, double, false, false>(GetParam());
 }
-// FIXME: Currently endpoints throws and exception as it is not supported
 TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_ENDPOINTS)
 {
   run_current_test<int, int, float, float, false, true>(GetParam());
 }
@@ -335,17 +379,16 @@ TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_ENDPOINTS)
 {
   run_current_test<int, int, double, double, false, true>(GetParam());
 }
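// A sketch of the normalization the NORMALIZE tests below rely on (mirrors
// reference_rescale above; the helper name is illustrative, not from the
// patch):
template <typename result_t>
result_t example_bc_normalization_factor(int n, bool endpoints)
{
  // Scores are divided by the count of ordered source/target pairs that can
  // contribute: (n - 1) * (n - 2) without endpoints, n * (n - 1) with them.
  // For the karate graph (n = 34) that is 1/1056 and 1/1122 respectively.
  return endpoints ? result_t{1} / (result_t(n) * result_t(n - 1))
                   : result_t{1} / (result_t(n - 1) * result_t(n - 2));
}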
 // Verify Normalized results
-TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENPOINTS)
+TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENDPOINTS)
 {
   run_current_test<int, int, float, float, true, false>(GetParam());
 }
-TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENPOINTS)
+TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENDPOINTS)
 {
   run_current_test<int, int, double, double, true, false>(GetParam());
 }
-// FIXME: Currently endpoints throws and exception as it is not supported
 TEST_P(Tests_BC, CheckFP32_NORMALIZE_ENDPOINTS)
 {
   run_current_test<int, int, float, float, true, true>(GetParam());
 }
@@ -356,19 +399,12 @@ TEST_P(Tests_BC, CheckFP64_NORMALIZE_ENDPOINTS)
 {
   run_current_test<int, int, double, double, true, true>(GetParam());
 }
-// FIXME: There is an InvalidValue on a Memcopy only on tests/datasets/dblp.mtx
 INSTANTIATE_TEST_CASE_P(simple_test,
                         Tests_BC,
                         ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0),
+                                          BC_Usecase("test/datasets/netscience.mtx", 0),
                                           BC_Usecase("test/datasets/netscience.mtx", 4),
                                           BC_Usecase("test/datasets/wiki2003.mtx", 4),
                                           BC_Usecase("test/datasets/wiki-Talk.mtx", 4)));
-int main(int argc, char **argv)
-{
-  testing::InitGoogleTest(&argc, argv);
-  auto resource = std::make_unique<rmm::mr::cuda_memory_resource>();
-  rmm::mr::set_default_resource(resource.get());
-  int rc = RUN_ALL_TESTS();
-  return rc;
-}
+CUGRAPH_TEST_PROGRAM_MAIN()
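// A worked example for the edge variant tested in the new file below
// (numbers are illustrative, not taken from the patch): on an undirected path
// a - b - c stored as a symmetric CSR, each directed entry of both edges
// accumulates a raw dependency of 2.0, and the un-normalized rescale halves
// that for undirected graphs; summing an edge's two directed entries then
// recovers the usual undirected score of 2.0, since the pairs {a,b} and
// {a,c} both route over edge ab.
constexpr double kRawPerCsrEntry      = 2.0;
constexpr double kRescaledPerCsrEntry = kRawPerCsrEntry / 2.0;
static_assert(kRescaledPerCsrEntry == 1.0, "sanity check of the arithmetic");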
diff --git a/cpp/tests/centrality/edge_betweenness_centrality_test.cu b/cpp/tests/centrality/edge_betweenness_centrality_test.cu
new file mode 100644
index 00000000000..b6cce8684e8
--- /dev/null
+++ b/cpp/tests/centrality/edge_betweenness_centrality_test.cu
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#ifndef TEST_EPSILON
+#define TEST_EPSILON 0.0001
+#endif
+
+// NOTE: Defines under which values the difference should be discarded when
+// considering values are close to zero
+// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is
+// significant
+#ifndef TEST_ZERO_THRESHOLD
+#define TEST_ZERO_THRESHOLD 1e-10
+#endif
+
+// ============================================================================
+// C++ Reference Implementation
+// ============================================================================
+
+template <typename vertex_t, typename edge_t>
+edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex,
+                                                  vertex_t destination_vertex,
+                                                  vertex_t const *indices,
+                                                  edge_t const *offsets)
+{
+  edge_t index          = -1;
+  edge_t first_edge_idx = offsets[source_vertex];
+  edge_t last_edge_idx  = offsets[source_vertex + 1];
+  auto index_it = std::find(indices + first_edge_idx, indices + last_edge_idx, destination_vertex);
+  if (index_it != (indices + last_edge_idx)) { index = std::distance(indices, index_it); }
+  return index;
+}
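// A usage sketch for the lookup above (tiny hand-built CSR, not from the
// patch): edge (u, v) lives in indices[offsets[u] .. offsets[u + 1]), so the
// helper is a linear scan of u's adjacency list that yields -1 when the edge
// is absent.
inline void example_edge_lookup()
{
  int offsets[] = {0, 2, 3, 4};  // 3 vertices, 4 directed edges
  int indices[] = {1, 2, 2, 0};  // 0->1, 0->2, 1->2, 2->0
  int eid = get_edge_index_from_source_and_destination<int, int>(0, 2, indices, offsets);
  // eid == 1: edge (0, 2) is the second entry of vertex 0's adjacency list,
  // while a query such as (1, 0) would return -1.
  (void)eid;
}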
+
+template <typename vertex_t, typename edge_t, typename result_t>
+void ref_accumulation(result_t *result,
+                      vertex_t const *indices,
+                      edge_t const *offsets,
+                      vertex_t const number_of_vertices,
+                      std::stack<vertex_t> &S,
+                      std::vector<std::vector<vertex_t>> &pred,
+                      std::vector<double> &sigmas,
+                      std::vector<double> &deltas,
+                      vertex_t source)
+{
+  for (vertex_t v = 0; v < number_of_vertices; ++v) { deltas[v] = 0; }
+  while (!S.empty()) {
+    vertex_t w = S.top();
+    S.pop();
+    for (vertex_t v : pred[w]) {
+      edge_t edge_idx =
+        get_edge_index_from_source_and_destination<vertex_t, edge_t>(v, w, indices, offsets);
+      double coefficient = (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]);
+
+      deltas[v] += coefficient;
+      result[edge_idx] += coefficient;
+    }
+  }
+}
+
+// Algorithm 1 of Brandes (2001), adapted here to accumulate shortest-path
+// betweenness on edges rather than vertices
+template <typename vertex_t, typename edge_t, typename result_t>
+void reference_edge_betweenness_centrality_impl(vertex_t *indices,
+                                                edge_t *offsets,
+                                                vertex_t const number_of_vertices,
+                                                result_t *result,
+                                                vertex_t const *sources,
+                                                vertex_t const number_of_sources)
+{
+  std::queue<vertex_t> Q;
+  std::stack<vertex_t> S;
+  // NOTE: dist is of type vertex_t not weight_t
+  std::vector<vertex_t> dist(number_of_vertices);
+  std::vector<std::vector<vertex_t>> pred(number_of_vertices);
+  std::vector<double> sigmas(number_of_vertices);
+  std::vector<double> deltas(number_of_vertices);
+
+  std::vector<vertex_t> neighbors;
+
+  if (sources) {
+    for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) {
+      vertex_t s = sources[source_idx];
+      // Step 1: Single-source shortest-paths problem
+      // a. Initialization
+      ref_bfs<vertex_t, edge_t>(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s);
+      // Step 2: Accumulation
+      // Back propagation of dependencies
+      ref_accumulation<vertex_t, edge_t, result_t>(
+        result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s);
+    }
+  } else {
+    for (vertex_t s = 0; s < number_of_vertices; ++s) {
+      // Step 1: Single-source shortest-paths problem
+      // a. Initialization
+      ref_bfs<vertex_t, edge_t>(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s);
+      // Step 2: Accumulation
+      // Back propagation of dependencies
+      ref_accumulation<vertex_t, edge_t, result_t>(
+        result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s);
+    }
+  }
+}
+
+template <typename vertex_t, typename edge_t, typename result_t>
+void reference_rescale(result_t *result,
+                       bool directed,
+                       bool normalize,
+                       vertex_t const number_of_vertices,
+                       edge_t const number_of_edges)
+{
+  result_t rescale_factor            = static_cast<result_t>(1);
+  result_t casted_number_of_vertices = static_cast<result_t>(number_of_vertices);
+  if (normalize) {
+    if (number_of_vertices > 1) {
+      rescale_factor /= ((casted_number_of_vertices) * (casted_number_of_vertices - 1));
+    }
+  } else {
+    if (!directed) { rescale_factor /= static_cast<result_t>(2); }
+  }
+  for (auto idx = 0; idx < number_of_edges; ++idx) { result[idx] *= rescale_factor; }
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+void reference_edge_betweenness_centrality(
+  cugraph::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
+  result_t *result,
+  bool normalize,
+  vertex_t const number_of_sources,
+  vertex_t const *sources)
+{
+  vertex_t number_of_vertices = graph.number_of_vertices;
+  edge_t number_of_edges      = graph.number_of_edges;
+  thrust::host_vector<vertex_t> h_indices(number_of_edges);
+  thrust::host_vector<edge_t> h_offsets(number_of_vertices + 1);
+
+  thrust::device_ptr<vertex_t> d_indices((vertex_t *)&graph.indices[0]);
+  thrust::device_ptr<edge_t> d_offsets((edge_t *)&graph.offsets[0]);
+
+  thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin());
+  thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin());
+
+  cudaDeviceSynchronize();
+
+  reference_edge_betweenness_centrality_impl<vertex_t, edge_t, result_t>(
+    &h_indices[0], &h_offsets[0], number_of_vertices, result, sources, number_of_sources);
+  reference_rescale<vertex_t, edge_t, result_t>(
+    result, graph.prop.directed, normalize, number_of_vertices, number_of_edges);
+}
+
+// =============================================================================
+// Utility functions
+// =============================================================================
+// Compare while allowing relative error of epsilon
+// zero_threshold indicates when we should drop comparison for small numbers
+template <typename T, typename precision_t>
+bool compare_close(const T &a, const T &b, const precision_t epsilon, precision_t zero_threshold)
+{
+  return ((zero_threshold > a && zero_threshold > b)) ||
+         (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon));
+}
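// A throwaway illustration of compare_close's semantics (values invented):
// magnitudes both under zero_threshold pass outright, everything else must
// agree within a relative factor of (1 +/- epsilon).
inline void example_compare_close()
{
  bool near_one  = compare_close(1.00004, 1.00009, 1e-4, 1e-10);  // true
  bool near_zero = compare_close(3e-12, 9e-11, 1e-4, 1e-10);      // true
  bool far_off   = compare_close(1.0, 1.1, 1e-4, 1e-10);          // false
  (void)near_one; (void)near_zero; (void)far_off;
}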
+
+// =============================================================================
+// Test Suite
+// =============================================================================
+// Defines Betweenness Centrality UseCase
+// SSSP's test suite code uses type of Graph parameter that could be used
+// (MTX / RMAT)
+typedef struct EdgeBC_Usecase_t {
+  std::string config_;     // Path to graph file
+  std::string file_path_;  // Complete path to graph using dataset_root_dir
+  int number_of_sources_;  // Starting point from the traversal
+  EdgeBC_Usecase_t(const std::string &config, int number_of_sources)
+    : config_(config), number_of_sources_(number_of_sources)
+  {
+    // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR
+    // FIXME: Use platform independent stuff from c++14/17 on compiler update
+    const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir();
+    if ((config_ != "") && (config_[0] != '/')) {
+      file_path_ = rapidsDatasetRootDir + "/" + config_;
+    } else {
+      file_path_ = config_;
+    }
+  };
+} EdgeBC_Usecase;
+
+class Tests_EdgeBC : public ::testing::TestWithParam<EdgeBC_Usecase> {
+  raft::handle_t handle;
+
+ public:
+  Tests_EdgeBC() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+  // FIXME: Should normalize be part of the configuration instead?
+  // vertex_t      vertex identifier data type
+  // edge_t        edge identifier data type
+  // weight_t      edge weight data type
+  // result_t      result data type
+  // normalize     should the result be normalized
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t, bool normalize>
+  void run_current_test(const EdgeBC_Usecase &configuration)
+  {
+    // Step 1: Construction of the graph based on configuration
+    bool is_directed = false;
+    auto csr = cugraph::test::generate_graph_csr_from_mm<vertex_t, edge_t, weight_t>(
+      is_directed, configuration.file_path_);
+    cudaDeviceSynchronize();
+    cugraph::GraphCSRView<vertex_t, edge_t, weight_t> G = csr->view();
+    G.prop.directed = is_directed;
+    CUDA_TRY(cudaGetLastError());
+    std::vector<result_t> result(G.number_of_edges, 0);
+    std::vector<result_t> expected(G.number_of_edges, 0);
+
+    // Step 2: Generation of sources based on configuration
+    //         if number_of_sources_ is 0 then sources must be nullptr
+    //         Otherwise we only use the first k values
+    ASSERT_TRUE(configuration.number_of_sources_ >= 0 &&
+                configuration.number_of_sources_ <= G.number_of_vertices)
+      << "Number of sources should be >= 0 and"
+      << " at most the number of vertices in the graph";
+    std::vector<vertex_t> sources(configuration.number_of_sources_);
+    thrust::sequence(thrust::host, sources.begin(), sources.end(), 0);
+
+    vertex_t *sources_ptr = nullptr;
+    if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); }
+
+    reference_edge_betweenness_centrality(
+      G, expected.data(), normalize, configuration.number_of_sources_, sources_ptr);
+
+    sources_ptr = nullptr;
+    if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); }
+
+    rmm::device_vector<result_t> d_result(G.number_of_edges);
+    cugraph::edge_betweenness_centrality(handle,
+                                         G,
+                                         d_result.data().get(),
+                                         normalize,
+                                         static_cast<weight_t *>(nullptr),
+                                         configuration.number_of_sources_,
+                                         sources_ptr);
+    CUDA_TRY(cudaMemcpy(result.data(),
+                        d_result.data().get(),
+                        sizeof(result_t) * G.number_of_edges,
+                        cudaMemcpyDeviceToHost));
+    for (int i = 0; i < G.number_of_edges; ++i)
+      EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD))
+        << "[MISMATCH] edge id = " << i << ", cugraph = " << result[i]
+        << " expected = " << expected[i];
+  }
+};
+
+// ============================================================================
+// Tests
+// ============================================================================
+// Verify Un-Normalized results
+TEST_P(Tests_EdgeBC, CheckFP32_NO_NORMALIZE)
+{
+  run_current_test<int, int, float, float, false>(GetParam());
+}
+
+TEST_P(Tests_EdgeBC, CheckFP64_NO_NORMALIZE)
+{
+  run_current_test<int, int, double, double, false>(GetParam());
+}
+
+// Verify Normalized results
+TEST_P(Tests_EdgeBC, CheckFP32_NORMALIZE)
+{
+  run_current_test<int, int, float, float, true>(GetParam());
+}
+
+TEST_P(Tests_EdgeBC, CheckFP64_NORMALIZE)
+{
+  run_current_test<int, int, double, double, true>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(simple_test,
+                        Tests_EdgeBC,
+                        ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0),
+                                          EdgeBC_Usecase("test/datasets/netscience.mtx", 0),
+                                          EdgeBC_Usecase("test/datasets/netscience.mtx", 4),
+                                          EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4),
+                                          EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/centrality/katz_centrality_test.cu b/cpp/tests/centrality/katz_centrality_test.cu
index 69c543714ca..97f499fc920 100644
--- a/cpp/tests/centrality/katz_centrality_test.cu
+++ b/cpp/tests/centrality/katz_centrality_test.cu
@@ -1,15 +1,34 @@
-#include
-#include
+/*
+ * Copyright (c)
2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + #include -#include + +#include #include -#include -#include "cuda_profiler_api.h" -#include "gmock/gmock-generated-matchers.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "high_res_clock.h" -#include "test_utils.h" + +#include +#include + +#include + +#include std::vector getGoldenTopKIds(std::ifstream& fs_result, int k = 10) { @@ -37,13 +56,13 @@ std::vector getTopKIds(double* p_katz, int count, int k = 10) } template -int getMaxDegree(cugraph::experimental::GraphCSRView const& g) +int getMaxDegree(cugraph::GraphCSRView const& g) { cudaStream_t stream{nullptr}; rmm::device_vector degree_vector(g.number_of_vertices); ET* p_degree = degree_vector.data().get(); - g.degree(p_degree, cugraph::experimental::DegreeDirection::OUT); + g.degree(p_degree, cugraph::DegreeDirection::OUT); ET max_out_degree = thrust::reduce(rmm::exec_policy(stream)->on(stream), p_degree, p_degree + g.number_of_vertices, @@ -58,7 +77,7 @@ typedef struct Katz_Usecase_t { Katz_Usecase_t(const std::string& a, const std::string& b) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { matrix_file = rapidsDatasetRootDir + "/" + a; } else { @@ -97,7 +116,7 @@ class Tests_Katz : public ::testing::TestWithParam { int m, k; int nnz; MM_typecode mc; - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -111,16 +130,16 @@ class Tests_Katz : public ::testing::TestWithParam { std::vector katz_centrality(m); // Read - ASSERT_EQ((mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), + ASSERT_EQ((cugraph::test::mm_to_coo( + fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), 0) << "could not read matrix data" << "\n"; ASSERT_EQ(fclose(fpin), 0); - cugraph::experimental::GraphCOOView cooview( - &cooColInd[0], &cooRowInd[0], nullptr, m, nnz); - auto csr = cugraph::coo_to_csr(cooview); - cugraph::experimental::GraphCSRView G = csr->view(); + cugraph::GraphCOOView cooview(&cooColInd[0], &cooRowInd[0], nullptr, m, nnz); + auto csr = cugraph::coo_to_csr(cooview); + cugraph::GraphCSRView G = csr->view(); rmm::device_vector katz_vector(m); double* d_katz = thrust::raw_pointer_cast(katz_vector.data()); @@ -137,7 +156,6 @@ class Tests_Katz : public ::testing::TestWithParam { } }; -// --gtest_filter=*simple_test* INSTANTIATE_TEST_CASE_P( simple_test, Tests_Katz, @@ -148,11 +166,4 @@ INSTANTIATE_TEST_CASE_P( TEST_P(Tests_Katz, Check) { run_current_test(GetParam()); } -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - 
rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/balanced_edge_test.cpp b/cpp/tests/community/balanced_edge_test.cpp index 69e34f49e84..81cee945821 100644 --- a/cpp/tests/community/balanced_edge_test.cpp +++ b/cpp/tests/community/balanced_edge_test.cpp @@ -8,14 +8,12 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. * */ -#include +#include #include #include -#include - TEST(balanced_edge, success) { std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, @@ -50,7 +48,7 @@ TEST(balanced_edge, success) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::experimental::GraphCSRView G( + cugraph::GraphCSRView G( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); int num_clusters{8}; @@ -61,25 +59,18 @@ TEST(balanced_edge, success) int kmean_max_iter{100}; float score; - cugraph::nvgraph::balancedCutClustering(G, - num_clusters, - num_eigenvectors, - evs_tolerance, - evs_max_iter, - kmean_tolerance, - kmean_max_iter, - result_v.data().get()); - cugraph::nvgraph::analyzeClustering_edge_cut(G, num_clusters, result_v.data().get(), &score); + cugraph::ext_raft::balancedCutClustering(G, + num_clusters, + num_eigenvectors, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + result_v.data().get()); + cugraph::ext_raft::analyzeClustering_edge_cut(G, num_clusters, result_v.data().get(), &score); std::cout << "score = " << score << std::endl; ASSERT_LT(score, float{55.0}); } -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/ecg_test.cu b/cpp/tests/community/ecg_test.cu index b21c2f1d67f..6246a42021d 100644 --- a/cpp/tests/community/ecg_test.cu +++ b/cpp/tests/community/ecg_test.cu @@ -8,17 +8,16 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. * */ -#include +#include #include #include -#include #include -#include TEST(ecg, success) { + // FIXME: verify that this is the karate dataset std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; @@ -43,7 +42,7 @@ TEST(ecg, success) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::experimental::GraphCSRView graph_csr( + cugraph::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); cugraph::ecg(graph_csr, .05, 16, result_v.data().get()); @@ -61,14 +60,14 @@ TEST(ecg, success) float modularity{0.0}; - cugraph::nvgraph::analyzeClustering_modularity( + cugraph::ext_raft::analyzeClustering_modularity( graph_csr, max + 1, result_v.data().get(), &modularity); + // 0.399 is 5% below the reference value returned in + // /python/utils/ECG_Golden.ipynb on the same dataset ASSERT_GT(modularity, 0.399); } -// This test currently fails... 
leaving it in since once louvain is fixed -// it should pass TEST(ecg, dolphin) { std::vector off_h = {0, 6, 14, 18, 21, 22, 26, 32, 37, 43, 50, 55, 56, @@ -104,7 +103,7 @@ TEST(ecg, dolphin) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::experimental::GraphCSRView graph_csr( + cugraph::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); cugraph::ecg(graph_csr, .05, 16, result_v.data().get()); @@ -122,7 +121,7 @@ TEST(ecg, dolphin) float modularity{0.0}; - cugraph::nvgraph::analyzeClustering_modularity( + cugraph::ext_raft::analyzeClustering_modularity( graph_csr, max + 1, result_v.data().get(), &modularity); float random_modularity{0.95 * 0.4962422251701355}; @@ -130,11 +129,4 @@ TEST(ecg, dolphin) ASSERT_GT(modularity, random_modularity); } -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp new file mode 100644 index 00000000000..1e8ba85249d --- /dev/null +++ b/cpp/tests/community/leiden_test.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ +#include + +#include +#include + +#include + +#include + +#include + +TEST(leiden_karate, success) +{ + std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, + 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, + 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; + std::vector ind_h = { + 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, + 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, + 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, + 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, + 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, + 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, + 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + std::vector w_h = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + int num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + std::vector 
cluster_id(num_verts, -1); + + rmm::device_vector offsets_v(off_h); + rmm::device_vector indices_v(ind_h); + rmm::device_vector weights_v(w_h); + rmm::device_vector result_v(cluster_id); + + cugraph::GraphCSRView G( + offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + + float modularity{0.0}; + int num_level = 40; + + cugraph::leiden(G, modularity, num_level, result_v.data().get()); + + cudaMemcpy((void*)&(cluster_id[0]), + result_v.data().get(), + sizeof(int) * num_verts, + cudaMemcpyDeviceToHost); + + int min = *min_element(cluster_id.begin(), cluster_id.end()); + + ASSERT_GE(min, 0); + ASSERT_GE(modularity, 0.41116042 * 0.99); +} diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index 7784deec7d6..391af641b73 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -8,17 +8,17 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. * */ -#include +#include #include #include +#include + #include #include -#include - TEST(louvain, success) { std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, @@ -53,7 +53,7 @@ TEST(louvain, success) rmm::device_vector weights_v(w_h); rmm::device_vector result_v(cluster_id); - cugraph::experimental::GraphCSRView G( + cugraph::GraphCSRView G( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); float modularity{0.0}; @@ -72,11 +72,140 @@ TEST(louvain, success) ASSERT_GE(modularity, 0.402777 * 0.95); } -int main(int argc, char** argv) +TEST(louvain_modularity, simple) { - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; + std::vector off_h = {0, 1, 4, 7, 10, 11, 12}; + std::vector src_ind_h = {0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5}; + std::vector ind_h = {1, 0, 2, 3, 1, 3, 4, 1, 2, 5, 2, 3}; + std::vector w_h = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector v_weights_h = {1.0, 3.0, 3.0, 3.0, 1.0, 1.0}; + + // + // Initial cluster, everything on its own + // + std::vector cluster_h = {0, 1, 2, 3, 4, 5}; + std::vector cluster_weights_h = {1.0, 3.0, 3.0, 3.0, 1.0, 1.0}; + + std::vector cluster_hash_h = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector delta_Q_h = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::vector tmp_size_V_h = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + int num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + float q{0.0}; + + rmm::device_vector offsets_v(off_h); + rmm::device_vector src_indices_v(src_ind_h); + rmm::device_vector indices_v(ind_h); + rmm::device_vector weights_v(w_h); + rmm::device_vector vertex_weights_v(v_weights_h); + rmm::device_vector cluster_v(cluster_h); + rmm::device_vector cluster_weights_v(cluster_weights_h); + rmm::device_vector cluster_hash_v(cluster_hash_h); + rmm::device_vector delta_Q_v(delta_Q_h); + rmm::device_vector tmp_size_V_v(tmp_size_V_h); + + cudaStream_t stream{0}; + + // + // Create graph + // + cugraph::GraphCSRView G( + offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + + q = cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); + + ASSERT_FLOAT_EQ(q, float{-30.0 / 144.0}); + + cugraph::detail::compute_delta_modularity(float{12}, + float{1}, + G, + src_indices_v, + vertex_weights_v, + cluster_weights_v, + cluster_v, + cluster_hash_v, + delta_Q_v, + tmp_size_V_v); + + 
CUDA_TRY(cudaMemcpy(cluster_hash_h.data(), + cluster_hash_v.data().get(), + sizeof(int) * num_edges, + cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy( + delta_Q_h.data(), delta_Q_v.data().get(), sizeof(float) * num_edges, cudaMemcpyDeviceToHost)); + + ASSERT_EQ(cluster_hash_h[0], 1); + ASSERT_EQ(cluster_hash_h[10], 2); + ASSERT_EQ(cluster_hash_h[11], 3); + ASSERT_FLOAT_EQ(delta_Q_h[0], float{1.0 / 8.0}); + ASSERT_FLOAT_EQ(delta_Q_h[10], float{1.0 / 8.0}); + ASSERT_FLOAT_EQ(delta_Q_h[11], float{1.0 / 8.0}); + + // + // Move vertex 0 into cluster 1 + // + cluster_h[0] = 1; + cluster_weights_h[0] = 0.0; + cluster_weights_h[1] = 4.0; + + CUDA_TRY(cudaMemcpy( + cluster_v.data().get(), cluster_h.data(), sizeof(int) * num_verts, cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(cluster_weights_v.data().get(), + cluster_weights_h.data(), + sizeof(float) * num_verts, + cudaMemcpyHostToDevice)); + + q = cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); + + ASSERT_FLOAT_EQ(q, float{-12.0 / 144.0}); + + cugraph::detail::compute_delta_modularity(float{12}, + float{1}, + G, + src_indices_v, + vertex_weights_v, + cluster_weights_v, + cluster_v, + cluster_hash_v, + delta_Q_v, + tmp_size_V_v); + + CUDA_TRY(cudaMemcpy(cluster_hash_h.data(), + cluster_hash_v.data().get(), + sizeof(int) * num_edges, + cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy( + delta_Q_h.data(), delta_Q_v.data().get(), sizeof(float) * num_edges, cudaMemcpyDeviceToHost)); + + ASSERT_EQ(cluster_hash_h[10], 2); + ASSERT_EQ(cluster_hash_h[11], 3); + ASSERT_FLOAT_EQ(delta_Q_h[10], float{1.0 / 8.0}); + ASSERT_FLOAT_EQ(delta_Q_h[11], float{1.0 / 8.0}); + + // + // Move vertex 1 into cluster 2. Not the optimal, in fact it will reduce + // modularity (so Louvain would never do this), but let's see if it reduces + // by the expected amount (-12/144). + // + ASSERT_EQ(cluster_hash_h[3], 2); + ASSERT_FLOAT_EQ(delta_Q_h[3], float{-12.0 / 144.0}); + + cluster_h[1] = 2; + cluster_weights_h[1] = 1.0; + cluster_weights_h[2] = 6.0; + + CUDA_TRY(cudaMemcpy( + cluster_v.data().get(), cluster_h.data(), sizeof(int) * num_verts, cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpy(cluster_weights_v.data().get(), + cluster_weights_h.data(), + sizeof(float) * num_verts, + cudaMemcpyHostToDevice)); + + q = cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); + + ASSERT_FLOAT_EQ(q, float{-24.0 / 144.0}); } + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/triangle_test.cu b/cpp/tests/community/triangle_test.cu index 6440284f099..1c5c99261d2 100644 --- a/cpp/tests/community/triangle_test.cu +++ b/cpp/tests/community/triangle_test.cu @@ -8,14 +8,12 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. 
* */ -#include +#include #include #include -#include #include -#include TEST(triangle, dolphin) { @@ -51,16 +49,13 @@ TEST(triangle, dolphin) rmm::device_vector indices_v(ind_h); rmm::device_vector weights_v(w_h); - cugraph::experimental::GraphCSRView graph_csr( + cugraph::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); uint64_t count{0}; - // ASSERT_NO_THROW((count = cugraph::nvgraph::triangle_count(graph_csr))); - try { - count = cugraph::nvgraph::triangle_count(graph_csr); + count = cugraph::triangle::triangle_count(graph_csr); } catch (std::exception& e) { std::cout << "Exception: " << e.what() << std::endl; } @@ -68,11 +63,4 @@ TEST(triangle, dolphin) ASSERT_EQ(count, expected); } -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/components/con_comp_test.cu b/cpp/tests/components/con_comp_test.cu index f2a6cba35c3..15d60867753 100644 --- a/cpp/tests/components/con_comp_test.cu +++ b/cpp/tests/components/con_comp_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -12,17 +12,18 @@ // connected components tests // Author: Andrei Schaffer aschaffer@nvidia.com -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" -#include "high_res_clock.h" +#include +#include +#include + +#include -#include #include #include #include + +#include #include -#include -#include "test_utils.h" // do the perf measurements // enabled by command line parameter s'--perf' @@ -34,7 +35,7 @@ struct Usecase { explicit Usecase(const std::string& a) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { matrix_file = rapidsDatasetRootDir + "/" + a; } else { @@ -71,9 +72,10 @@ struct Tests_Weakly_CC : ::testing::TestWithParam { const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); std::stringstream ss; - std::string test_id = - std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + - std::string("_") + getFileName(param.get_matrix_file()) + std::string("_") + ss.str().c_str(); + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + + cugraph::test::getFileName(param.get_matrix_file()) + std::string("_") + + ss.str().c_str(); int m, k, nnz; // MM_typecode mc; @@ -84,7 +86,7 @@ struct Tests_Weakly_CC : ::testing::TestWithParam { FILE* fpin = fopen(param.get_matrix_file().c_str(), "r"); ASSERT_NE(fpin, nullptr) << "fopen (" << param.get_matrix_file() << ") failure."; - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -104,16 +106,16 @@ struct Tests_Weakly_CC : ::testing::TestWithParam { // Read: COO Format // - ASSERT_EQ((mm_to_coo(fpin, 1, 
nnz, &cooRowInd[0], &cooColInd[0], nullptr, nullptr)), + ASSERT_EQ((cugraph::test::mm_to_coo( + fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], nullptr, nullptr)), 0) << "could not read matrix data" << "\n"; ASSERT_EQ(fclose(fpin), 0); - cugraph::experimental::GraphCOOView G_coo( - &cooRowInd[0], &cooColInd[0], nullptr, m, nnz); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::experimental::GraphCSRView G = G_unique->view(); + cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, m, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); rmm::device_vector d_labels(m); @@ -146,11 +148,4 @@ INSTANTIATE_TEST_CASE_P(simple_test, Usecase("test/datasets/coPapersCiteseer.mtx"), Usecase("test/datasets/hollywood.mtx"))); -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/components/scc_test.cu b/cpp/tests/components/scc_test.cu index e8d15790f68..9d5b55f34c6 100644 --- a/cpp/tests/components/scc_test.cu +++ b/cpp/tests/components/scc_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -12,24 +12,23 @@ // strongly connected components tests // Author: Andrei Schaffer aschaffer@nvidia.com -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" -#include "high_res_clock.h" +#include +#include +#include + +#include +#include +#include +#include +#include + +#include #include #include #include #include -#include "test_utils.h" - -#include -#include -#include - -#include -#include "components/scc_matrix.cuh" -#include "topology/topology.cuh" // do the perf measurements // enabled by command line parameter s'--perf' @@ -37,14 +36,14 @@ static int PERF = 0; template -using DVector = thrust::device_vector; +using DVector = rmm::device_vector; namespace { // un-nammed struct Usecase { explicit Usecase(const std::string& a) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { matrix_file = rapidsDatasetRootDir + "/" + a; } else { @@ -120,9 +119,10 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info(); std::stringstream ss; - std::string test_id = - std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + - std::string("_") + getFileName(param.get_matrix_file()) + std::string("_") + ss.str().c_str(); + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + + cugraph::test::getFileName(param.get_matrix_file()) + std::string("_") + + ss.str().c_str(); using ByteT = unsigned char; using IndexT = int; @@ -136,7 +136,7 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { FILE* fpin = fopen(param.get_matrix_file().c_str(), "r"); ASSERT_NE(fpin, nullptr) << "fopen (" << param.get_matrix_file().c_str() << ") failure."; - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, 
&nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -159,16 +159,16 @@ struct Tests_Strongly_CC : ::testing::TestWithParam { // Read: COO Format // - ASSERT_EQ( - (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], nullptr, nullptr)), 0) + ASSERT_EQ((cugraph::test::mm_to_coo( + fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], nullptr, nullptr)), + 0) << "could not read matrix data" << "\n"; ASSERT_EQ(fclose(fpin), 0); - cugraph::experimental::GraphCOOView G_coo( - &cooRowInd[0], &cooColInd[0], nullptr, m, nnz); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::experimental::GraphCSRView G = G_unique->view(); + cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, m, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); rmm::device_vector d_labels(m); @@ -208,11 +208,4 @@ INSTANTIATE_TEST_CASE_P( Usecase("test/datasets/cage6.mtx") // DG "small" enough to meet SCC GPU memory requirements )); -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/db/find_matches_test.cu b/cpp/tests/db/find_matches_test.cu index 3b44b682d34..c1373bb8bf2 100644 --- a/cpp/tests/db/find_matches_test.cu +++ b/cpp/tests/db/find_matches_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. 
*/ -#include -#include "db/db_operators.cuh" -#include "gtest/gtest.h" -#include "high_res_clock.h" -#include "rmm/device_buffer.hpp" -#include "test_utils.h" -#include "utilities/error_utils.h" -#include "utilities/graph_utils.cuh" +#include +#include +#include +#include + +#include + +#include class Test_FindMatches : public ::testing::Test { public: @@ -229,11 +229,4 @@ TEST_F(Test_FindMatches, fifthTest) ASSERT_EQ(resultB[1], 3); } -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/layout/force_atlas2_test.cu b/cpp/tests/layout/force_atlas2_test.cu index a18f5525bb6..d564765d0df 100644 --- a/cpp/tests/layout/force_atlas2_test.cu +++ b/cpp/tests/layout/force_atlas2_test.cu @@ -12,17 +12,21 @@ // Force_Atlas2 tests // Author: Hugo Linsenmaier hlinsenmaier@nvidia.com -#include +#include +#include +#include + +#include #include -#include #include + +#include +#include + +#include + +#include #include -#include -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" -#include "high_res_clock.h" -#include "test_utils.h" -#include "trust_worthiness.h" // do the perf measurements // enabled by command line parameter s'--perf' @@ -38,7 +42,7 @@ typedef struct Force_Atlas2_Usecase_t { Force_Atlas2_Usecase_t(const std::string& a, const float b) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { matrix_file = rapidsDatasetRootDir + "/" + a; } else { @@ -83,7 +87,8 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam std::stringstream ss; std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + - getFileName(param.matrix_file) + std::string("_") + ss.str().c_str(); + cugraph::test::getFileName(param.matrix_file) + std::string("_") + + ss.str().c_str(); int m, k, nnz; MM_typecode mc; @@ -92,7 +97,7 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam FILE* fpin = fopen(param.matrix_file.c_str(), "r"); ASSERT_NE(fpin, nullptr) << "fopen (" << param.matrix_file << ") failure."; - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -111,7 +116,9 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam float* d_force_atlas2 = force_atlas2_vector.data().get(); // Read - ASSERT_EQ((mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), 0) + ASSERT_EQ((cugraph::test::mm_to_coo( + fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), + 0) << "could not read matrix data" << "\n"; ASSERT_EQ(fclose(fpin), 0); @@ -132,10 +139,11 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam int* dests = dests_v.data().get(); T* weights = weights_v.data().get(); + // FIXME: RAFT error handling mechanism should be used instead CUDA_TRY(cudaMemcpy(srcs, &cooRowInd[0], sizeof(int) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(dests, &cooColInd[0], sizeof(int) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(weights, &cooVal[0], sizeof(T) * nnz, cudaMemcpyDefault)); - cugraph::experimental::GraphCOOView 
G(srcs, dests, weights, m, nnz); + cugraph::GraphCOOView G(srcs, dests, weights, m, nnz); const int max_iter = 500; float* x_start = nullptr; @@ -199,8 +207,7 @@ class Tests_Force_Atlas2 : public ::testing::TestWithParam // Copy pos to host std::vector h_pos(m * 2); - CUDA_RT_CALL( - cudaMemcpy(&h_pos[0], d_force_atlas2, sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(&h_pos[0], d_force_atlas2, sizeof(float) * m * 2, cudaMemcpyDeviceToHost)); // Transpose the data std::vector> C_contiguous_embedding(m, std::vector(2)); @@ -230,11 +237,4 @@ INSTANTIATE_TEST_CASE_P(simple_test, Force_Atlas2_Usecase("test/datasets/netscience.mtx", 0.80))); -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/layout/knn.h b/cpp/tests/layout/knn.h index d42318288fc..07d07528769 100644 --- a/cpp/tests/layout/knn.h +++ b/cpp/tests/layout/knn.h @@ -20,6 +20,7 @@ #include #include #include +#include struct point { point() {} diff --git a/cpp/tests/layout/trust_worthiness.h b/cpp/tests/layout/trust_worthiness.h index 5d3f4436950..40c9782a76e 100644 --- a/cpp/tests/layout/trust_worthiness.h +++ b/cpp/tests/layout/trust_worthiness.h @@ -16,6 +16,10 @@ #include "knn.h" +#include +#include +#include + double euclidian_dist(const std::vector& x, const std::vector& y) { double total = 0; diff --git a/cpp/tests/nccl/degree_test.cu b/cpp/tests/nccl/degree_test.cu deleted file mode 100644 index 9bba66efe1e..00000000000 --- a/cpp/tests/nccl/degree_test.cu +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include "gtest/gtest.h" -#include "test_utils.h" - -// ref Degree on the host -template -void ref_degree_h(std::vector &ind_h, std::vector °ree) -{ - for (size_t i = 0; i < degree.size(); i++) degree[i] = 0; - for (size_t i = 0; i < ind_h.size(); i++) degree[ind_h[i]] += 1; -} - -// global to local offsets by shifting all offsets by the first offset value -template -void shift_by_front(std::vector &v) -{ - auto start = v.front(); - for (auto i = size_t{0}; i < v.size(); ++i) v[i] -= start; -} - -// 1D partitioning such as each GPU has about the same number of edges -template -void opg_edge_partioning( - int r, int p, std::vector &ind_h, std::vector &part_offset, size_t &e_loc) -{ - // set first and last partition offsets - part_offset[0] = 0; - part_offset[p] = ind_h.size(); - // part_offset[p] = *(std::max_element(ind_h.begin(), ind_h.end())); - auto loc_nnz = ind_h.size() / p; - for (int i = 1; i < p; i++) { - // get the first vertex ID of each partition - auto start_nnz = i * loc_nnz; - auto start_v = 0; - for (auto j = size_t{0}; j < ind_h.size(); ++j) { - if (j >= start_nnz) { - start_v = j; - break; - } - } - part_offset[i] = start_v; - } - e_loc = part_offset[r + 1] - part_offset[r]; -} -TEST(degree, success) -{ - int v = 6; - - // host - std::vector src_h = {0, 0, 2, 2, 2, 3, 3, 4, 4, 5, 5}, - dest_h = {1, 2, 0, 1, 4, 4, 5, 3, 5, 3, 1}; - std::vector degree_h(v, 0.0), degree_ref(v, 0.0); - - // MG - int p; - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p)); - cugraph::experimental::Comm comm(p); - std::vector part_offset(p + 1); - auto i = comm.get_rank(); - size_t e_loc; - - opg_edge_partioning(i, p, src_h, part_offset, e_loc); -#ifdef OPG_VERBOSE - sleep(i); - for (auto j = part_offset.begin(); j != part_offset.end(); ++j) std::cout << *j << ' '; - std::cout << std::endl; - std::cout << "eloc: " << e_loc << std::endl; -#endif - std::vector src_loc_h(src_h.begin() + part_offset[i], - src_h.begin() + part_offset[i] + e_loc), - dest_loc_h(dest_h.begin() + part_offset[i], dest_h.begin() + part_offset[i] + e_loc); - shift_by_front(src_loc_h); - - // print mg info - printf("# Rank %2d - Pid %6d - device %2d\n", comm.get_rank(), getpid(), comm.get_dev()); - - // local device - thrust::device_vector src_d(src_loc_h.begin(), src_loc_h.end()); - thrust::device_vector dest_d(dest_loc_h.begin(), dest_loc_h.end()); - thrust::device_vector degree_d(v); - - // load local chunck to cugraph - cugraph::experimental::GraphCOO G(thrust::raw_pointer_cast(src_d.data()), - thrust::raw_pointer_cast(dest_d.data()), - nullptr, - degree_h.size(), - e_loc); - G.set_communicator(comm); - - // OUT degree - G.degree(thrust::raw_pointer_cast(degree_d.data()), cugraph::experimental::DegreeDirection::IN); - thrust::copy(degree_d.begin(), degree_d.end(), degree_h.begin()); - ref_degree_h(dest_h, degree_ref); - // sleep(i); - for (size_t j = 0; j < degree_h.size(); ++j) EXPECT_EQ(degree_ref[j], degree_h[j]); - std::cout << "Rank " << i << " done checking." 
<< std::endl; -} - -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - MPI_Init(&argc, &argv); - { - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - } - MPI_Finalize(); - return rc; -} diff --git a/cpp/tests/nccl/nccl_test.cu b/cpp/tests/nccl/nccl_test.cu deleted file mode 100644 index 6c8bb2043eb..00000000000 --- a/cpp/tests/nccl/nccl_test.cu +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include -#include -#include "gtest/gtest.h" -#include "test_utils.h" - -TEST(allgather, success) -{ - int p = 1, r = 0, dev = 0, dev_count = 0; - MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &p)); - MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &r)); - CUDA_RT_CALL(cudaGetDeviceCount(&dev_count)); - - // shortcut for device ID here - // may need something smarter later - dev = r % dev_count; - // cudaSetDevice must happen before ncclCommInitRank - CUDA_RT_CALL(cudaSetDevice(dev)); - - // print info - printf("# Rank %2d - Pid %6d - device %2d\n", r, getpid(), dev); - - // NCCL init - ncclUniqueId id; - ncclComm_t comm; - if (r == 0) NCCLCHECK(ncclGetUniqueId(&id)); - MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); - NCCLCHECK(ncclCommInitRank(&comm, p, id, r)); - MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); - - // allocate device buffers - int size = 3; - float *sendbuff, *recvbuff; - CUDA_RT_CALL(cudaMalloc(&sendbuff, size * sizeof(float))); - CUDA_RT_CALL(cudaMalloc(&recvbuff, size * p * sizeof(float))); - - // init values - thrust::fill( - thrust::device_pointer_cast(sendbuff), thrust::device_pointer_cast(sendbuff + size), (float)r); - thrust::fill( - thrust::device_pointer_cast(recvbuff), thrust::device_pointer_cast(recvbuff + size * p), -1.0f); - - // ncclAllGather - NCCLCHECK(ncclAllGather( - (const void *)sendbuff, (void *)recvbuff, size, ncclFloat, comm, cudaStreamDefault)); - - // expect each rankid printed size times in ascending order - if (r == 0) { - thrust::device_ptr dev_ptr(recvbuff); - std::cout.precision(15); - thrust::copy(dev_ptr, dev_ptr + size * p, std::ostream_iterator(std::cout, " ")); - std::cout << std::endl; - } - - // free device buffers - CUDA_RT_CALL(cudaFree(sendbuff)); - CUDA_RT_CALL(cudaFree(recvbuff)); - - // finalizing NCCL - NCCLCHECK(ncclCommDestroy(comm)); -} - -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - MPI_Init(&argc, &argv); - { - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - } - MPI_Finalize(); - return rc; -} diff --git a/cpp/tests/pagerank/pagerank_test.cu b/cpp/tests/pagerank/pagerank_test.cpp similarity index 74% rename from cpp/tests/pagerank/pagerank_test.cu rename to cpp/tests/pagerank/pagerank_test.cpp index 977650c6c90..48705f7f324 100644 --- a/cpp/tests/pagerank/pagerank_test.cu +++ b/cpp/tests/pagerank/pagerank_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -12,15 +12,21 @@ // Pagerank solver tests // Author: Alex Fender afender@nvidia.com -#include +#include +#include +#include + #include -#include #include -#include -#include "cuda_profiler_api.h" -#include "gtest/gtest.h" -#include "high_res_clock.h" -#include "test_utils.h" + +#include +#include + +#include + +#include + +#include // do the perf measurements // enabled by command line parameter s'--perf' @@ -36,7 +42,7 @@ typedef struct Pagerank_Usecase_t { Pagerank_Usecase_t(const std::string& a, const std::string& b) { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((a != "") && (a[0] != '/')) { matrix_file = rapidsDatasetRootDir + "/" + a; } else { @@ -81,7 +87,8 @@ class Tests_Pagerank : public ::testing::TestWithParam { std::stringstream ss; std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + - getFileName(param.matrix_file) + std::string("_") + ss.str().c_str(); + cugraph::test::getFileName(param.matrix_file) + std::string("_") + + ss.str().c_str(); int m, k, nnz; MM_typecode mc; @@ -101,7 +108,7 @@ class Tests_Pagerank : public ::testing::TestWithParam { FILE* fpin = fopen(param.matrix_file.c_str(), "r"); ASSERT_NE(fpin, nullptr) << "fopen (" << param.matrix_file << ") failure."; - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -114,37 +121,39 @@ class Tests_Pagerank : public ::testing::TestWithParam { std::vector cooVal(nnz), pagerank(m); // device alloc - rmm::device_vector pagerank_vector(m); - T* d_pagerank = thrust::raw_pointer_cast(pagerank_vector.data()); + rmm::device_uvector pagerank_vector(static_cast(m), nullptr); + T* d_pagerank = pagerank_vector.data(); // Read - ASSERT_EQ((mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), 0) + ASSERT_EQ((cugraph::test::mm_to_coo( + fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)), + 0) << "could not read matrix data" << "\n"; ASSERT_EQ(fclose(fpin), 0); // Pagerank runs on CSC, so feed COOtoCSR the row/col backwards. 
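The comment above is the heart of the hunk that follows: PageRank pulls rank along incoming edges, so the test hands coo_to_csr the column indices where the row indices would normally go, and the compressed result serves as the CSC of the original graph. A hedged sketch of that pattern as a standalone helper (the helper name is invented, cugraph graph headers are assumed to be included, and template arguments are spelled out here even though this diff's own lines lost theirs to formatting):

    #include <vector>

    // Build the CSC of a graph by compressing its transposed COO: pass
    // destinations where sources would normally go, so the resulting offsets
    // index incoming rather than outgoing edges.
    template <typename VT, typename ET, typename WT>
    auto csc_via_transposed_coo(std::vector<VT>& src,
                                std::vector<VT>& dst,
                                std::vector<WT>& wgt,
                                VT num_verts,
                                ET num_edges)
    {
      cugraph::GraphCOOView<VT, ET, WT> coo(
        dst.data(), src.data(), wgt.data(), num_verts, num_edges);  // note the swap
      return cugraph::coo_to_csr(coo);  // unique_ptr owning the CSC-like structure
    }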
- cugraph::experimental::GraphCOOView G_coo( - &cooColInd[0], &cooRowInd[0], &cooVal[0], m, nnz); + raft::handle_t handle; + cugraph::GraphCOOView G_coo(&cooColInd[0], &cooRowInd[0], &cooVal[0], m, nnz); auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::experimental::GraphCSCView G(G_unique->view().offsets, - G_unique->view().indices, - G_unique->view().edge_data, - G_unique->view().number_of_vertices, - G_unique->view().number_of_edges); + cugraph::GraphCSCView G(G_unique->view().offsets, + G_unique->view().indices, + G_unique->view().edge_data, + G_unique->view().number_of_vertices, + G_unique->view().number_of_edges); cudaDeviceSynchronize(); if (PERF) { hr_clock.start(); for (int i = 0; i < PERF_MULTIPLIER; ++i) { - cugraph::pagerank(G, d_pagerank); + cugraph::pagerank(handle, G, d_pagerank); cudaDeviceSynchronize(); } hr_clock.stop(&time_tmp); pagerank_time.push_back(time_tmp); } else { cudaProfilerStart(); - cugraph::pagerank(G, d_pagerank); + cugraph::pagerank(handle, G, d_pagerank); cudaProfilerStop(); cudaDeviceSynchronize(); } @@ -153,14 +162,13 @@ class Tests_Pagerank : public ::testing::TestWithParam { if (param.result_file.length() > 0) { std::vector calculated_res(m); - CUDA_RT_CALL( - cudaMemcpy(&calculated_res[0], d_pagerank, sizeof(T) * m, cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(&calculated_res[0], d_pagerank, sizeof(T) * m, cudaMemcpyDeviceToHost)); std::sort(calculated_res.begin(), calculated_res.end()); fpin = fopen(param.result_file.c_str(), "rb"); ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; std::vector expected_res(m); - ASSERT_EQ(read_binary_vector(fpin, m, expected_res), 0); + ASSERT_EQ(cugraph::test::read_binary_vector(fpin, m, expected_res), 0); fclose(fpin); T err; int n_err = 0; @@ -195,11 +203,4 @@ INSTANTIATE_TEST_CASE_P( Pagerank_Usecase("test/datasets/webbase-1M.mtx", "test/ref/pagerank/webbase-1M.pagerank_val_0.85.bin"))); -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/renumber/renumber_test.cu b/cpp/tests/renumber/renumber_test.cu index 1601eff284f..608adc59ccb 100644 --- a/cpp/tests/renumber/renumber_test.cu +++ b/cpp/tests/renumber/renumber_test.cu @@ -1,7 +1,7 @@ // -*-c++-*- /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,20 +16,19 @@ * limitations under the License. 
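One more substantive change in the PageRank test above is worth calling out: rmm::device_vector (thrust-backed, implicitly zero-initializing) gives way to rmm::device_uvector, which is stream-ordered and deliberately leaves its contents uninitialized. A minimal sketch, assuming the rmm 0.15 API:

    #include <rmm/device_uvector.hpp>

    #include <cuda_runtime.h>

    #include <cstddef>

    void scratch_buffer_example(std::size_t n, cudaStream_t stream)
    {
      // Allocated asynchronously on `stream`; contents are undefined until written.
      rmm::device_uvector<float> v(n, stream);
      // Initialize explicitly only when the algorithm actually needs it:
      cudaMemsetAsync(v.data(), 0, n * sizeof(float), stream);
    }

The payoff is skipping a kernel launch per allocation for buffers the algorithm overwrites anyway, which matters in tests that allocate in loops.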
*/ -#include "gmock/gmock.h" -#include "gtest/gtest.h" +//#include "gmock/gmock.h" -#include "cuda_profiler_api.h" +#include -#include -#include -#include -#include "converters/renumber.cuh" +#include -#include +#include +#include #include +#include + struct RenumberingTest : public ::testing::Test { }; @@ -577,11 +576,4 @@ TEST_F(RenumberingTest, Random500MVertexSet) std::cout << " hash size = " << hash_size << std::endl; } -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/test_utils.h b/cpp/tests/test_utils.h deleted file mode 100644 index ca8555c5cc7..00000000000 --- a/cpp/tests/test_utils.h +++ /dev/null @@ -1,691 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -extern "C" { -#include "mmio.h" -} -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "utilities/error_utils.h" - -#include "converters/COOtoCSR.cuh" - -#ifndef CUDA_RT_CALL -#define CUDA_RT_CALL(call) \ - { \ - cudaError_t cudaStatus = call; \ - if (cudaSuccess != cudaStatus) { \ - fprintf(stderr, \ - "ERROR: CUDA RT call \"%s\" in line %d of file %s failed with %s (%d).\n", \ - #call, \ - __LINE__, \ - __FILE__, \ - cudaGetErrorString(cudaStatus), \ - cudaStatus); \ - } \ - } -#endif - -#define NCCLCHECK(cmd) \ - { \ - ncclResult_t nccl_status = cmd; \ - if (nccl_status != ncclSuccess) { \ - printf("NCCL failure %s:%d '%s'\n", __FILE__, __LINE__, ncclGetErrorString(nccl_status)); \ - FAIL(); \ - } \ - } - -#define MPICHECK(cmd) \ - { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { \ - printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ - FAIL(); \ - } \ - } - -std::string getFileName(const std::string& s) -{ - char sep = '/'; - -#ifdef _WIN32 - sep = '\\'; -#endif - - size_t i = s.rfind(sep, s.length()); - if (i != std::string::npos) { return (s.substr(i + 1, s.length() - i)); } - return (""); -} - -template -void verbose_diff(std::vector& v1, std::vector& v2) -{ - for (unsigned int i = 0; i < v1.size(); ++i) { - if (v1[i] != v2[i]) { - std::cout << "[" << i << "] : " << v1[i] << " vs. " << v2[i] << std::endl; - } - } -} - -template -int eq(std::vector& v1, std::vector& v2) -{ - if (v1 == v2) - return 0; - else { - verbose_diff(v1, v2); - return 1; - } -} - -template -void printv(size_t n, T* vec, int offset) -{ - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy( - dev_ptr + offset, - dev_ptr + offset + n, - std::ostream_iterator( - std::cout, " ")); // Assume no RMM dependency; FIXME: check / test (potential BUG !!!!!) 
- std::cout << std::endl; -} - -template -void random_vals(std::vector& v) -{ - srand(42); - for (auto i = size_t{0}; i < v.size(); i++) v[i] = static_cast(std::rand() % 10); -} - -template -void ref_csr2csc(int m, - int n, - int nnz, - const T_ELEM* csrVals, - const int* csrRowptr, - const int* csrColInd, - T_ELEM* cscVals, - int* cscRowind, - int* cscColptr, - int base = 0) -{ - int i, j, row, col, index; - int* counters; - T_ELEM val; - - /* early return */ - if ((m <= 0) || (n <= 0) || (nnz <= 0)) { return; } - - /* build compressed column pointers */ - memset(cscColptr, 0, (n + 1) * sizeof(cscColptr[0])); - cscColptr[0] = base; - for (i = 0; i < nnz; i++) { cscColptr[1 + csrColInd[i] - base]++; } - for (i = 0; i < n; i++) { cscColptr[i + 1] += cscColptr[i]; } - - /* expand row indecis and copy them and values into csc arrays according to permutation */ - counters = (int*)malloc(n * sizeof(counters[0])); - memset(counters, 0, n * sizeof(counters[0])); - for (i = 0; i < m; i++) { - for (j = csrRowptr[i]; j < csrRowptr[i + 1]; j++) { - row = i + base; - col = csrColInd[j - base]; - - index = cscColptr[col - base] - base + counters[col - base]; - counters[col - base]++; - - cscRowind[index] = row; - - if (csrVals != NULL || cscVals != NULL) { - val = csrVals[j - base]; - cscVals[index] = val; - } - } - } - free(counters); -} - -template -int transition_matrix_cpu(int n, int e, int* csrRowPtrA, int* csrColIndA, T* weight, T* is_leaf) -// omp_set_num_threads(4); -//#pragma omp parallel -{ - int j, row, row_size; - //#pragma omp for - for (row = 0; row < n; row++) { - row_size = csrRowPtrA[row + 1] - csrRowPtrA[row]; - if (row_size == 0) - is_leaf[row] = 1.0; - else { - is_leaf[row] = 0.0; - for (j = csrRowPtrA[row]; j < csrRowPtrA[row + 1]; j++) weight[j] = 1.0 / row_size; - } - } - return 0; -} -template -void printCsrMatI(int m, - int n, - int nnz, - std::vector& csrRowPtr, - std::vector& csrColInd, - std::vector& csrVal) -{ - std::vector v(n); - std::stringstream ss; - ss.str(std::string()); - ss << std::fixed; - ss << std::setprecision(2); - for (int i = 0; i < m; i++) { - std::fill(v.begin(), v.end(), 0); - for (int j = csrRowPtr[i]; j < csrRowPtr[i + 1]; j++) v[csrColInd[j]] = csrVal[j]; - - std::copy(v.begin(), v.end(), std::ostream_iterator(ss, " ")); - ss << "\n"; - } - ss << "\n"; - std::cout << ss.str(); -} - -/// Read matrix properties from Matrix Market file -/** Matrix Market file is assumed to be a sparse matrix in coordinate - * format. - * - * @param f File stream for Matrix Market file. - * @param tg Boolean indicating whether to convert matrix to general - * format (from symmetric, Hermitian, or skew symmetric format). - * @param t (Output) MM_typecode with matrix properties. - * @param m (Output) Number of matrix rows. - * @param n (Output) Number of matrix columns. - * @param nnz (Output) Number of non-zero matrix entries. - * @return Zero if properties were read successfully. Otherwise - * non-zero. 
- */ -template -int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz) -{ - // Read matrix properties from file - int mint, nint, nnzint; - if (fseek(f, 0, SEEK_SET)) { - fprintf(stderr, "Error: could not set position in file\n"); - return -1; - } - if (mm_read_banner(f, t)) { - fprintf(stderr, "Error: could not read Matrix Market file banner\n"); - return -1; - } - if (!mm_is_matrix(*t) || !mm_is_coordinate(*t)) { - fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); - return -1; - } - if (mm_read_mtx_crd_size(f, &mint, &nint, &nnzint)) { - fprintf(stderr, "Error: could not read matrix dimensions\n"); - return -1; - } - if (!mm_is_pattern(*t) && !mm_is_real(*t) && !mm_is_integer(*t) && !mm_is_complex(*t)) { - fprintf(stderr, "Error: matrix entries are not valid type\n"); - return -1; - } - *m = mint; - *n = nint; - *nnz = nnzint; - - // Find total number of non-zero entries - if (tg && !mm_is_general(*t)) { - // Non-diagonal entries should be counted twice - IndexType_ nnzOld = *nnz; - *nnz *= 2; - - // Diagonal entries should not be double-counted - int i; - int st; - for (i = 0; i < nnzOld; ++i) { - // Read matrix entry - IndexType_ row, col; - double rval, ival; - if (mm_is_pattern(*t)) - st = fscanf(f, "%d %d\n", &row, &col); - else if (mm_is_real(*t) || mm_is_integer(*t)) - st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); - else // Complex matrix - st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); - if (ferror(f) || (st == EOF)) { - fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1); - return -1; - } - - // Check if entry is diagonal - if (row == col) --(*nnz); - } - } - - return 0; -} - -/// Read Matrix Market file and convert to COO format matrix -/** Matrix Market file is assumed to be a sparse matrix in coordinate - * format. - * - * @param f File stream for Matrix Market file. - * @param tg Boolean indicating whether to convert matrix to general - * format (from symmetric, Hermitian, or skew symmetric format). - * @param nnz Number of non-zero matrix entries. - * @param cooRowInd (Output) Row indices for COO matrix. Should have - * at least nnz entries. - * @param cooColInd (Output) Column indices for COO matrix. Should - * have at least nnz entries. - * @param cooRVal (Output) Real component of COO matrix - * entries. Should have at least nnz entries. Ignored if null - * pointer. - * @param cooIVal (Output) Imaginary component of COO matrix - * entries. Should have at least nnz entries. Ignored if null - * pointer. - * @return Zero if matrix was read successfully. Otherwise non-zero. 
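Both readers documented in this deleted header survive the refactor under the cugraph::test namespace, as the call sites elsewhere in this diff show (the bare #include lines in this patch lost their targets to formatting; the new home is assumed to be a test utilities header under cpp/tests/utilities). A hedged sketch of the pair in use, with the MM_typecode type coming from the bundled mmio header:

    #include <cstdio>
    #include <vector>

    // Illustrative helper, not part of the tests: load an .mtx file into COO arrays.
    void load_mtx_as_coo(char const* path)
    {
      FILE* f = std::fopen(path, "r");
      MM_typecode mc;
      int m{}, k{}, nnz{};
      cugraph::test::mm_properties<int>(f, 1, &mc, &m, &k, &nnz);

      std::vector<int> row(nnz), col(nnz);
      std::vector<float> val(nnz);
      cugraph::test::mm_to_coo<int, float>(f, 1, nnz, row.data(), col.data(), val.data(), nullptr);
      std::fclose(f);
    }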
- */ -template -int mm_to_coo(FILE* f, - int tg, - IndexType_ nnz, - IndexType_* cooRowInd, - IndexType_* cooColInd, - ValueType_* cooRVal, - ValueType_* cooIVal) -{ - // Read matrix properties from file - MM_typecode t; - int m, n, nnzOld; - if (fseek(f, 0, SEEK_SET)) { - fprintf(stderr, "Error: could not set position in file\n"); - return -1; - } - if (mm_read_banner(f, &t)) { - fprintf(stderr, "Error: could not read Matrix Market file banner\n"); - return -1; - } - if (!mm_is_matrix(t) || !mm_is_coordinate(t)) { - fprintf(stderr, "Error: file does not contain matrix in coordinate format\n"); - return -1; - } - if (mm_read_mtx_crd_size(f, &m, &n, &nnzOld)) { - fprintf(stderr, "Error: could not read matrix dimensions\n"); - return -1; - } - if (!mm_is_pattern(t) && !mm_is_real(t) && !mm_is_integer(t) && !mm_is_complex(t)) { - fprintf(stderr, "Error: matrix entries are not valid type\n"); - return -1; - } - - // Add each matrix entry in file to COO format matrix - IndexType_ i; // Entry index in Matrix Market file - IndexType_ j = 0; // Entry index in COO format matrix - for (i = 0; i < nnzOld; ++i) { - // Read entry from file - int row, col; - double rval, ival; - int st; - if (mm_is_pattern(t)) { - st = fscanf(f, "%d %d\n", &row, &col); - rval = 1.0; - ival = 0.0; - } else if (mm_is_real(t) || mm_is_integer(t)) { - st = fscanf(f, "%d %d %lg\n", &row, &col, &rval); - ival = 0.0; - } else // Complex matrix - st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival); - if (ferror(f) || (st == EOF)) { - fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1); - return -1; - } - - // Switch to 0-based indexing - --row; - --col; - - // Record entry - cooRowInd[j] = row; - cooColInd[j] = col; - if (cooRVal != NULL) cooRVal[j] = rval; - if (cooIVal != NULL) cooIVal[j] = ival; - ++j; - - // Add symmetric complement of non-diagonal entries - if (tg && !mm_is_general(t) && (row != col)) { - // Modify entry value if matrix is skew symmetric or Hermitian - if (mm_is_skew(t)) { - rval = -rval; - ival = -ival; - } else if (mm_is_hermitian(t)) { - ival = -ival; - } - - // Record entry - cooRowInd[j] = col; - cooColInd[j] = row; - if (cooRVal != NULL) cooRVal[j] = rval; - if (cooIVal != NULL) cooIVal[j] = ival; - ++j; - } - } - return 0; -} - -/// Compare two tuples based on the element indexed by i -class lesser_tuple { - const int i; - - public: - lesser_tuple(int _i) : i(_i) {} - template - __host__ __device__ bool operator()(const Tuple1 t1, const Tuple2 t2) - { - switch (i) { - case 0: - return (thrust::get<0>(t1) == thrust::get<0>(t2) ? thrust::get<1>(t1) < thrust::get<1>(t2) - : thrust::get<0>(t1) < thrust::get<0>(t2)); - case 1: - return (thrust::get<1>(t1) == thrust::get<1>(t2) ? thrust::get<0>(t1) < thrust::get<0>(t2) - : thrust::get<1>(t1) < thrust::get<1>(t2)); - default: - return (thrust::get<0>(t1) == thrust::get<0>(t2) ? thrust::get<1>(t1) < thrust::get<1>(t2) - : thrust::get<0>(t1) < thrust::get<0>(t2)); - } - } -}; - -/// Sort entries in COO format matrix -/** Sort is stable. - * - * @param nnz Number of non-zero matrix entries. - * @param sort_by_row Boolean indicating whether matrix entries - * will be sorted by row index or by column index. - * @param cooRowInd Row indices for COO matrix. - * @param cooColInd Column indices for COO matrix. - * @param cooRVal Real component for COO matrix entries. Ignored if - * null pointer. - * @param cooIVal Imaginary component COO matrix entries. Ignored if - * null pointer. 
- */ -template -void coo_sort(IndexType_ nnz, - int sort_by_row, - IndexType_* cooRowInd, - IndexType_* cooColInd, - ValueType_* cooRVal, - ValueType_* cooIVal) -{ - // Determine whether to sort by row or by column - int i; - if (sort_by_row == 0) - i = 1; - else - i = 0; - - // Apply stable sort - using namespace thrust; - if ((cooRVal == NULL) && (cooIVal == NULL)) - stable_sort(make_zip_iterator(make_tuple(cooRowInd, cooColInd)), - make_zip_iterator(make_tuple(cooRowInd + nnz, cooColInd + nnz)), - lesser_tuple(i)); - else if ((cooRVal == NULL) && (cooIVal != NULL)) - stable_sort(make_zip_iterator(make_tuple(cooRowInd, cooColInd, cooIVal)), - make_zip_iterator(make_tuple(cooRowInd + nnz, cooColInd + nnz, cooIVal + nnz)), - lesser_tuple(i)); - else if ((cooRVal != NULL) && (cooIVal == NULL)) - stable_sort(make_zip_iterator(make_tuple(cooRowInd, cooColInd, cooRVal)), - make_zip_iterator(make_tuple(cooRowInd + nnz, cooColInd + nnz, cooRVal + nnz)), - lesser_tuple(i)); - else - stable_sort( - make_zip_iterator(make_tuple(cooRowInd, cooColInd, cooRVal, cooIVal)), - make_zip_iterator(make_tuple(cooRowInd + nnz, cooColInd + nnz, cooRVal + nnz, cooIVal + nnz)), - lesser_tuple(i)); -} - -template -void coo2csr(std::vector& cooRowInd, // in: I[] (overwrite) - const std::vector& cooColInd, // in: J[] - std::vector& csrRowPtr, // out - std::vector& csrColInd) // out -{ - std::vector> items; - for (auto i = size_t{0}; i < cooRowInd.size(); ++i) - items.push_back(std::make_pair(cooRowInd[i], cooColInd[i])); - // sort pairs - std::sort(items.begin(), - items.end(), - [](const std::pair& left, const std::pair& right) { - return left.first < right.first; - }); - for (auto i = size_t{0}; i < cooRowInd.size(); ++i) { - cooRowInd[i] = items[i].first; // save the sorted rows to compress them later - csrColInd[i] = items[i].second; // save the col idx, not sure if they are sorted for each row - } - // Count number of elements per row - for (auto i = size_t{0}; i < cooRowInd.size(); ++i) ++(csrRowPtr[cooRowInd[i] + 1]); - - // Compute cumulative sum to obtain row offsets/pointers - for (auto i = size_t{0}; i < csrRowPtr.size() - 1; ++i) csrRowPtr[i + 1] += csrRowPtr[i]; -} - -/// Compress sorted list of indices -/** For use in converting COO format matrix to CSR or CSC format. - * - * @param n Maximum index. - * @param nnz Number of non-zero matrix entries. - * @param sortedIndices Sorted list of indices (COO format). - * @param compressedIndices (Output) Compressed list of indices (CSR - * or CSC format). Should have at least n+1 entries. - */ -template -void coo_compress(IndexType_ m, - IndexType_ n, - IndexType_ nnz, - const IndexType_* __restrict__ sortedIndices, - IndexType_* __restrict__ compressedIndices) -{ - IndexType_ i; - - // Initialize everything to zero - memset(compressedIndices, 0, (m + 1) * sizeof(IndexType_)); - - // Count number of elements per row - for (i = 0; i < nnz; ++i) ++(compressedIndices[sortedIndices[i] + 1]); - - // Compute cumulative sum to obtain row offsets/pointers - for (i = 0; i < m; ++i) compressedIndices[i + 1] += compressedIndices[i]; -} - -/// Convert COO format matrix to CSR format -/** On output, matrix entries in COO format matrix will be sorted - * (primarily by row index, secondarily by column index). - * - * @param m Number of matrix rows. - * @param n Number of matrix columns. - * @param nnz Number of non-zero matrix entries. - * @param cooRowInd Row indices for COO matrix. - * @param cooColInd Column indices for COO matrix. 
- * @param cooRVal Real component of COO matrix entries. Ignored if - * null pointer. - * @param cooIVal Imaginary component of COO matrix entries. Ignored - * if null pointer. - * @param csrRowPtr Row pointers for CSR matrix. Should have at least - * n+1 entries. - * @param csrColInd Column indices for CSR matrix (identical to - * output of cooColInd). Should have at least nnz entries. Ignored if - * null pointer. - * @param csrRVal Real component of CSR matrix entries (identical to - * output of cooRVal). Should have at least nnz entries. Ignored if - * null pointer. - * @param csrIVal Imaginary component of CSR matrix entries - * (identical to output of cooIVal). Should have at least nnz - * entries. Ignored if null pointer. - * @return Zero if matrix was converted successfully. Otherwise - * non-zero. - */ -template -int coo_to_csr(IndexType_ m, - IndexType_ n, - IndexType_ nnz, - IndexType_* __restrict__ cooRowInd, - IndexType_* __restrict__ cooColInd, - ValueType_* __restrict__ cooRVal, - ValueType_* __restrict__ cooIVal, - IndexType_* __restrict__ csrRowPtr, - IndexType_* __restrict__ csrColInd, - ValueType_* __restrict__ csrRVal, - ValueType_* __restrict__ csrIVal) -{ - // Convert COO to CSR matrix - coo_sort(nnz, 0, cooRowInd, cooColInd, cooRVal, cooIVal); - coo_sort(nnz, 1, cooRowInd, cooColInd, cooRVal, cooIVal); - // coo_sort2(m, nnz, cooRowInd, cooColInd); - coo_compress(m, n, nnz, cooRowInd, csrRowPtr); - - // Copy arrays - if (csrColInd != NULL) memcpy(csrColInd, cooColInd, nnz * sizeof(IndexType_)); - if ((cooRVal != NULL) && (csrRVal != NULL)) memcpy(csrRVal, cooRVal, nnz * sizeof(ValueType_)); - if ((cooIVal != NULL) && (csrIVal != NULL)) memcpy(csrIVal, cooIVal, nnz * sizeof(ValueType_)); - - return 0; -} - -int read_binary_vector(FILE* fpin, int n, std::vector& val) -{ - size_t is_read1; - - double* t_storage = new double[n]; - is_read1 = fread(t_storage, sizeof(double), n, fpin); - for (int i = 0; i < n; i++) { - if (t_storage[i] == DBL_MAX) - val[i] = FLT_MAX; - else if (t_storage[i] == -DBL_MAX) - val[i] = -FLT_MAX; - else - val[i] = static_cast(t_storage[i]); - } - delete[] t_storage; - - if (is_read1 != (size_t)n) { - printf("%s", "I/O fail\n"); - return 1; - } - return 0; -} - -int read_binary_vector(FILE* fpin, int n, std::vector& val) -{ - size_t is_read1; - - is_read1 = fread(&val[0], sizeof(double), n, fpin); - - if (is_read1 != (size_t)n) { - printf("%s", "I/O fail\n"); - return 1; - } - return 0; -} - -// FIXME: A similar function could be useful for CSC format -// There are functions above that operate coo -> csr and coo->csc -/** - * @tparam - */ -template -std::unique_ptr> generate_graph_csr_from_mm( - bool& directed, std::string mm_file) -{ - VT number_of_vertices; - ET number_of_edges; - - FILE* fpin = fopen(mm_file.c_str(), "r"); - EXPECT_NE(fpin, nullptr); - - VT number_of_columns = 0; - MM_typecode mm_typecode{0}; - EXPECT_EQ(mm_properties( - fpin, 1, &mm_typecode, &number_of_vertices, &number_of_columns, &number_of_edges), - 0); - EXPECT_TRUE(mm_is_matrix(mm_typecode)); - EXPECT_TRUE(mm_is_coordinate(mm_typecode)); - EXPECT_FALSE(mm_is_complex(mm_typecode)); - EXPECT_FALSE(mm_is_skew(mm_typecode)); - - directed = !mm_is_symmetric(mm_typecode); - - // Allocate memory on host - std::vector coo_row_ind(number_of_edges); - std::vector coo_col_ind(number_of_edges); - std::vector coo_val(number_of_edges); - - // Read - EXPECT_EQ((mm_to_coo( - fpin, 1, number_of_edges, &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], NULL)), - 0); - EXPECT_EQ(fclose(fpin), 
0); - - cugraph::experimental::GraphCOOView cooview( - &coo_col_ind[0], &coo_row_ind[0], &coo_val[0], number_of_vertices, number_of_edges); - - return cugraph::coo_to_csr(cooview); -} - -//////////////////////////////////////////////////////////////////////////////// -// FIXME: move this code to rapids-core -//////////////////////////////////////////////////////////////////////////////// - -// Define RAPIDS_DATASET_ROOT_DIR using a preprocessor variable to -// allow for a build to override the default. This is useful for -// having different builds for specific default dataset locations. -#ifndef RAPIDS_DATASET_ROOT_DIR -#define RAPIDS_DATASET_ROOT_DIR "/datasets" -#endif - -static const std::string& get_rapids_dataset_root_dir() -{ - static std::string rdrd(""); - // Env var always overrides the value of RAPIDS_DATASET_ROOT_DIR - if (rdrd == "") { - const char* envVar = std::getenv("RAPIDS_DATASET_ROOT_DIR"); - rdrd = (envVar != NULL) ? envVar : RAPIDS_DATASET_ROOT_DIR; - } - return rdrd; -} diff --git a/cpp/tests/test_utils.hpp b/cpp/tests/test_utils.hpp deleted file mode 100644 index f711705699a..00000000000 --- a/cpp/tests/test_utils.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -#include - -#include - -namespace detail { - -template -rmm::device_buffer make_elements(InputIterator begin, InputIterator end) -{ - static_assert(cudf::is_fixed_width(), "Unexpected non-fixed width type."); - std::vector elements(begin, end); - return rmm::device_buffer{elements.data(), elements.size() * sizeof(Element)}; -} - -template -std::unique_ptr create_column(iterator_t begin, iterator_t end) -{ - cudf::size_type size = thrust::distance(begin, end); - - return std::unique_ptr( - new cudf::column{cudf::data_type{cudf::experimental::type_to_id()}, - size, - detail::make_elements(begin, end)}); -} - -} // namespace detail diff --git a/cpp/tests/traversal/bfs_ref.h b/cpp/tests/traversal/bfs_ref.h index c13342fa4f5..a32b2f99787 100644 --- a/cpp/tests/traversal/bfs_ref.h +++ b/cpp/tests/traversal/bfs_ref.h @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -69,4 +70,4 @@ void ref_bfs(VT *indices, } } } -} \ No newline at end of file +} diff --git a/cpp/tests/traversal/bfs_test.cu b/cpp/tests/traversal/bfs_test.cu index 46ba2af2e6a..d90da4367a0 100644 --- a/cpp/tests/traversal/bfs_test.cu +++ b/cpp/tests/traversal/bfs_test.cu @@ -14,20 +14,20 @@ * limitations under the License. 
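A few hunks back, this patch deletes get_rapids_dataset_root_dir from test_utils.h; it reappears in cugraph::test with the same lookup order, where the RAPIDS_DATASET_ROOT_DIR environment variable always overrides the compile-time default of "/datasets". Restated as a standalone sketch of the pattern:

    #include <cstdlib>
    #include <string>

    std::string dataset_root_dir()
    {
      // Environment variable wins over the baked-in default.
      char const* env = std::getenv("RAPIDS_DATASET_ROOT_DIR");
      return (env != nullptr) ? std::string(env) : std::string("/datasets");
    }

In practice this means a run like RAPIDS_DATASET_ROOT_DIR=/my/data ./PAGERANK_TEST points every Usecase at a local dataset checkout without rebuilding.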
*/ -#include -#include -#include +#include "bfs_ref.h" + +#include +#include #include #include -#include -#include "gtest/gtest.h" -#include "test_utils.h" +#include -#include -#include "bfs_ref.h" +#include +#include +#include // NOTE: This could be common to other files but we might not want the same precision // depending on the algorithm @@ -61,7 +61,7 @@ typedef struct BFS_Usecase_t { int source_; // Starting point from the traversal BFS_Usecase_t(const std::string &config, int source) : config_(config), source_(source) { - const std::string &rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string &rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -71,6 +71,8 @@ typedef struct BFS_Usecase_t { } BFS_Usecase; class Tests_BFS : public ::testing::TestWithParam { + raft::handle_t handle; + public: Tests_BFS() {} static void SetupTestCase() {} @@ -90,13 +92,13 @@ class Tests_BFS : public ::testing::TestWithParam { VT number_of_vertices; ET number_of_edges; bool directed = false; - auto csr = generate_graph_csr_from_mm(directed, configuration.file_path_); + auto csr = + cugraph::test::generate_graph_csr_from_mm(directed, configuration.file_path_); cudaDeviceSynchronize(); - cugraph::experimental::GraphCSRView G = csr->view(); - G.prop.directed = directed; - CUDA_CHECK_LAST(); + cugraph::GraphCSRView G = csr->view(); + G.prop.directed = directed; - ASSERT_TRUE(configuration.source_ >= 0 && configuration.source_ <= G.number_of_vertices) + ASSERT_TRUE(configuration.source_ >= 0 && (VT)configuration.source_ < G.number_of_vertices) << "Starting sources should be >= 0 and" << " less than the number of vertices in the graph"; @@ -138,10 +140,13 @@ class Tests_BFS : public ::testing::TestWithParam { std::vector cugraph_pred(number_of_vertices); std::vector cugraph_sigmas(number_of_vertices); - cugraph::bfs(G, + // Don't pass valid sp_sp_counter ptr unless needed because it disables + // the bottom up flow + cugraph::bfs(handle, + G, d_cugraph_dist.data().get(), d_cugraph_pred.data().get(), - d_cugraph_sigmas.data().get(), + (return_sp_counter) ? 
d_cugraph_sigmas.data().get() : nullptr, source, G.prop.directed); CUDA_TRY(cudaMemcpy(cugraph_dist.data(), @@ -152,10 +157,13 @@ class Tests_BFS : public ::testing::TestWithParam { d_cugraph_pred.data().get(), sizeof(VT) * d_cugraph_pred.size(), cudaMemcpyDeviceToHost)); - CUDA_TRY(cudaMemcpy(cugraph_sigmas.data(), - d_cugraph_sigmas.data().get(), - sizeof(double) * d_cugraph_sigmas.size(), - cudaMemcpyDeviceToHost)); + + if (return_sp_counter) { + CUDA_TRY(cudaMemcpy(cugraph_sigmas.data(), + d_cugraph_sigmas.data().get(), + sizeof(double) * d_cugraph_sigmas.size(), + cudaMemcpyDeviceToHost)); + } for (VT i = 0; i < number_of_vertices; ++i) { // Check distances: should be an exact match as we use signed int 32-bit @@ -166,7 +174,8 @@ class Tests_BFS : public ::testing::TestWithParam { // that the predecessor obtained with the GPU implementation is one of the // predecessors obtained during the C++ BFS traversal VT pred = cugraph_pred[i]; // It could be equal to -1 if the node is never reached - if (pred == -1) { + constexpr VT invalid_vid = cugraph::invalid_vertex_id::value; + if (pred == invalid_vid) { EXPECT_TRUE(ref_bfs_pred[i].empty()) << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph had not predecessor," << "while c++ ref found at least one."; @@ -179,10 +188,6 @@ class Tests_BFS : public ::testing::TestWithParam { << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph = " << cugraph_sigmas[i] << " , c++ ref did not consider it as a predecessor."; } - EXPECT_TRUE( - compare_close(cugraph_sigmas[i], ref_bfs_sigmas[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; if (return_sp_counter) { EXPECT_TRUE( @@ -197,16 +202,27 @@ class Tests_BFS : public ::testing::TestWithParam { // ============================================================================ // Tests // ============================================================================ -TEST_P(Tests_BFS, CheckFP32_NO_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckFP64_NO_SP_COUNTER) +// We don't need to test WT for both float and double since it's anyway ignored in BFS +TEST_P(Tests_BFS, CheckUint32_NO_SP_COUNTER) { - run_current_test(GetParam()); + run_current_test(GetParam()); +} +TEST_P(Tests_BFS, CheckInt_NO_SP_COUNTER) { run_current_test(GetParam()); } +TEST_P(Tests_BFS, CheckInt64_NO_SP_COUNTER) +{ + run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckFP32_SP_COUNTER) { run_current_test(GetParam()); } - -TEST_P(Tests_BFS, CheckFP64_SP_COUNTER) { run_current_test(GetParam()); } +TEST_P(Tests_BFS, CheckUint32_SP_COUNTER) +{ + run_current_test(GetParam()); +} +TEST_P(Tests_BFS, CheckInt_SP_COUNTER) { run_current_test(GetParam()); } +TEST_P(Tests_BFS, CheckInt64_SP_COUNTER) +{ + run_current_test(GetParam()); +} INSTANTIATE_TEST_CASE_P(simple_test, Tests_BFS, @@ -217,11 +233,4 @@ INSTANTIATE_TEST_CASE_P(simple_test, BFS_Usecase("test/datasets/wiki2003.mtx", 1000), BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/sssp_test.cu b/cpp/tests/traversal/sssp_test.cu index 0c27674f94a..ea56d1d79cb 100644 --- a/cpp/tests/traversal/sssp_test.cu +++ b/cpp/tests/traversal/sssp_test.cu @@ -9,21 +9,20 @@ * */ -#include -#include +#include 
+#include +#include + +#include +#include +#include + #include + #include #include #include #include -#include "high_res_clock.h" -#include "test_utils.h" - -#include - -#include -#include "algorithms.hpp" -#include "graph.hpp" typedef enum graph_type { RMAT, MTX } GraphType; @@ -128,7 +127,7 @@ typedef struct SSSP_Usecase_t { // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR // FIXME: Use platform independent stuff from c++14/17 on compiler update if (type_ == MTX) { - const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); if ((config_ != "") && (config_[0] != '/')) { file_path_ = rapidsDatasetRootDir + "/" + config_; } else { @@ -203,7 +202,7 @@ class Tests_SSSP : public ::testing::TestWithParam { ASSERT_NE(fpin, static_cast(nullptr)) << "fopen (" << param.file_path_ << ") failure."; // mm_properties has only one template param which should be fixed there - ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) + ASSERT_EQ(cugraph::test::mm_properties(fpin, 1, &mc, &m, &k, &nnz), 0) << "could not read Matrix Market file properties" << "\n"; ASSERT_TRUE(mm_is_matrix(mc)); @@ -218,24 +217,24 @@ class Tests_SSSP : public ::testing::TestWithParam { // Read weights if given if (!mm_is_pattern(mc)) { cooVal.resize(nnz); - ASSERT_EQ((mm_to_coo(fpin, - 1, - nnz, - &cooRowInd[0], - &cooColInd[0], - &cooVal[0], - static_cast(nullptr))), + ASSERT_EQ((cugraph::test::mm_to_coo(fpin, + 1, + nnz, + &cooRowInd[0], + &cooColInd[0], + &cooVal[0], + static_cast(nullptr))), 0) << "could not read matrix data" << "\n"; } else { - ASSERT_EQ((mm_to_coo(fpin, - 1, - nnz, - &cooRowInd[0], - &cooColInd[0], - static_cast(nullptr), - static_cast(nullptr))), + ASSERT_EQ((cugraph::test::mm_to_coo(fpin, + 1, + nnz, + &cooRowInd[0], + &cooColInd[0], + static_cast(nullptr), + static_cast(nullptr))), 0) << "could not read matrix data" << "\n"; @@ -256,14 +255,14 @@ class Tests_SSSP : public ::testing::TestWithParam { ASSERT_TRUE(0); } - cugraph::experimental::GraphCOOView G_coo( + cugraph::GraphCOOView G_coo( &cooRowInd[0], &cooColInd[0], (DoRandomWeights ? &cooVal[0] : nullptr), num_vertices, num_edges); - auto G_unique = cugraph::coo_to_csr(G_coo); - cugraph::experimental::GraphCSRView G = G_unique->view(); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); cudaDeviceSynchronize(); std::vector dist_vec; @@ -432,11 +431,4 @@ INSTANTIATE_TEST_CASE_P(simple_test, SSSP_Usecase(MTX, "test/datasets/wiki2003.mtx", 100000), SSSP_Usecase(MTX, "test/datasets/karate.mtx", 1))); -int main(int argc, char** argv) -{ - testing::InitGoogleTest(&argc, argv); - auto resource = std::make_unique(); - rmm::mr::set_default_resource(resource.get()); - int rc = RUN_ALL_TESTS(); - return rc; -} +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp new file mode 100644 index 00000000000..535b4b9c79e --- /dev/null +++ b/cpp/tests/utilities/base_fixture.hpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <utilities/cxxopts.hpp>
+#include <utilities/error.hpp>
+
+#include <gtest/gtest.h>
+
+#include <rmm/mr/device/binning_memory_resource.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+namespace cugraph {
+namespace test {
+
+/**
+ * @brief Base test fixture class from which all cuGraph tests should inherit.
+ *
+ * Example:
+ * ```
+ * class MyTestFixture : public cugraph::test::BaseFixture {};
+ * ```
+ **/
+class BaseFixture : public ::testing::Test {
+  rmm::mr::device_memory_resource *_mr{rmm::mr::get_current_device_resource()};
+
+ public:
+  /**
+   * @brief Returns pointer to `device_memory_resource` that should be used for
+   * all tests inheriting from this fixture
+   **/
+  rmm::mr::device_memory_resource *mr() { return _mr; }
+};
+
+/// MR factory functions
+inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
+
+inline auto make_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
+}
+
+inline auto make_binning()
+{
+  auto pool = make_pool();
+  // Add a fixed_size_memory_resource for bins of size 256, 512, 1024, 2048 and 4096KiB
+  // Larger allocations will use the pool resource
+  auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(pool, 18, 22);
+  return mr;
+}
+
+/**
+ * @brief Creates a memory resource for the unit test environment
+ * given the name of the allocation mode.
+ *
+ * The returned resource instance must be kept alive for the duration of
+ * the tests. Attaching the resource to a TestEnvironment causes
+ * issues since the environment objects are not destroyed until
+ * after the runtime is shutdown.
+ *
+ * @throw cugraph::logic_error if the `allocation_mode` is unsupported.
+ *
+ * @param allocation_mode String identifies which resource type.
+ * Accepted types are "pool", "cuda", "binning", and "managed" only.
+ * @return Memory resource instance
+ */
+inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+  std::string const &allocation_mode)
+{
+  if (allocation_mode == "binning") return make_binning();
+  if (allocation_mode == "cuda") return make_cuda();
+  if (allocation_mode == "pool") return make_pool();
+  if (allocation_mode == "managed") return make_managed();
+  CUGRAPH_FAIL("Invalid RMM allocation mode");
+}
+
+}  // namespace test
+}  // namespace cugraph
+
+/**
+ * @brief Parses the cuGraph test command line options.
+ *
+ * Currently only supports the 'rmm_mode' string parameter, which sets the RMM
+ * allocation mode. The default value of the parameter is 'pool'.
+ *
+ * @return Parsing results in the form of cxxopts::ParseResult
+ */
+inline auto parse_test_options(int argc, char **argv)
+{
+  try {
+    cxxopts::Options options(argv[0], " - cuGraph tests command line options");
+    options.allow_unrecognised_options().add_options()(
+      "rmm_mode", "RMM allocation mode", cxxopts::value<std::string>()->default_value("pool"));
+
+    return options.parse(argc, argv);
+  } catch (const cxxopts::OptionException &e) {
+    CUGRAPH_FAIL("Error parsing command line options");
+  }
+}
+
+/**
+ * @brief Macro that defines main function for gtest programs that use rmm
+ *
+ * Should be included in every test program that uses rmm allocators since
+ * it maintains the lifespan of the rmm default memory resource.
+ * This `main` function is a wrapper around the google test generated `main`,
+ * maintaining the original functionality. In addition, this custom `main`
+ * function parses the command line to customize test behavior, like the
+ * allocation mode used for creating the default memory resource.
+ *
+ */
+#define CUGRAPH_TEST_PROGRAM_MAIN()                                   \
+  int main(int argc, char **argv)                                     \
+  {                                                                   \
+    ::testing::InitGoogleTest(&argc, argv);                           \
+    auto const cmd_opts = parse_test_options(argc, argv);             \
+    auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();     \
+    auto resource = cugraph::test::create_memory_resource(rmm_mode);  \
+    rmm::mr::set_current_device_resource(resource.get());             \
+    return RUN_ALL_TESTS();                                           \
+  }
diff --git a/cpp/tests/utilities/cxxopts.hpp b/cpp/tests/utilities/cxxopts.hpp
new file mode 100644
index 00000000000..9a0b6e500d6
--- /dev/null
+++ b/cpp/tests/utilities/cxxopts.hpp
@@ -0,0 +1,1497 @@
+/*
+Copyright (c) 2014, 2015, 2016, 2017 Jarryd Beck
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef CXXOPTS_HPP_INCLUDED
+#define CXXOPTS_HPP_INCLUDED
+
+#include <cctype>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#ifndef CXXOPTS_VECTOR_DELIMITER
+#define CXXOPTS_VECTOR_DELIMITER ','
+#endif
+
+#define CXXOPTS__VERSION_MAJOR 2
+#define CXXOPTS__VERSION_MINOR 2
+#define CXXOPTS__VERSION_PATCH 0
+
+namespace cxxopts {
+static constexpr struct {
+  uint8_t major, minor, patch;
+} version = {CXXOPTS__VERSION_MAJOR, CXXOPTS__VERSION_MINOR, CXXOPTS__VERSION_PATCH};
+}  // namespace cxxopts
+
+// when we ask cxxopts to use Unicode, help strings are processed using ICU,
+// which results in the correct lengths being computed for strings when they
+// are formatted for the help output
+// it is necessary to make sure that <unicode/unistr.h> can be found by the
+// compiler, and that icu-uc is linked in to the binary.
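For reference, a test translation unit built on the fixture and macro above would look roughly like the sketch below. This is a minimal illustration, not part of the diff: the include path, fixture name, and test name are invented; only `cugraph::test::BaseFixture` and `CUGRAPH_TEST_PROGRAM_MAIN()` come from the header added here.

    // Illustrative sketch only -- names and paths are hypothetical.
    #include <utilities/base_fixture.hpp>  // assumed include path

    class ExampleFixture : public cugraph::test::BaseFixture {
    };

    TEST_F(ExampleFixture, UsesConfiguredResource)
    {
      // mr() returns the resource installed by the generated main()
      EXPECT_NE(this->mr(), nullptr);
    }

    // Expands to a main() that installs the RMM resource selected by
    // --rmm_mode ("pool" by default) before running the tests, e.g.:
    //   ./example_test --rmm_mode=managed
    CUGRAPH_TEST_PROGRAM_MAIN()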
+ +#ifdef CXXOPTS_USE_UNICODE +#include + +namespace cxxopts { +typedef icu::UnicodeString String; + +inline String toLocalString(std::string s) { return icu::UnicodeString::fromUTF8(std::move(s)); } + +class UnicodeStringIterator : public std::iterator { + public: + UnicodeStringIterator(const icu::UnicodeString* string, int32_t pos) : s(string), i(pos) {} + + value_type operator*() const { return s->char32At(i); } + + bool operator==(const UnicodeStringIterator& rhs) const { return s == rhs.s && i == rhs.i; } + + bool operator!=(const UnicodeStringIterator& rhs) const { return !(*this == rhs); } + + UnicodeStringIterator& operator++() + { + ++i; + return *this; + } + + UnicodeStringIterator operator+(int32_t v) { return UnicodeStringIterator(s, i + v); } + + private: + const icu::UnicodeString* s; + int32_t i; +}; + +inline String& stringAppend(String& s, String a) { return s.append(std::move(a)); } + +inline String& stringAppend(String& s, int n, UChar32 c) +{ + for (int i = 0; i != n; ++i) { s.append(c); } + + return s; +} + +template +String& stringAppend(String& s, Iterator begin, Iterator end) +{ + while (begin != end) { + s.append(*begin); + ++begin; + } + + return s; +} + +inline size_t stringLength(const String& s) { return s.length(); } + +inline std::string toUTF8String(const String& s) +{ + std::string result; + s.toUTF8String(result); + + return result; +} + +inline bool empty(const String& s) { return s.isEmpty(); } +} // namespace cxxopts + +namespace std { +inline cxxopts::UnicodeStringIterator begin(const icu::UnicodeString& s) +{ + return cxxopts::UnicodeStringIterator(&s, 0); +} + +inline cxxopts::UnicodeStringIterator end(const icu::UnicodeString& s) +{ + return cxxopts::UnicodeStringIterator(&s, s.length()); +} +} // namespace std + +// ifdef CXXOPTS_USE_UNICODE +#else + +namespace cxxopts { +typedef std::string String; + +template +T toLocalString(T&& t) +{ + return std::forward(t); +} + +inline size_t stringLength(const String& s) { return s.length(); } + +inline String& stringAppend(String& s, String a) { return s.append(std::move(a)); } + +inline String& stringAppend(String& s, size_t n, char c) { return s.append(n, c); } + +template +String& stringAppend(String& s, Iterator begin, Iterator end) +{ + return s.append(begin, end); +} + +template +std::string toUTF8String(T&& t) +{ + return std::forward(t); +} + +inline bool empty(const std::string& s) { return s.empty(); } +} // namespace cxxopts + +// ifdef CXXOPTS_USE_UNICODE +#endif + +namespace cxxopts { +namespace { +#ifdef _WIN32 +const std::string LQUOTE("\'"); +const std::string RQUOTE("\'"); +#else +const std::string LQUOTE("‘"); +const std::string RQUOTE("’"); +#endif +} // namespace + +class Value : public std::enable_shared_from_this { + public: + virtual ~Value() = default; + + virtual std::shared_ptr clone() const = 0; + + virtual void parse(const std::string& text) const = 0; + + virtual void parse() const = 0; + + virtual bool has_default() const = 0; + + virtual bool is_container() const = 0; + + virtual bool has_implicit() const = 0; + + virtual std::string get_default_value() const = 0; + + virtual std::string get_implicit_value() const = 0; + + virtual std::shared_ptr default_value(const std::string& value) = 0; + + virtual std::shared_ptr implicit_value(const std::string& value) = 0; + + virtual std::shared_ptr no_implicit_value() = 0; + + virtual bool is_boolean() const = 0; +}; + +class OptionException : public std::exception { + public: + OptionException(const std::string& message) : 
m_message(message) {} + + virtual const char* what() const noexcept { return m_message.c_str(); } + + private: + std::string m_message; +}; + +class OptionSpecException : public OptionException { + public: + OptionSpecException(const std::string& message) : OptionException(message) {} +}; + +class OptionParseException : public OptionException { + public: + OptionParseException(const std::string& message) : OptionException(message) {} +}; + +class option_exists_error : public OptionSpecException { + public: + option_exists_error(const std::string& option) + : OptionSpecException("Option " + LQUOTE + option + RQUOTE + " already exists") + { + } +}; + +class invalid_option_format_error : public OptionSpecException { + public: + invalid_option_format_error(const std::string& format) + : OptionSpecException("Invalid option format " + LQUOTE + format + RQUOTE) + { + } +}; + +class option_syntax_exception : public OptionParseException { + public: + option_syntax_exception(const std::string& text) + : OptionParseException("Argument " + LQUOTE + text + RQUOTE + + " starts with a - but has incorrect syntax") + { + } +}; + +class option_not_exists_exception : public OptionParseException { + public: + option_not_exists_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " does not exist") + { + } +}; + +class missing_argument_exception : public OptionParseException { + public: + missing_argument_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " is missing an argument") + { + } +}; + +class option_requires_argument_exception : public OptionParseException { + public: + option_requires_argument_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " requires an argument") + { + } +}; + +class option_not_has_argument_exception : public OptionParseException { + public: + option_not_has_argument_exception(const std::string& option, const std::string& arg) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + + " does not take an argument, but argument " + LQUOTE + arg + RQUOTE + + " given") + { + } +}; + +class option_not_present_exception : public OptionParseException { + public: + option_not_present_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " not present") + { + } +}; + +class argument_incorrect_type : public OptionParseException { + public: + argument_incorrect_type(const std::string& arg) + : OptionParseException("Argument " + LQUOTE + arg + RQUOTE + " failed to parse") + { + } +}; + +class option_required_exception : public OptionParseException { + public: + option_required_exception(const std::string& option) + : OptionParseException("Option " + LQUOTE + option + RQUOTE + " is required but not present") + { + } +}; + +template +void throw_or_mimic(const std::string& text) +{ + static_assert(std::is_base_of::value, + "throw_or_mimic only works on std::exception and " + "deriving classes"); + +#ifndef CXXOPTS_NO_EXCEPTIONS + // If CXXOPTS_NO_EXCEPTIONS is not defined, just throw + throw T{text}; +#else + // Otherwise manually instantiate the exception, print what() to stderr, + // and abort + T exception{text}; + std::cerr << exception.what() << std::endl; + std::cerr << "Aborting (exceptions disabled)..." 
<< std::endl;
+  std::abort();
+#endif
+}
+
+namespace values {
+namespace {
+std::basic_regex<char> integer_pattern("(-)?(0x)?([0-9a-zA-Z]+)|((0x)?0)");
+std::basic_regex<char> truthy_pattern("(t|T)(rue)?|1");
+std::basic_regex<char> falsy_pattern("(f|F)(alse)?|0");
+}  // namespace
+
+namespace detail {
+template <typename T, bool B>
+struct SignedCheck;
+
+template <typename T>
+struct SignedCheck<T, true> {
+  template <typename U>
+  void operator()(bool negative, U u, const std::string& text)
+  {
+    if (negative) {
+      if (u > static_cast<U>((std::numeric_limits<T>::min)())) {
+        throw_or_mimic<argument_incorrect_type>(text);
+      }
+    } else {
+      if (u > static_cast<U>((std::numeric_limits<T>::max)())) {
+        throw_or_mimic<argument_incorrect_type>(text);
+      }
+    }
+  }
+};
+
+template <typename T>
+struct SignedCheck<T, false> {
+  template <typename U>
+  void operator()(bool, U, const std::string&)
+  {
+  }
+};
+
+template <typename T, typename U>
+void check_signed_range(bool negative, U value, const std::string& text)
+{
+  SignedCheck<T, std::numeric_limits<T>::is_signed>()(negative, value, text);
+}
+}  // namespace detail
+
+template <typename R, typename T>
+R checked_negate(T&& t, const std::string&, std::true_type)
+{
+  // if we got to here, then `t` is a positive number that fits into
+  // `R`. So to avoid MSVC C4146, we first cast it to `R`.
+  // See https://github.com/jarro2783/cxxopts/issues/62 for more details.
+  return static_cast<R>(-static_cast<R>(t - 1) - 1);
+}
+
+template <typename R, typename T>
+T checked_negate(T&& t, const std::string& text, std::false_type)
+{
+  throw_or_mimic<argument_incorrect_type>(text);
+  return t;
+}
+
+template <typename T>
+void integer_parser(const std::string& text, T& value)
+{
+  std::smatch match;
+  std::regex_match(text, match, integer_pattern);
+
+  if (match.length() == 0) { throw_or_mimic<argument_incorrect_type>(text); }
+
+  if (match.length(4) > 0) {
+    value = 0;
+    return;
+  }
+
+  using US = typename std::make_unsigned<T>::type;
+
+  constexpr bool is_signed = std::numeric_limits<T>::is_signed;
+  const bool negative      = match.length(1) > 0;
+  const uint8_t base       = match.length(2) > 0 ? 16 : 10;
+
+  auto value_match = match[3];
+
+  US result = 0;
+
+  for (auto iter = value_match.first; iter != value_match.second; ++iter) {
+    US digit = 0;
+
+    if (*iter >= '0' && *iter <= '9') {
+      digit = static_cast<US>(*iter - '0');
+    } else if (base == 16 && *iter >= 'a' && *iter <= 'f') {
+      digit = static_cast<US>(*iter - 'a' + 10);
+    } else if (base == 16 && *iter >= 'A' && *iter <= 'F') {
+      digit = static_cast<US>(*iter - 'A' + 10);
+    } else {
+      throw_or_mimic<argument_incorrect_type>(text);
+    }
+
+    const US next = static_cast<US>(result * base + digit);
+    if (result > next) { throw_or_mimic<argument_incorrect_type>(text); }
+
+    result = next;
+  }
+
+  detail::check_signed_range<T>(negative, result, text);
+
+  if (negative) {
+    value = checked_negate<T>(result, text, std::integral_constant<bool, is_signed>());
+  } else {
+    value = static_cast<T>(result);
+  }
+}
+
+template <typename T>
+void stringstream_parser(const std::string& text, T& value)
+{
+  std::stringstream in(text);
+  in >> value;
+  if (!in) { throw_or_mimic<argument_incorrect_type>(text); }
+}
+
+inline void parse_value(const std::string& text, uint8_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, int8_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, uint16_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, int16_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, uint32_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, int32_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, uint64_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, int64_t& value) { integer_parser(text, value); }
+
+inline void parse_value(const std::string& text, bool& value)
+{
+  std::smatch result;
+  std::regex_match(text, result, truthy_pattern);
+
+  if (!result.empty()) {
+    value = true;
+    return;
+  }
+
+  std::regex_match(text, result, falsy_pattern);
+  if (!result.empty()) {
+    value = false;
+    return;
+  }
+
+  throw_or_mimic<argument_incorrect_type>(text);
+}
+
+inline void parse_value(const std::string& text, std::string& value) { value = text; }
+
+// The fallback parser. It uses the stringstream parser to parse all types
+// that have not been overloaded explicitly. It has to be placed in the
+// source code before all other more specialized templates.
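The fallback just described is what makes cxxopts extensible: any type without a dedicated `parse_value` overload is routed through `stringstream_parser`, so a user-defined type becomes a valid option type simply by providing `operator>>`. A minimal sketch, with an invented `Celsius` type:

    // Illustrative sketch only -- Celsius is a hypothetical user type.
    #include <istream>

    struct Celsius {
      double degrees;
    };

    std::istream& operator>>(std::istream& in, Celsius& c)
    {
      return in >> c.degrees;  // what stringstream_parser ultimately invokes
    }

    // With this in place, cxxopts::value<Celsius>() can back an option:
    // "--temp=21.5" parses through the generic template that follows, and a
    // malformed value raises argument_incorrect_type via throw_or_mimic.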
+template +void parse_value(const std::string& text, T& value) +{ + stringstream_parser(text, value); +} + +template +void parse_value(const std::string& text, std::vector& value) +{ + std::stringstream in(text); + std::string token; + while (in.eof() == false && std::getline(in, token, CXXOPTS_VECTOR_DELIMITER)) { + T v; + parse_value(token, v); + value.emplace_back(std::move(v)); + } +} + +inline void parse_value(const std::string& text, char& c) +{ + if (text.length() != 1) { throw_or_mimic(text); } + + c = text[0]; +} + +template +struct type_is_container { + static constexpr bool value = false; +}; + +template +struct type_is_container> { + static constexpr bool value = true; +}; + +template +class abstract_value : public Value { + using Self = abstract_value; + + public: + abstract_value() : m_result(std::make_shared()), m_store(m_result.get()) {} + + abstract_value(T* t) : m_store(t) {} + + virtual ~abstract_value() = default; + + abstract_value(const abstract_value& rhs) + { + if (rhs.m_result) { + m_result = std::make_shared(); + m_store = m_result.get(); + } else { + m_store = rhs.m_store; + } + + m_default = rhs.m_default; + m_implicit = rhs.m_implicit; + m_default_value = rhs.m_default_value; + m_implicit_value = rhs.m_implicit_value; + } + + void parse(const std::string& text) const { parse_value(text, *m_store); } + + bool is_container() const { return type_is_container::value; } + + void parse() const { parse_value(m_default_value, *m_store); } + + bool has_default() const { return m_default; } + + bool has_implicit() const { return m_implicit; } + + std::shared_ptr default_value(const std::string& value) + { + m_default = true; + m_default_value = value; + return shared_from_this(); + } + + std::shared_ptr implicit_value(const std::string& value) + { + m_implicit = true; + m_implicit_value = value; + return shared_from_this(); + } + + std::shared_ptr no_implicit_value() + { + m_implicit = false; + return shared_from_this(); + } + + std::string get_default_value() const { return m_default_value; } + + std::string get_implicit_value() const { return m_implicit_value; } + + bool is_boolean() const { return std::is_same::value; } + + const T& get() const + { + if (m_store == nullptr) { + return *m_result; + } else { + return *m_store; + } + } + + protected: + std::shared_ptr m_result; + T* m_store; + + bool m_default = false; + bool m_implicit = false; + + std::string m_default_value; + std::string m_implicit_value; +}; + +template +class standard_value : public abstract_value { + public: + using abstract_value::abstract_value; + + std::shared_ptr clone() const { return std::make_shared>(*this); } +}; + +template <> +class standard_value : public abstract_value { + public: + ~standard_value() = default; + + standard_value() { set_default_and_implicit(); } + + standard_value(bool* b) : abstract_value(b) { set_default_and_implicit(); } + + std::shared_ptr clone() const { return std::make_shared>(*this); } + + private: + void set_default_and_implicit() + { + m_default = true; + m_default_value = "false"; + m_implicit = true; + m_implicit_value = "true"; + } +}; +} // namespace values + +template +std::shared_ptr value() +{ + return std::make_shared>(); +} + +template +std::shared_ptr value(T& t) +{ + return std::make_shared>(&t); +} + +class OptionAdder; + +class OptionDetails { + public: + OptionDetails(const std::string& short_, + const std::string& long_, + const String& desc, + std::shared_ptr val) + : m_short(short_), m_long(long_), m_desc(desc), m_value(val), m_count(0) + { 
+ } + + OptionDetails(const OptionDetails& rhs) : m_desc(rhs.m_desc), m_count(rhs.m_count) + { + m_value = rhs.m_value->clone(); + } + + OptionDetails(OptionDetails&& rhs) = default; + + const String& description() const { return m_desc; } + + const Value& value() const { return *m_value; } + + std::shared_ptr make_storage() const { return m_value->clone(); } + + const std::string& short_name() const { return m_short; } + + const std::string& long_name() const { return m_long; } + + private: + std::string m_short; + std::string m_long; + String m_desc; + std::shared_ptr m_value; + int m_count; +}; + +struct HelpOptionDetails { + std::string s; + std::string l; + String desc; + bool has_default; + std::string default_value; + bool has_implicit; + std::string implicit_value; + std::string arg_help; + bool is_container; + bool is_boolean; +}; + +struct HelpGroupDetails { + std::string name; + std::string description; + std::vector options; +}; + +class OptionValue { + public: + void parse(std::shared_ptr details, const std::string& text) + { + ensure_value(details); + ++m_count; + m_value->parse(text); + } + + void parse_default(std::shared_ptr details) + { + ensure_value(details); + m_default = true; + m_value->parse(); + } + + size_t count() const noexcept { return m_count; } + + // TODO: maybe default options should count towards the number of arguments + bool has_default() const noexcept { return m_default; } + + template + const T& as() const + { + if (m_value == nullptr) { throw_or_mimic("No value"); } + +#ifdef CXXOPTS_NO_RTTI + return static_cast&>(*m_value).get(); +#else + return dynamic_cast&>(*m_value).get(); +#endif + } + + private: + void ensure_value(std::shared_ptr details) + { + if (m_value == nullptr) { m_value = details->make_storage(); } + } + + std::shared_ptr m_value; + size_t m_count = 0; + bool m_default = false; +}; + +class KeyValue { + public: + KeyValue(std::string key_, std::string value_) + : m_key(std::move(key_)), m_value(std::move(value_)) + { + } + + const std::string& key() const { return m_key; } + + const std::string& value() const { return m_value; } + + template + T as() const + { + T result; + values::parse_value(m_value, result); + return result; + } + + private: + std::string m_key; + std::string m_value; +}; + +class ParseResult { + public: + ParseResult( + const std::shared_ptr>>, + std::vector, + bool allow_unrecognised, + int&, + char**&); + + size_t count(const std::string& o) const + { + auto iter = m_options->find(o); + if (iter == m_options->end()) { return 0; } + + auto riter = m_results.find(iter->second); + + return riter->second.count(); + } + + const OptionValue& operator[](const std::string& option) const + { + auto iter = m_options->find(option); + + if (iter == m_options->end()) { throw_or_mimic(option); } + + auto riter = m_results.find(iter->second); + + return riter->second; + } + + const std::vector& arguments() const { return m_sequential; } + + private: + void parse(int& argc, char**& argv); + + void add_to_option(const std::string& option, const std::string& arg); + + bool consume_positional(std::string a); + + void parse_option(std::shared_ptr value, + const std::string& name, + const std::string& arg = ""); + + void parse_default(std::shared_ptr details); + + void checked_parse_arg(int argc, + char* argv[], + int& current, + std::shared_ptr value, + const std::string& name); + + const std::shared_ptr>> m_options; + std::vector m_positional; + std::vector::iterator m_next_positional; + std::unordered_set m_positional_set; + 
std::unordered_map, OptionValue> m_results; + + bool m_allow_unrecognised; + + std::vector m_sequential; +}; + +struct Option { + Option(const std::string& opts, + const std::string& desc, + const std::shared_ptr& value = ::cxxopts::value(), + const std::string& arg_help = "") + : opts_(opts), desc_(desc), value_(value), arg_help_(arg_help) + { + } + + std::string opts_; + std::string desc_; + std::shared_ptr value_; + std::string arg_help_; +}; + +class Options { + typedef std::unordered_map> OptionMap; + + public: + Options(std::string program, std::string help_string = "") + : m_program(std::move(program)), + m_help_string(toLocalString(std::move(help_string))), + m_custom_help("[OPTION...]"), + m_positional_help("positional parameters"), + m_show_positional(false), + m_allow_unrecognised(false), + m_options(std::make_shared()), + m_next_positional(m_positional.end()) + { + } + + Options& positional_help(std::string help_text) + { + m_positional_help = std::move(help_text); + return *this; + } + + Options& custom_help(std::string help_text) + { + m_custom_help = std::move(help_text); + return *this; + } + + Options& show_positional_help() + { + m_show_positional = true; + return *this; + } + + Options& allow_unrecognised_options() + { + m_allow_unrecognised = true; + return *this; + } + + ParseResult parse(int& argc, char**& argv); + + OptionAdder add_options(std::string group = ""); + + void add_options(const std::string& group, std::initializer_list
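The `Options`, `OptionAdder`, and `ParseResult` pieces above are exactly what `parse_test_options` in the new base_fixture.hpp drives. For orientation, a self-contained sketch of that API surface (the program name and the `verbose` option are invented for illustration; `rmm_mode` mirrors the test-fixture usage):

    // Illustrative sketch only -- not part of the diff.
    #include <iostream>
    #include <string>
    // plus the cxxopts.hpp header vendored by this change

    int main(int argc, char** argv)
    {
      cxxopts::Options options("demo", " - cxxopts usage sketch");
      options.allow_unrecognised_options().add_options()(
        "rmm_mode", "RMM allocation mode",
        cxxopts::value<std::string>()->default_value("pool"))(
        "v,verbose", "enable verbose output", cxxopts::value<bool>());

      auto result = options.parse(argc, argv);

      // count() and operator[] come from ParseResult above; the default
      // value is applied even when the option is absent on the command line
      if (result.count("verbose")) { std::cout << "verbose on\n"; }
      std::cout << "rmm_mode = " << result["rmm_mode"].as<std::string>() << "\n";
      return 0;
    }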