diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 776c7ae761..63bc954711 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: diff --git a/BUILD.md b/BUILD.md index e2d54310af..afdcfc86dd 100644 --- a/BUILD.md +++ b/BUILD.md @@ -19,8 +19,8 @@ It is recommended to use conda for environment/package management. If doing so, ```bash conda create -n cuml_dev python=3.10 +conda env update -n cuml_dev --file=conda/environments/all_cuda-118_arch-x86_64.yaml conda activate cuml_dev -conda env update --file=conda/environments/all_cuda-118_arch-x86_64.yaml ``` ## Installing from Source: diff --git a/VERSION b/VERSION new file mode 100644 index 0000000000..a193fff41e --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index bc6b2f23b4..0a6a649fd0 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -9,8 +9,10 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +version=$(rapids-generate-version) + rapids-logger "Begin cpp build" -rapids-conda-retry mambabuild conda/recipes/libcuml +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild conda/recipes/libcuml rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 9bee12371c..1332062770 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -9,6 +9,16 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +package_name="cuml" +package_dir="python" + +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) +export RAPIDS_PACKAGE_VERSION=${version} + +echo "${version}" > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/_version.py" + rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 979a17014b..e4941ad1a8 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,20 +3,51 @@ set -euo pipefail +package_name="cuml" +package_dir="python" + source rapids-configure-sccache source rapids-date-string -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +version=$(rapids-generate-version) +git_commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. +pyproject_file="${package_dir}/pyproject.toml" + +sed -i "/^name.*cuml/ s/= \"cuml\"/= \"cuml${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/_version.py" + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +for dep in cudf pylibraft raft-dask rmm; do + sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} +done + +for dep in dask-cuda rapids-dask-dependency; do + sed -r -i "s/${dep}==(.*)\"/${dep}==\1${alpha_spec}\"/g" ${pyproject_file} +done + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" ${pyproject_file} + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} +fi -cd python/ +cd ${package_dir} SKBUILD_CONFIGURE_OPTIONS="-DCUML_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DDISABLE_DEPRECATION_WARNINGS=ON -DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \ python -m pip wheel . \ diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index fb5971fa5e..0000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# pyproject.toml versions -sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/pyproject.toml - -# pyproject.toml cuda suffixes -sed -i "s/^name = \"cuml\"/name = \"cuml${CUDA_SUFFIX}\"/g" python/pyproject.toml -sed -i "s/cudf/cudf${CUDA_SUFFIX}/g" python/pyproject.toml -sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/pyproject.toml -sed -i "s/raft-dask/raft-dask${CUDA_SUFFIX}/g" python/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" python/pyproject.toml - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/pyproject.toml -fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 4162f19ce9..6e8f43cea7 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -35,9 +35,10 @@ function sed_runner() { } -# __init__.py and pyproject.toml versions -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cuml/__init__.py -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pyproject.toml +# Centralized version file update +echo "${NEXT_FULL_TAG}" > VERSION + +# pyproject.toml versions sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml sed_runner "s/pylibraft==.*\",/pylibraft==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/pyproject.toml @@ -72,6 +73,7 @@ DEPENDENCIES=( librmm pylibraft raft-dask + rapids-dask-dependency rmm ) for FILE in dependencies.yaml conda/environments/*.yaml; do @@ -80,17 +82,13 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do done done -sed_runner "s|/branch-.*?/|/branch-${NEXT_SHORT_TAG}/|g" README.md -sed_runner "s|/branch-.*?/|/branch-${NEXT_SHORT_TAG}/|g" python/README.md +sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" README.md +sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" python/README.md +sed_runner "/- rapids-dask-dependency==/ s/==.*/==${NEXT_SHORT_TAG}\.*/g" python/README.md # Wheel builds clone cumlprims_mg, update its branch sed_runner "s/extra-repo-sha: branch-.*/extra-repo-sha: branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml -# Wheel builds install dask-cuda from source, update its branch -for FILE in .github/workflows/*.yaml; do - sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; -done - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index bf3b3845f7..d1cb6e8e27 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -12,9 +12,6 @@ if [[ "$(arch)" == "aarch64" ]]; then python -m pip install cmake fi -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cuml*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6038499d0c..b650ab412b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,12 +16,9 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - dask-ml -- dask==2023.9.2 -- distributed==2023.9.2 - doxygen=1.9.1 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -53,7 +50,7 @@ dependencies: - numpydoc - nvcc_linux-64=11.8 - pip -- pydata-sphinx-theme +- pydata-sphinx-theme!=0.14.2 - pylibraft==23.12.* - pynndescent==0.5.8 - pytest @@ -63,6 +60,7 @@ dependencies: - pytest-xdist - python>=3.9,<3.11 - raft-dask==23.12.* +- rapids-dask-dependency==23.12.* - recommonmark - rmm==23.12.* - scikit-build>=0.13.1 @@ -77,5 +75,5 @@ dependencies: - treelite==3.9.1 - umap-learn==0.5.3 - pip: - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 2f81709f6e..ffe3e3d0ff 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -18,12 +18,9 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - dask-ml -- dask==2023.9.2 -- distributed==2023.9.2 - doxygen=1.9.1 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -49,7 +46,7 @@ dependencies: - numba>=0.57 - numpydoc - pip -- pydata-sphinx-theme +- pydata-sphinx-theme!=0.14.2 - pylibraft==23.12.* - pynndescent==0.5.8 - pytest @@ -59,6 +56,7 @@ dependencies: - pytest-xdist - python>=3.9,<3.11 - raft-dask==23.12.* +- rapids-dask-dependency==23.12.* - recommonmark - rmm==23.12.* - scikit-build>=0.13.1 @@ -73,5 +71,5 @@ dependencies: - treelite==3.9.1 - umap-learn==0.5.3 - pip: - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml index 515abd8929..3f63d4b3f6 100644 --- a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml +++ b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml @@ -8,8 +8,8 @@ channels: - nvidia dependencies: - c-compiler -- clang-tools==16.0.6 -- clang==16.0.6 +- clang-tools==15.0.7 +- clang==15.0.7 - cmake>=3.26.4 - cuda-version=11.8 - cudatoolkit diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml index d4497a65fb..cb88ac22b7 100644 --- a/conda/recipes/cuml-cpu/meta.yaml +++ b/conda/recipes/cuml-cpu/meta.yaml @@ -2,7 +2,7 @@ # Usage: # conda build . -c conda-forge -c numba -c rapidsai -c pytorch -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set py_version = environ['CONDA_PY'] %} {% set date_string = environ['RAPIDS_DATE_STRING'] %} @@ -11,7 +11,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml index 817776fa13..bcafb63bb6 100644 --- a/conda/recipes/cuml/meta.yaml +++ b/conda/recipes/cuml/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -76,15 +76,13 @@ requirements: - cudf ={{ minor_version }} - cupy >=12.0.0 - dask-cudf ={{ minor_version }} - - dask ==2023.9.2 - - dask-core==2023.9.2 - - distributed ==2023.9.2 - joblib >=0.11 - libcuml ={{ version }} - libcumlprims ={{ minor_version }} - pylibraft ={{ minor_version }} - python x.x - raft-dask ={{ minor_version }} + - rapids-dask-dependency ={{ minor_version }} - treelite {{ treelite_version }} tests: diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml index 34d3521869..b190a39625 100644 --- a/conda/recipes/libcuml/meta.yaml +++ b/conda/recipes/libcuml/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -11,7 +11,7 @@ package: name: libcuml-split source: - git_url: ../../.. + path: ../../.. build: ignore_run_exports_from: diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py index 678534b899..67189573f9 100755 --- a/cpp/scripts/run-clang-tidy.py +++ b/cpp/scripts/run-clang-tidy.py @@ -25,7 +25,7 @@ import tomli -EXPECTED_VERSION = "16.0.6" +EXPECTED_VERSION = "15.0.7" VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") SPACES = re.compile(r"\s+") diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh index e20c4d5d12..4583581d2d 100644 --- a/cpp/src/svm/kernelcache.cuh +++ b/cpp/src/svm/kernelcache.cuh @@ -130,8 +130,16 @@ class BatchCache : public raft::cache::Cache { RAFT_CUDA_TRY(cudaMemsetAsync(tmp_buffer, 0, n_ws * 2 * sizeof(int), stream)); // Init cub buffers - cub::DeviceRadixSort::SortKeys( - NULL, d_temp_storage_size, tmp_buffer, tmp_buffer, n_ws, 0, sizeof(int) * 8, stream); + cub::DeviceRadixSort::SortPairs(NULL, + d_temp_storage_size, + tmp_buffer, + tmp_buffer, + tmp_buffer, + tmp_buffer, + n_ws, + 0, + sizeof(int) * 8, + stream); d_temp_storage.resize(d_temp_storage_size, stream); } diff --git a/dependencies.yaml b/dependencies.yaml index 86307617b4..1a875c75ec 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -95,8 +95,9 @@ dependencies: common: - output_types: [conda, requirements] packages: - - clang==16.0.6 - - clang-tools==16.0.6 + # clang 15 required by libcudacxx. + - clang==15.0.7 + - clang-tools==15.0.7 - ninja - tomli common_build: @@ -175,10 +176,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cudf==23.12.* - - dask==2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - - distributed==2023.9.2 - joblib>=0.11 - numba>=0.57 # TODO: Is scipy really a hard dependency, or should @@ -186,13 +185,11 @@ dependencies: # installation/run_constrained for conda)? - scipy>=1.8.0 - raft-dask==23.12.* + - rapids-dask-dependency==23.12.* - *treelite - output_types: [conda, requirements] packages: - cupy>=12.0.0 - - output_types: conda - packages: - - dask-core==2023.9.2 - output_types: pyproject packages: - *treelite_runtime @@ -294,7 +291,8 @@ dependencies: - ipykernel - nbsphinx - numpydoc - - pydata-sphinx-theme + # https://github.com/pydata/pydata-sphinx-theme/issues/1539 + - pydata-sphinx-theme!=0.14.2 - recommonmark - &scikit_learn scikit-learn==1.2 - sphinx<6 @@ -345,15 +343,13 @@ dependencies: packages: - pip - pip: - # TODO: Figure out what to do with this dependency - # since the repo is now archived. - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 # TODO: remove pin once a release that includes fixes for the error # is released: https://github.com/rapidsai/cuml/issues/5514 - hdbscan<=0.8.30 - output_types: pyproject packages: - - dask-glm @ git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 # TODO: Can we stop pulling from the master branch now that there was a release in October? - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master test_notebooks: diff --git a/docs/source/execution_device_interoperability.ipynb b/docs/source/execution_device_interoperability.ipynb index f267a9198c..1738ee99d0 100644 --- a/docs/source/execution_device_interoperability.ipynb +++ b/docs/source/execution_device_interoperability.ipynb @@ -6,14 +6,16 @@ "source": [ "# cuML on GPU and CPU\n", "\n", - "cuML is a Scikit-learn-based suite of fast, GPU-accelerated machine learning algorithms designed for data science and analytical tasks. Starting with version 23.10, a new version of cuML can also be run on CPU systems, increasing its ease of use (without code changes) in the following manners: \n", + "cuML is a Scikit-learn-like suite of fast, GPU-accelerated machine learning algorithms designed for data science and analytical tasks.\n", "\n", - "- Allow users to prototype in systems without GPUs. \n", - "- Allow library integrations without the need of dispatching and boilerplate code. \n", - "- Allow users to train on one type of system and infer with the other in a subset of estimators (that will grow with each version). \n", - "- Provide compatibility with the GPU/CPU open source pydata ecosystem.\n", + "Starting with version 23.10, cuML provides both GPU-based and CPU-based execution capabilities with zero code change required to switch between them. This unified CPU/GPU cuML: \n", "\n", - "The majority of estimators of cuML can run in both CPU and GPU systems, with a subset of them allowing exporting models between GPU and CPU systems. The following table shows support for the most common estimators: \n", + "- Allows users to prototype in systems without GPUs. \n", + "- Allows library integrations without the need for dispatching and boilerplate code. \n", + "- Allows users to train on one type of system and infer with the other for a subset of estimators (that will expand over time). \n", + "- Provides compatibility with the broader GPU/CPU open source pydata ecosystem.\n", + "\n", + "The majority of estimators of cuML can run in both CPU and GPU systems, with a subset of them supporting exporting models between GPU and CPU systems. The following table shows support for the most common estimators: \n", "\n", "| Category | Algorithm | Supports Execution on CPU | Supports Exporting between CPU and GPU | \n", "| --- | --- | --- | --- |\n", @@ -45,7 +47,9 @@ "| **Time Series** | Holt-Winters Exponential Smoothing | No | No |\n", "| | Auto-regressive Integrated Moving Average (ARIMA) | No | No |\n", "\n", - "This allows the same code to be guaranteed to run in both GPU and CPU systems. Version 23.12 is scheduled to add the following algorithms: Random Forest and Support Vector Machine estimators. \n", + "This allows the same code to be guaranteed to run in both GPU and CPU systems. Version 23.12 is scheduled to add the following algorithms:\n", + "- Random Forest\n", + "- Support Vector Machine estimators\n", "\n" ] }, @@ -57,7 +61,7 @@ "\n", "## Installation\n", "\n", - "For GPU systems, cuML still follows the [RAPIDS requirements] and nothing has changed for installing it. The cuML package and wheels are universal and can run in both GPU and CPU modes. For installing in CPU systems, similar to other packages it can be installed from conda/mamba with:\n", + "For GPU systems, cuML still follows the [RAPIDS requirements](https://rapids.ai/#quick-start). The cuML package and wheels are universal and can run in both GPU and CPU modes. To use cuML in CPU-only systems, you can install using conda/mamba with:\n", "\n", "```bash\n", "mamba install -c rapidsai -c nvidia -c conda-forge cuml-cpu=23.10 \n", @@ -65,7 +69,7 @@ "```\n", "\n", "- cuML 23.10 supports Linux and WSL2 on GPU and CPU systems using conda. \n", - "- cuML 23.12 will bring support for pip wheels and macos support for CPU execution. \n", + "- cuML 23.12 will bring support for pip wheels and MacOS support for CPU execution. \n", "\n", "### How to Use\n", "\n", @@ -73,7 +77,7 @@ "\n", "#### 1. Using CPU Package directly\n", "\n", - "The CPU package, `cuml-cpu` is a subset of the `cuml` package, so besides the difference in installation there is no changes needed to the code of supported estimators to run code. For example, the following script can be run both in a system with GPU and `cuml`, as well as a system without GPU and `cuml-cpu`:" + "The CPU package, `cuml-cpu` is a subset of the `cuml` package, so there are zero code changes required to run the code when using a CPU-only system. For example, the following script can be run both in a system with GPU and `cuml`, as well as a system without GPU and `cuml-cpu`:" ] }, { @@ -110,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This allows to prototype on CPU systems and then run code on GPU servers, or the other way around. Some estimators support training on one type of system and then exporting models to the other type, as can be seen in [the corresponding section](#Cross-Device-Training-and-Inference-Serialization)." + "This allows easy prototyping on CPU systems and running production code on GPU servers, or the other way around. Some estimators support training on one type of system and then exporting models to the other type, as noted above and explained by example in [the corresponding section](#Cross-Device-Training-and-Inference-Serialization)." ] }, { @@ -119,7 +123,7 @@ "source": [ "#### 2. Managing Execution Platform with GPU package\n", "\n", - "Additionally to allowing the same code to be run in CPU systems, users can control which device executes parts of the code. So in addition to the first example that can just be run in a CPU system with `cuml-cpu`, a system with the full cuML can execute in CPU mode as well. \n", + "In addition to allowing the zero-code change execution in CPU systems, users can also manually control which device executes parts of the code when using a system with the full cuML.\n", "\n", "For example, using the following data: " ] @@ -155,7 +159,7 @@ "source": [ "There are two ways to control the execution of the code:\n", "\n", - "#### a) `using_device_type` context manager:" + "#### a) `using_device_type` context manager" ] }, { @@ -177,9 +181,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This allows to prototype but also to run different estimators on different devices, for example in the case where data is small so that moving the data around wouldn't allow the GPU to accelerate an estimator. \n", + "This makes it easy to prototype and run different estimators on different devices, for example in the case where data is small so that moving the data around wouldn't allow the GPU to accelerate an estimator. \n", "\n", - "Additionally, it allows to run estimators using unsupported parameter: " + "It also allows running estimators using unsupported parameters: " ] }, { @@ -201,14 +205,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "An upcoming feature will allow for this to also dispatch automatically. This can be very useful for library integrators, so that if users use parameters not supported on GPUs, the code automatically will dispatch to a CPU implementation. " + "An upcoming feature will allow for this dispatch to occur automatically under-the-hood. This can be very useful for when integrating cuML into other libraries, so that if users use parameters not supported on GPUs, the code automatically will dispatch to a CPU implementation. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### b) Global configuration. " + "#### b) Global configuration with `set_global_device_type`" ] }, { @@ -248,7 +252,7 @@ "source": [ "## Cross Device Training and Inference Serialization\n", "\n", - "As stated before, a subset of the estimators that can be executed on the CPU, also allow to serialize estimators trained on one type of device (CPU or GPU) and then deserialize it on the other one. \n", + "As stated above, a subset of the estimators support training on one type of device (CPU or GPU), serializing the trained model, and then deserializing and executing it on the other type of device. \n", "\n", "To do this, a simple API is provided. For example, To train a model on GPU but deploy it on CPU, first, train the estimator on device and save it to disk:" ] @@ -291,20 +295,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Conclusions\n", + "## Conclusion\n", "\n", - "cuML's CPU capabilities are designed to facilitate different usecases, lower the requirements to use the capabilities of cuML, as well as increasing the flexibility and capabilities of integration and deployment of the library. \n", + "cuML's CPU capabilities are designed to facilitate different use cases, lower the barriers to using the capabilities of cuML, an streamline integrating cuML into other tools and deploying models. \n", "\n", - "Upcoming versions of cuML will increase the supported estimators, both for CPU execution as well as serializing/exporting models between systems with and without GPUs. " + "Upcoming versions of cuML will expand the supported estimators, both for CPU execution as well as serializing/exporting models between systems with and without GPUs. " ] } ], "metadata": { - "interpreter": { - "hash": "35840739db47a5016f18b089945bf3e154a2dca6d71cfb13687d370b69a146e3" - }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.10.12 ('cuml_dev')", "language": "python", "name": "python3" }, @@ -318,11 +319,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" }, "vscode": { "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + "hash": "975233ed6ddd7eb5f50db124c7eb6e9abd7f2428099fbb1c703209662350014b" } } }, diff --git a/python/README.md b/python/README.md index 41a1d366cd..4a19e16e99 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,7 @@ example `setup.py --singlegpu`) are: RAFT's Python and Cython is located in the [RAFT repository](https://github.com/rapidsai/raft/python). It was designed to be included in projects as opposed to be distributed by itself, so at build time, **setup.py creates a symlink from cuML, located in `/python/cuml/raft/` to the Python folder of RAFT**. -For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-23.04/BUILD.md#developer-guide) for recommendations. +For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-23.12/docs/source/build.md) for recommendations. To configure RAFT at build time: @@ -50,7 +50,7 @@ The RAFT Python code gets included in the cuML build and distributable artifacts ### Build Requirements -cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-23.04/environments) includes all dependencies required to build cuML. +cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-23.12/environments) includes all dependencies required to build cuML. To build cuML's Python package, the following dependencies are required: @@ -70,8 +70,7 @@ Packages required for multigpu algorithms*: - ucx-py version matching the cuML version - dask-cudf version matching the cuML version - nccl>=2.5 -- dask==2023.9.2 -- distributed==2023.9.2 +- rapids-dask-dependency==23.12.* * this can be avoided with `--singlegpu` argument flag. diff --git a/python/cuml/VERSION b/python/cuml/VERSION new file mode 120000 index 0000000000..558194c5a5 --- /dev/null +++ b/python/cuml/VERSION @@ -0,0 +1 @@ +../../VERSION \ No newline at end of file diff --git a/python/cuml/__init__.py b/python/cuml/__init__.py index 6c25b5d2d8..62ab93c1b4 100644 --- a/python/cuml/__init__.py +++ b/python/cuml/__init__.py @@ -114,9 +114,7 @@ from cuml.solvers.cd import CD from cuml.solvers.sgd import SGD from cuml.solvers.qn import QN - -# Version configuration -__version__ = "23.12.00" +from cuml._version import __version__, __git_commit__ def __getattr__(name): diff --git a/python/cuml/_version.py b/python/cuml/_version.py new file mode 100644 index 0000000000..87cb6a74d5 --- /dev/null +++ b/python/cuml/_version.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import importlib.resources + +__version__ = ( + importlib.resources.files("cuml").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/python/cuml/benchmark/automated/dask/conftest.py b/python/cuml/benchmark/automated/dask/conftest.py index 4e406ed5a6..8d2bb4e49c 100644 --- a/python/cuml/benchmark/automated/dask/conftest.py +++ b/python/cuml/benchmark/automated/dask/conftest.py @@ -18,6 +18,7 @@ from dask_cuda import initialize from dask_cuda import LocalCUDACluster +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny from dask.distributed import Client enable_tcp_over_ucx = True @@ -28,7 +29,11 @@ @pytest.fixture(scope="module") def cluster(): - cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + cluster = LocalCUDACluster( + protocol="tcp", + scheduler_port=0, + worker_class=IncreasedCloseTimeoutNanny, + ) yield cluster cluster.close() @@ -54,6 +59,7 @@ def ucx_cluster(): enable_tcp_over_ucx=enable_tcp_over_ucx, enable_nvlink=enable_nvlink, enable_infiniband=enable_infiniband, + worker_class=IncreasedCloseTimeoutNanny, ) yield cluster cluster.close() diff --git a/python/cuml/common/__init__.py b/python/cuml/common/__init__.py index 6a46462878..e267bf668b 100644 --- a/python/cuml/common/__init__.py +++ b/python/cuml/common/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,6 +31,7 @@ from cuml.internals.input_utils import input_to_cuml_array from cuml.internals.input_utils import input_to_host_array +from cuml.internals.input_utils import input_to_host_array_with_sparse_support from cuml.internals.memory_utils import rmm_cupy_ary from cuml.internals.memory_utils import set_global_output_type @@ -59,6 +60,7 @@ "has_scipy", "input_to_cuml_array", "input_to_host_array", + "input_to_host_array_with_sparse_support", "rmm_cupy_ary", "set_global_output_type", "using_device_type", diff --git a/python/cuml/common/doc_utils.py b/python/cuml/common/doc_utils.py index 5421bbb6d3..03054f0664 100644 --- a/python/cuml/common/doc_utils.py +++ b/python/cuml/common/doc_utils.py @@ -94,6 +94,8 @@ " Ignored when return_sparse=False.\n" " If True, values in the inverse transform below this parameter\n" " are clipped to 0.", + None: "{name} : None\n" + " Ignored. This parameter exists for compatibility only.", } _parameter_possible_values = [ @@ -222,7 +224,6 @@ def deco(func): if ( "X" in params or "y" in params or parameters ) and not skip_parameters_heading: - func.__doc__ += "\nParameters\n----------\n" # Check if we want to prepend the parameters diff --git a/python/cuml/dask/common/base.py b/python/cuml/dask/common/base.py index 718056e01c..a9949310be 100644 --- a/python/cuml/dask/common/base.py +++ b/python/cuml/dask/common/base.py @@ -36,6 +36,7 @@ np = cpu_only_import("numpy") +dask_cudf = gpu_only_import("dask_cudf") dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") @@ -343,7 +344,7 @@ def _run_parallel_func( if output_futures: return self.client.compute(preds) else: - output = dask.dataframe.from_delayed(preds) + output = dask_cudf.from_delayed(preds) return output if delayed else output.persist() else: raise ValueError( diff --git a/python/cuml/dask/preprocessing/__init__.py b/python/cuml/dask/preprocessing/__init__.py index 17380238ef..f5959467ae 100644 --- a/python/cuml/dask/preprocessing/__init__.py +++ b/python/cuml/dask/preprocessing/__init__.py @@ -13,12 +13,13 @@ # limitations under the License. # +from cuml.dask.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.dask.preprocessing.label import LabelBinarizer -from cuml.dask.preprocessing.encoders import OneHotEncoder from cuml.dask.preprocessing.LabelEncoder import LabelEncoder __all__ = [ "LabelBinarizer", "OneHotEncoder", + "OrdinalEncoder", "LabelEncoder", ] diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 0033f89eca..8bf2503578 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -12,23 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from dask_cudf.core import Series as daskSeries +from collections.abc import Sequence + from cuml.common import with_cupy_rmm +from cuml.dask.common.base import ( + BaseEstimator, + DelayedInverseTransformMixin, + DelayedTransformMixin, +) +from cuml.internals.safe_imports import gpu_only_import_from, gpu_only_import +from dask_cudf.core import Series as daskSeries +from toolz import first -from cuml.dask.common.base import BaseEstimator -from cuml.dask.common.base import DelayedTransformMixin -from cuml.dask.common.base import DelayedInverseTransformMixin +dask_cudf = gpu_only_import("dask_cudf") +dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") -from toolz import first -from collections.abc import Sequence -from cuml.internals.safe_imports import gpu_only_import_from +class DelayedFitTransformMixin: + def fit_transform(self, X, delayed=True): + """Fit the encoder to X, then transform X. Equivalent to fit(X).transform(X). -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") + Parameters + ---------- + X : Dask cuDF DataFrame or CuPy backed Dask Array + The data to encode. + delayed : bool (default = True) + Whether to execute as a delayed task or eager. + + Returns + ------- + out : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the transformed data + """ + return self.fit(X).transform(X, delayed=delayed) class OneHotEncoder( - BaseEstimator, DelayedTransformMixin, DelayedInverseTransformMixin + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, ): """ Encode categorical features as a one-hot numeric array. @@ -83,13 +106,9 @@ class OneHotEncoder( will be denoted as None. """ - def __init__(self, *, client=None, verbose=False, **kwargs): - super().__init__(client=client, verbose=verbose, **kwargs) - @with_cupy_rmm def fit(self, X): - """ - Fit a multi-node multi-gpu OneHotEncoder to X. + """Fit a multi-node multi-gpu OneHotEncoder to X. Parameters ---------- @@ -111,10 +130,9 @@ def fit(self, X): return self - def fit_transform(self, X, delayed=True): - """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). + @with_cupy_rmm + def transform(self, X, delayed=True): + """Transform X using one-hot encoding. Parameters ---------- @@ -126,52 +144,137 @@ def fit_transform(self, X, delayed=True): Returns ------- out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed data + Distributed object containing the transformed input. """ - return self.fit(X).transform(X, delayed=delayed) + return self._transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=self._get_internal_model().dtype, + output_collection_type="cupy", + ) @with_cupy_rmm - def transform(self, X, delayed=True): - """ - Transform X using one-hot encoding. + def inverse_transform(self, X, delayed=True): + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. Parameters ---------- - X : Dask cuDF DataFrame or CuPy backed Dask Array - The data to encode. + X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] + The transformed data. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed input. + X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the inverse transformed array. + """ + dtype = self._get_internal_model().dtype + return self._inverse_transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=dtype, + output_collection_type=self.datatype, + ) + + +class OrdinalEncoder( + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, +): + """Encode categorical features as an integer array. + + The input to this transformer should be an :py:class:`dask_cudf.DataFrame` or a + :py:class:`dask.array.Array` backed by cupy, denoting the unique values taken on by + categorical (discrete) features. The features are converted to ordinal + integers. This results in a single column of integers (0 to n_categories - 1) per + feature. + + Parameters + ---------- + categories : :py:class:`cupy.ndarray` or :py:class`cudf.DataFrameq, default='auto' + Categories (unique values) per feature. All categories are expected to + fit on one GPU. + - 'auto' : Determine categories automatically from the training data. + - DataFrame/ndarray : ``categories[col]`` holds the categories expected + in the feature col. + handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature is + present during transform (default is to raise). When this parameter is set + to 'ignore' and an unknown category is encountered during transform, the + resulting encoded value would be null when output type is cudf + dataframe. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. See + :ref:`verbosity-levels` for more info. + """ + + @with_cupy_rmm + def fit(self, X): + """Fit Ordinal to X. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or a CuPy backed :py:class:`dask.array.Array`. + shape = (n_samples, n_features) The data to determine the categories of each + feature. + + Returns + ------- + self + """ + from cuml.preprocessing.ordinalencoder_mg import OrdinalEncoderMG + + el = first(X) if isinstance(X, Sequence) else X + self.datatype = ( + "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + ) + + self._set_internal_model(OrdinalEncoderMG(**self.kwargs).fit(X)) + + return self + + @with_cupy_rmm + def transform(self, X, delayed=True): + """Transform X using ordinal encoding. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. The data to + encode. + + Returns + ------- + X_out : + Transformed input. """ return self._transform( X, n_dims=2, delayed=delayed, output_dtype=self._get_internal_model().dtype, - output_collection_type="cupy", + output_collection_type=self.datatype, ) @with_cupy_rmm def inverse_transform(self, X, delayed=True): - """ - Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. Parameters ---------- - X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] - The transformed data. + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + X_tr : Distributed object containing the inverse transformed array. """ dtype = self._get_internal_model().dtype diff --git a/python/cuml/feature_extraction/_vectorizers.py b/python/cuml/feature_extraction/_vectorizers.py index 78172ec690..0133195b20 100644 --- a/python/cuml/feature_extraction/_vectorizers.py +++ b/python/cuml/feature_extraction/_vectorizers.py @@ -598,7 +598,9 @@ def fit_transform(self, raw_documents, y=None): if self._fixed_vocabulary: self.vocabulary_ = self.vocabulary else: - self.vocabulary_ = tokenized_df["token"].unique().sort_values() + self.vocabulary_ = ( + tokenized_df["token"].drop_duplicates().sort_values() + ) count_df = self._count_vocab(tokenized_df) diff --git a/python/cuml/internals/base.pyx b/python/cuml/internals/base.pyx index 4fb03fdac9..c00ed17f98 100644 --- a/python/cuml/internals/base.pyx +++ b/python/cuml/internals/base.pyx @@ -28,6 +28,12 @@ from cuml.internals.safe_imports import ( np = cpu_only_import('numpy') nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator) +try: + from sklearn.utils import estimator_html_repr +except ImportError: + estimator_html_repr = None + + import cuml import cuml.common import cuml.internals.logger as logger @@ -443,6 +449,13 @@ class Base(TagsMixin, return {'preserves_dtype': [self.dtype]} return {} + def _repr_mimebundle_(self, **kwargs): + """Prepare representations used by jupyter kernels to display estimator""" + if estimator_html_repr is not None: + output = {"text/plain": repr(self)} + output["text/html"] = estimator_html_repr(self) + return output + def set_nvtx_annotations(self): for func_name in ['fit', 'transform', 'predict', 'fit_transform', 'fit_predict']: diff --git a/python/cuml/internals/input_utils.py b/python/cuml/internals/input_utils.py index bb9e8bc3e3..edcbffabaa 100644 --- a/python/cuml/internals/input_utils.py +++ b/python/cuml/internals/input_utils.py @@ -497,6 +497,20 @@ def input_to_host_array( return out_data._replace(array=out_data.array.to_output("numpy")) +def input_to_host_array_with_sparse_support(X): + _array_type, is_sparse = determine_array_type_full(X) + if is_sparse: + if _array_type == "cupy": + return SparseCumlArray(X).to_output(output_type="scipy") + elif _array_type == "cuml": + return X.to_output(output_type="scipy") + elif _array_type == "numpy": + return X + else: + raise ValueError(f"Unsupported sparse array type: {_array_type}.") + return input_to_host_array(X).array + + def convert_dtype(X, to_dtype=np.float32, legacy=True, safe_dtype=True): """ Convert X to be of dtype `dtype`, raising a TypeError diff --git a/python/cuml/model_selection/_split.py b/python/cuml/model_selection/_split.py index 21800959fd..cb58db4f5f 100644 --- a/python/cuml/model_selection/_split.py +++ b/python/cuml/model_selection/_split.py @@ -13,12 +13,15 @@ # limitations under the License. # -from typing import Union -from cuml.internals.safe_imports import gpu_only_import_from +from typing import Optional, Union + from cuml.common import input_to_cuml_array from cuml.internals.array import array_to_memory_order -from cuml.internals.safe_imports import cpu_only_import -from cuml.internals.safe_imports import gpu_only_import +from cuml.internals.safe_imports import ( + cpu_only_import, + gpu_only_import, + gpu_only_import_from, +) cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") @@ -138,7 +141,6 @@ def _stratify_split( if hasattr(X, "__cuda_array_interface__") or isinstance( X, cupyx.scipy.sparse.csr_matrix ): - X_train_i = cp.array( X[perm_indices_class_i[: n_i[i]]], order=x_order ) @@ -244,11 +246,11 @@ def _approximate_mode(class_counts, n_draws, rng): def train_test_split( X, y=None, - test_size: Union[float, int] = None, - train_size: Union[float, int] = None, + test_size: Optional[Union[float, int]] = None, + train_size: Optional[Union[float, int]] = None, shuffle: bool = True, - random_state: Union[ - int, cp.random.RandomState, np.random.RandomState + random_state: Optional[ + Union[int, cp.random.RandomState, np.random.RandomState] ] = None, stratify=None, ): diff --git a/python/cuml/multiclass/multiclass.py b/python/cuml/multiclass/multiclass.py index e97de7256b..65b378a17b 100644 --- a/python/cuml/multiclass/multiclass.py +++ b/python/cuml/multiclass/multiclass.py @@ -20,7 +20,15 @@ from cuml.internals.import_utils import has_sklearn from cuml.internals.mixins import ClassifierMixin from cuml.common.doc_utils import generate_docstring -from cuml.common import input_to_host_array +from cuml.common import ( + input_to_host_array, + input_to_host_array_with_sparse_support, +) +from cuml.internals.input_utils import ( + input_to_cupy_array, + determine_array_type_full, +) +from cuml.internals.array_sparse import SparseCumlArray from cuml.internals import _deprecate_pos_args @@ -142,7 +150,9 @@ def fit(self, X, y) -> "MulticlassClassifier": + ", must be one of " '{"ovr", "ovo"}' ) - X = input_to_host_array(X).array + + X = input_to_host_array_with_sparse_support(X) + y = input_to_host_array(y).array with cuml.internals.exit_internal_api(): self.multiclass_estimator.fit(X, y) @@ -160,7 +170,8 @@ def predict(self, X) -> CumlArray: """ Predict using multi class classifier. """ - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) + with cuml.internals.exit_internal_api(): return self.multiclass_estimator.predict(X) @@ -177,7 +188,7 @@ def decision_function(self, X) -> CumlArray: """ Calculate the decision function. """ - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) with cuml.internals.exit_internal_api(): return self.multiclass_estimator.decision_function(X) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index 882e552511..aceed2766a 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -171,19 +171,19 @@ def fit(self, y, _classes=None): A fitted instance of itself to allow method chaining """ - if _classes is None: - y = self._to_cudf_series(y) - self._validate_keywords() - self.dtype = y.dtype if y.dtype != cp.dtype("O") else str - if _classes is not None: - self.classes_ = _classes - else: - self.classes_ = y.unique().sort_values( - ignore_index=True + if _classes is None: + y = ( + self._to_cudf_series(y) + .drop_duplicates() + .sort_values(ignore_index=True) ) # dedupe and sort + self.classes_ = y + else: + self.classes_ = _classes + self.dtype = y.dtype if y.dtype != cp.dtype("O") else str self._fitted = True return self diff --git a/python/cuml/preprocessing/__init__.py b/python/cuml/preprocessing/__init__.py index 368c570b09..fc07aba50c 100644 --- a/python/cuml/preprocessing/__init__.py +++ b/python/cuml/preprocessing/__init__.py @@ -16,7 +16,7 @@ from cuml.model_selection import train_test_split from cuml.preprocessing.LabelEncoder import LabelEncoder from cuml.preprocessing.label import LabelBinarizer, label_binarize -from cuml.preprocessing.encoders import OneHotEncoder +from cuml.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.preprocessing.TargetEncoder import TargetEncoder from cuml.preprocessing import text @@ -63,6 +63,7 @@ "MissingIndicator", "Normalizer", "OneHotEncoder", + "OrdinalEncoder", "PolynomialFeatures", "PowerTransformer", "QuantileTransformer", diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 32a8defc69..272655b552 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -13,31 +13,165 @@ # limitations under the License. # import warnings +from typing import List, Optional, TypeVar + import cuml.internals.logger as logger -from cuml.internals.safe_imports import gpu_only_import_from from cudf import DataFrame, Series -from cuml.preprocessing import LabelEncoder from cuml import Base +from cuml.common.doc_utils import generate_docstring from cuml.common.exceptions import NotFittedError -from cuml.internals.safe_imports import gpu_only_import -from cuml.internals.safe_imports import cpu_only_import +from cuml.internals.safe_imports import ( + cpu_only_import, + gpu_only_import, + gpu_only_import_from, +) +from cuml.preprocessing import LabelEncoder np = cpu_only_import("numpy") +cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") cupyx = gpu_only_import("cupyx") GenericIndex = gpu_only_import_from("cudf", "GenericIndex") -class OneHotEncoder(Base): +class CheckFeaturesMixIn: + def _check_n_features(self, X, reset: bool = False): + n_features = X.shape[1] + if reset: + self.n_features_in_ = n_features + if hasattr(X, "columns"): + self.feature_names_in_ = [str(c) for c in X.columns] + else: + if not hasattr(self, "n_features_in_"): + raise RuntimeError( + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" + ) + if n_features != self.n_features_in_: + raise ValueError( + "X has {} features, but this {} is expecting {} features " + "as input.".format( + n_features, + self.__class__.__name__, + self.n_features_in_, + ) + ) + + +class BaseEncoder(Base, CheckFeaturesMixIn): + """Base implementation for encoding categorical values, uses + :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + + Parameters + ---------- + + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for + computations in this model. Most importantly, this specifies the CUDA + stream that will be used for the model's computations, so users can + run different models concurrently in different streams by creating + handles in several streams. + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. + See :ref:`verbosity-levels` for more info. + output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. + """ + + def _set_input_type(self, value): + if self.input_type is None: + self.input_type = value + + def _check_input(self, X, is_categories=False): + """If input is cupy, convert it to a DataFrame with 0 copies.""" + if isinstance(X, cp.ndarray): + self._set_input_type("array") + if is_categories: + X = X.transpose() + return DataFrame(X) + else: + self._set_input_type("df") + return X + + def _check_input_fit(self, X, is_categories=False): + """Helper function used in fit, can be overridden in subclasses.""" + self._check_n_features(X, reset=True) + return self._check_input(X, is_categories=is_categories) + + def _unique(self, inp): + """Helper function used in fit. Can be overridden in subclasses.""" + + # Default implementation passes input through directly since this is + # performed in `LabelEncoder.fit()` + return inp + + def _fit(self, X, need_drop: bool): + X = self._check_input_fit(X) + if type(self.categories) is str and self.categories == "auto": + self._features = X.columns + self._encoders = { + feature: LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ).fit(self._unique(X[feature])) + for feature in self._features + } + else: + self.categories = self._check_input_fit(self.categories, True) + self._features = self.categories.columns + if len(self._features) != X.shape[1]: + raise ValueError( + "Shape mismatch: if categories is not 'auto'," + " it has to be of shape (n_features, _)." + ) + self._encoders = dict() + for feature in self._features: + le = LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ) + + self._encoders[feature] = le.fit(self.categories[feature]) + + if self.handle_unknown == "error": + if self._has_unknown( + X[feature], self._encoders[feature].classes_ + ): + msg = ( + "Found unknown categories in column {0}" + " during fit".format(feature) + ) + raise KeyError(msg) + + if need_drop: + self.drop_idx_ = self._compute_drop_idx() + self._fitted = True + + @property + def categories_(self): + """Returns categories used for the one hot encoding in the correct order.""" + return [self._encoders[f].classes_ for f in self._features] + + +class OneHotEncoder(BaseEncoder): """ Encode categorical features as a one-hot numeric array. - The input to this estimator should be a cuDF.DataFrame or a cupy.ndarray, - denoting the unique values taken on by categorical (discrete) features. - The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') - encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array (depending on the ``sparse`` - parameter). + The input to this estimator should be a :py:class:`cuDF.DataFrame` or a + :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical + (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or + 'dummy') encoding scheme. This creates a binary column for each category and returns + a sparse matrix or dense array (depending on the ``sparse`` parameter). + By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. @@ -105,7 +239,6 @@ class OneHotEncoder(Base): ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. - """ def __init__( @@ -165,7 +298,7 @@ def _check_is_fitted(self): raise NotFittedError(msg) def _compute_drop_idx(self): - """Helper to compute indices to drop from category to drop""" + """Helper to compute indices to drop from category to drop.""" if self.drop is None: return None elif isinstance(self.drop, str) and self.drop == "first": @@ -209,141 +342,46 @@ def _compute_drop_idx(self): ) raise ValueError(msg.format(type(self.drop))) - @property - def categories_(self): - """ - Returns categories used for the one hot encoding in the correct order. - """ - return [self._encoders[f].classes_ for f in self._features] - - def _set_input_type(self, value): - if self.input_type is None: - self.input_type = value - - def _check_input(self, X, is_categories=False): - """ - If input is cupy, convert it to a DataFrame with 0 copies - """ - if isinstance(X, cp.ndarray): - self._set_input_type("array") - if is_categories: - X = X.transpose() - return DataFrame(X) - else: - self._set_input_type("df") - return X - def _check_input_fit(self, X, is_categories=False): """Helper function used in fit. Can be overridden in subclasses.""" return self._check_input(X, is_categories=is_categories) - def _unique(self, inp): - """Helper function used in fit. Can be overridden in subclasses.""" - - # Default implementation passes input through directly since this is - # performed in `LabelEncoder.fit()` - return inp - def _has_unknown(self, X_cat, encoder_cat): - """Check if X_cat has categories that are not present in encoder_cat""" + """Check if X_cat has categories that are not present in encoder_cat.""" return not X_cat.isin(encoder_cat).all() + @generate_docstring(y=None) def fit(self, X, y=None): - """ - Fit OneHotEncoder to X. - - Parameters - ---------- - X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to determine the categories of each feature. - y : None - Ignored. This parameter exists for compatibility only. - - Returns - ------- - self - - """ + """Fit OneHotEncoder to X.""" self._validate_keywords() - X = self._check_input_fit(X) - if type(self.categories) is str and self.categories == "auto": - self._features = X.columns - self._encoders = { - feature: LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ).fit(self._unique(X[feature])) - for feature in self._features - } - else: - self.categories = self._check_input_fit(self.categories, True) - self._features = self.categories.columns - if len(self._features) != X.shape[1]: - raise ValueError( - "Shape mismatch: if categories is not 'auto'," - " it has to be of shape (n_features, _)." - ) - self._encoders = dict() - for feature in self._features: - - le = LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ) - - self._encoders[feature] = le.fit(self.categories[feature]) - - if self.handle_unknown == "error": - if self._has_unknown( - X[feature], self._encoders[feature].classes_ - ): - msg = ( - "Found unknown categories in column {0}" - " during fit".format(feature) - ) - raise KeyError(msg) - - self.drop_idx_ = self._compute_drop_idx() - self._fitted = True + self._fit(X, True) return self + @generate_docstring( + y=None, + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + }, + ) def fit_transform(self, X, y=None): """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + Fit OneHotEncoder to X, then transform X. Equivalent to fit(X).transform(X). """ X = self._check_input(X) return self.fit(X).transform(X) + @generate_docstring( + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + } + ) def transform(self, X): - """ - Transform X using one-hot encoding. - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ + """Transform X using one-hot encoding.""" self._check_is_fitted() X = self._check_input(X) @@ -425,10 +463,9 @@ def transform(self, X): ) def inverse_transform(self, X): - """ - Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. The return type is the same as the type of the input used by the first call to fit on this estimator instance. @@ -544,3 +581,165 @@ def get_param_names(self): "dtype", "handle_unknown", ] + + +def _slice_feat(X, i): + if hasattr(X, "iloc"): + return X[i] + return X[:, i] + + +def _get_output( + output_type: Optional[str], + input_type: Optional[str], + out: DataFrame, + dtype, +): + if output_type == "input": + if input_type == "array": + output_type = "cupy" + elif input_type == "df": + output_type = "cudf" + + if output_type is None: + output_type = "cupy" + + if output_type == "cudf": + return out + elif output_type == "cupy": + return out.astype(dtype).to_cupy(na_value=np.nan) + elif output_type == "numpy": + return cp.asnumpy(out.to_cupy(na_value=np.nan, dtype=dtype)) + elif output_type == "pandas": + return out.to_pandas() + else: + raise ValueError("Unsupported output type.") + + +class OrdinalEncoder(BaseEncoder): + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + handle=None, + verbose=False, + output_type=None, + ) -> None: + """Encode categorical features as an integer array. + + The input to this transformer should be an :py:class:`cudf.DataFrame` or a + :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical + (discrete) features. The features are converted to ordinal integers. This + results in a single column of integers (0 to n_categories - 1) per feature. + + Parameters + ---------- + categories : 'auto' an cupy.ndarray or a cudf.DataFrame, default='auto' + Categories (unique values) per feature: + - 'auto' : Determine categories automatically from the training data. + - DataFrame/ndarray : ``categories[col]`` holds the categories expected + in the feature col. + handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature is + present during transform (default is to raise). When this parameter is set + to 'ignore' and an unknown category is encountered during transform, the + resulting encoded value would be null when output type is cudf + dataframe. + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for computations in + this model. Most importantly, this specifies the CUDA stream that will be + used for the model's computations, so users can run different models + concurrently in different streams by creating handles in several streams. + + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. See + :ref:`verbosity-levels` for more info. + output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. + """ + super().__init__( + handle=handle, verbose=verbose, output_type=output_type + ) + + self.categories = categories + self.dtype = dtype + self.handle_unknown = handle_unknown + + self.input_type = None + + @generate_docstring(y=None) + def fit(self, X, y=None) -> "OrdinalEncoder": + """Fit Ordinal to X.""" + self._fit(X, need_drop=False) + return self + + @generate_docstring( + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "Type is specified by the `output_type` parameter.", + } + ) + def transform(self, X): + """Transform X using ordinal encoding.""" + self._check_n_features(X, reset=False) + + result = {} + for feature in self._features: + Xi = _slice_feat(X, feature) + col_idx = self._encoders[feature].transform(Xi) + result[feature] = col_idx + + r = DataFrame(result) + return _get_output(self.output_type, self.input_type, r, self.dtype) + + @generate_docstring( + y=None, + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "Type is specified by the `output_type` parameter.", + }, + ) + def fit_transform(self, X, y=None): + """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X).""" + X = self._check_input(X) + return self.fit(X).transform(X) + + def inverse_transform(self, X): + """Convert the data back to the original representation. + + Parameters + ---------- + X : array-like or sparse matrix, shape [n_samples, n_encoded_features] + The transformed data. + + Returns + ------- + X_tr : Type is specified by the `output_type` parameter. + Inverse transformed array. + """ + self._check_n_features(X, reset=False) + + result = {} + for feature in self._features: + Xi = _slice_feat(X, feature) + inv = self._encoders[feature].inverse_transform(Xi) + result[feature] = inv + + r = DataFrame(result) + return _get_output(self.output_type, self.input_type, r, self.dtype) + + def get_param_names(self): + return super().get_param_names() + [ + "categories", + "dtype", + "handle_unknown", + ] diff --git a/python/cuml/preprocessing/ordinalencoder_mg.py b/python/cuml/preprocessing/ordinalencoder_mg.py new file mode 100644 index 0000000000..8b47f67819 --- /dev/null +++ b/python/cuml/preprocessing/ordinalencoder_mg.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import cupy as cp +import dask +from cuml.dask.common.dask_arr_utils import to_dask_cudf +from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from +from cuml.preprocessing.encoders import OrdinalEncoder + +cp = gpu_only_import("cupy") +DataFrame = gpu_only_import_from("cudf", "DataFrame") + + +class OrdinalEncoderMG(OrdinalEncoder): + def __init__(self, *, client=None, **kwargs): + super().__init__(**kwargs) + self.client = client + + def _check_input_fit(self, X, is_categories=False): + """Helper function to check input of fit within the multi-gpu model""" + if isinstance(X, (dask.array.core.Array, cp.ndarray)): + self._set_input_type("array") + if is_categories: + X = X.transpose() + if isinstance(X, cp.ndarray): + return DataFrame(X) + else: + return to_dask_cudf(X, client=self.client) + else: + self._set_input_type("df") + return X + + def _unique(self, inp): + return inp.unique().compute() + + def _has_unknown(self, X_cat, encoder_cat): + return not X_cat.isin(encoder_cat).all().compute() diff --git a/python/cuml/svm/linear.pyx b/python/cuml/svm/linear.pyx index a0bcf5d3a5..470e867f60 100644 --- a/python/cuml/svm/linear.pyx +++ b/python/cuml/svm/linear.pyx @@ -300,8 +300,8 @@ cdef class LinearSVMWrapper: if self.dtype != np.float32 and self.dtype != np.float64: raise TypeError('Input data type must be float32 or float64') - cdef uintptr_t Xptr = X.ptr - cdef uintptr_t yptr = y.ptr + cdef uintptr_t Xptr = X.ptr if X is not None else 0 + cdef uintptr_t yptr = y.ptr if y is not None else 0 cdef uintptr_t swptr = sampleWeight.ptr \ if sampleWeight is not None else 0 cdef size_t nCols = 0 diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx index 2fff2672fd..d5d5c35e3e 100644 --- a/python/cuml/svm/svc.pyx +++ b/python/cuml/svm/svc.pyx @@ -35,7 +35,7 @@ from cuml.common.doc_utils import generate_docstring from cuml.internals.logger import warn from pylibraft.common.handle cimport handle_t from pylibraft.common.interruptible import cuda_interruptible -from cuml.common import input_to_cuml_array, input_to_host_array +from cuml.common import input_to_cuml_array, input_to_host_array, input_to_host_array_with_sparse_support from cuml.internals.input_utils import input_to_cupy_array, determine_array_type_full from cuml.preprocessing import LabelEncoder from libcpp cimport nullptr @@ -449,7 +449,7 @@ class SVC(SVMBase, # Currently CalibratedClassifierCV expects data on the host, see # https://github.com/rapidsai/cuml/issues/2608 - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) y = input_to_host_array(y).array if not has_sklearn(): @@ -485,8 +485,6 @@ class SVC(SVMBase, return self._fit_proba(X, y, sample_weight) if self.n_classes_ > 2: - if is_sparse: - raise ValueError("Multiclass SVM does not support sparse input.") return self._fit_multiclass(X, y, sample_weight) if is_sparse: @@ -594,7 +592,7 @@ class SVC(SVMBase, if self.probability: self._check_is_fitted('prob_svc') - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) with cuml.internals.exit_internal_api(): preds = self.prob_svc.predict(X) @@ -628,7 +626,7 @@ class SVC(SVMBase, if self.probability: self._check_is_fitted('prob_svc') - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) # Exit the internal API when calling sklearn code (forces numpy # conversion) diff --git a/python/cuml/testing/strategies.py b/python/cuml/testing/strategies.py index 39b2068e5d..a8f849866a 100644 --- a/python/cuml/testing/strategies.py +++ b/python/cuml/testing/strategies.py @@ -187,9 +187,7 @@ def create_cuml_array_input(input_type, dtype, shape, order): input_type = "cupy" if input_type is None else input_type - multidimensional = ( - isinstance(shape, tuple) and len([d for d in shape if d > 1]) > 1 - ) + multidimensional = isinstance(shape, tuple) and len(shape) > 1 assume( not ( input_type == "series" diff --git a/python/cuml/tests/dask/conftest.py b/python/cuml/tests/dask/conftest.py index 29f09a44c9..3c6311dc03 100644 --- a/python/cuml/tests/dask/conftest.py +++ b/python/cuml/tests/dask/conftest.py @@ -4,6 +4,7 @@ from dask_cuda import initialize from dask_cuda import LocalCUDACluster +from dask_cuda.utils_test import IncreasedCloseTimeoutNanny from dask.distributed import Client enable_tcp_over_ucx = True @@ -14,7 +15,11 @@ @pytest.fixture(scope="module") def cluster(): - cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0) + cluster = LocalCUDACluster( + protocol="tcp", + scheduler_port=0, + worker_class=IncreasedCloseTimeoutNanny, + ) yield cluster cluster.close() @@ -40,6 +45,7 @@ def ucx_cluster(): enable_tcp_over_ucx=enable_tcp_over_ucx, enable_nvlink=enable_nvlink, enable_infiniband=enable_infiniband, + worker_class=IncreasedCloseTimeoutNanny, ) yield cluster cluster.close() diff --git a/python/cuml/tests/dask/test_dask_ordinal_encoder.py b/python/cuml/tests/dask/test_dask_ordinal_encoder.py new file mode 100644 index 0000000000..36b5fa92d3 --- /dev/null +++ b/python/cuml/tests/dask/test_dask_ordinal_encoder.py @@ -0,0 +1,117 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import cupy as cp +import dask_cudf +import numpy as np +import pandas as pd +import pytest +from cudf import DataFrame +from cuml.dask.preprocessing import OrdinalEncoder +from distributed import Client + + +@pytest.mark.mg +def test_ordinal_encoder_df(client: Client) -> None: + X = DataFrame({"cat": ["M", "F", "F"], "int": [1, 3, 2]}) + X = dask_cudf.from_cudf(X, npartitions=2) + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"cat": ["F", "F"], "int": [1, 2]}) + X_1 = dask_cudf.from_cudf(X_1, npartitions=2) + + enc = OrdinalEncoder(client=client) + enc.fit(X) + Xt_1 = enc.transform(X_1) + + Xt_r = Xt.compute() + Xt_1_r = Xt_1.compute() + assert Xt_1_r.iloc[0, 0] == Xt_r.iloc[1, 0] + assert Xt_1_r.iloc[1, 0] == Xt_r.iloc[1, 0] + assert Xt_1_r.iloc[0, 1] == Xt_r.iloc[0, 1] + assert Xt_1_r.iloc[1, 1] == Xt_r.iloc[2, 1] + + # Turn Int64Index to RangeIndex for testing equality + inv_Xt = enc.inverse_transform(Xt).compute().reset_index(drop=True) + inv_Xt_1 = enc.inverse_transform(Xt_1).compute().reset_index(drop=True) + + X_r = X.compute() + X_1_r = X_1.compute() + + assert inv_Xt.equals(X_r) + assert inv_Xt_1.equals(X_1_r) + + assert enc.n_features_in_ == 2 + + +@pytest.mark.mg +def test_ordinal_encoder_array(client: Client) -> None: + X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]}) + X = dask_cudf.from_cudf(X, npartitions=2).values + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"A": [1, 1], "B": [1, 2]}) + X_1 = dask_cudf.from_cudf(X_1, npartitions=2).values + + enc = OrdinalEncoder(client=client) + enc.fit(X) + Xt_1 = enc.transform(X_1) + + Xt_r = Xt.compute() + Xt_1_r = Xt_1.compute() + assert Xt_1_r[0, 0] == Xt_r[1, 0] + assert Xt_1_r[1, 0] == Xt_r[1, 0] + assert Xt_1_r[0, 1] == Xt_r[0, 1] + assert Xt_1_r[1, 1] == Xt_r[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + inv_Xt_1 = enc.inverse_transform(Xt_1) + + cp.testing.assert_allclose(X.compute(), inv_Xt.compute()) + cp.testing.assert_allclose(X_1.compute(), inv_Xt_1.compute()) + + assert enc.n_features_in_ == 2 + + +@pytest.mark.mg +@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) +def test_handle_unknown(client, as_array: bool) -> None: + X = DataFrame({"data": [0, 1]}) + Y = DataFrame({"data": [3, 1]}) + + X = dask_cudf.from_cudf(X, npartitions=2) + Y = dask_cudf.from_cudf(Y, npartitions=2) + + if as_array: + X = X.values + Y = Y.values + + enc = OrdinalEncoder(handle_unknown="error") + enc = enc.fit(X) + with pytest.raises(KeyError): + enc.transform(Y).compute() + + enc = OrdinalEncoder(handle_unknown="ignore") + enc = enc.fit(X) + encoded = enc.transform(Y).compute() + if as_array: + np.isnan(encoded[0, 0]) + else: + assert pd.isna(encoded.iloc[0, 0]) diff --git a/python/cuml/tests/test_array.py b/python/cuml/tests/test_array.py index e2e02367c3..c4c479506c 100644 --- a/python/cuml/tests/test_array.py +++ b/python/cuml/tests/test_array.py @@ -201,10 +201,10 @@ def test_array_init_from_bytes(data_type, dtype, shape, order, mem_type): mem_type=cuml_array_mem_types(), ) @settings(deadline=None) -def test_array_init_bad(input_type, dtype, shape, order, mem_type): +def test_array_mem_type(input_type, dtype, shape, order, mem_type): """ - This test ensures that we assert on incorrect combinations of arguments - when creating CumlArray + Test whether we can create CumlArray from all supported types and array + shapes on all supported mem types. """ mem_type = MemoryType.from_str(mem_type) @@ -214,13 +214,6 @@ def test_array_init_bad(input_type, dtype, shape, order, mem_type): # Ensure the array is creatable array = CumlArray(input_array) - with pytest.raises(ValueError): - bad_dtype = np.float16 if dtype != np.float16 else np.float32 - CumlArray(input_array, dtype=bad_dtype) - - with pytest.raises(ValueError): - CumlArray(input_array, shape=(*array.shape, 1)) - input_mem_type = determine_array_memtype(input_array) if input_mem_type.is_device_accessible: joint_mem_type = input_mem_type diff --git a/python/cuml/tests/test_input_utils.py b/python/cuml/tests/test_input_utils.py index a3aef28446..fbbcde2105 100644 --- a/python/cuml/tests/test_input_utils.py +++ b/python/cuml/tests/test_input_utils.py @@ -396,7 +396,7 @@ def get_input( result = cudf.DataFrame(rand_mat, index=index) if type == "cudf-series": - result = cudf.Series(rand_mat, index=index) + result = cudf.Series(rand_mat.reshape(nrows), index=index) if type == "pandas": result = pdDF(cp.asnumpy(rand_mat), index=index) diff --git a/python/cuml/tests/test_ordinal_encoder.py b/python/cuml/tests/test_ordinal_encoder.py new file mode 100644 index 0000000000..c9379a43be --- /dev/null +++ b/python/cuml/tests/test_ordinal_encoder.py @@ -0,0 +1,133 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import numpy as np +import pandas as pd +import pytest +from sklearn.preprocessing import OrdinalEncoder as skOrdinalEncoder + +from cuml.internals.safe_imports import gpu_only_import_from +from cuml.preprocessing import OrdinalEncoder + +DataFrame = gpu_only_import_from("cudf", "DataFrame") + + +@pytest.fixture +def test_sample(): + X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]}) + return X + + +def test_ordinal_encoder_df(test_sample) -> None: + X = test_sample + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"cat": ["F", "F"], "num": [1, 2]}) + Xt_1 = enc.transform(X_1) + + assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[1, 0] == Xt.iloc[1, 0] + assert Xt_1.iloc[0, 1] == Xt.iloc[0, 1] + assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + inv_Xt_1 = enc.inverse_transform(Xt_1) + + assert inv_Xt.equals(X) + assert inv_Xt_1.equals(X_1) + + assert enc.n_features_in_ == 2 + + +def test_ordinal_encoder_array() -> None: + X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]}).values + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + X_1 = DataFrame({"A": [1, 1], "B": [1, 2]}).values + Xt_1 = enc.transform(X_1) + + assert Xt_1[0, 0] == Xt[1, 0] + assert Xt_1[1, 0] == Xt[1, 0] + assert Xt_1[0, 1] == Xt[0, 1] + assert Xt_1[1, 1] == Xt[2, 1] + + inv_Xt = enc.inverse_transform(Xt) + inv_Xt_1 = enc.inverse_transform(Xt_1) + + cp.testing.assert_allclose(X, inv_Xt) + cp.testing.assert_allclose(X_1, inv_Xt_1) + + assert enc.n_features_in_ == 2 + + +def test_ordinal_array() -> None: + X = cp.arange(32).reshape(16, 2) + + enc = OrdinalEncoder() + enc.fit(X) + Xt = enc.transform(X) + + Xh = cp.asnumpy(X) + skenc = skOrdinalEncoder() + skenc.fit(Xh) + Xt_sk = skenc.transform(Xh) + + cp.testing.assert_allclose(Xt, Xt_sk) + + +def test_output_type(test_sample) -> None: + X = test_sample + enc = OrdinalEncoder(output_type="cupy").fit(X) + assert isinstance(enc.transform(X), cp.ndarray) + enc = OrdinalEncoder(output_type="cudf").fit(X) + assert isinstance(enc.transform(X), DataFrame) + enc = OrdinalEncoder(output_type="pandas").fit(X) + assert isinstance(enc.transform(X), pd.DataFrame) + enc = OrdinalEncoder(output_type="numpy").fit(X) + assert isinstance(enc.transform(X), np.ndarray) + # output_type == "input" + enc = OrdinalEncoder().fit(X) + assert isinstance(enc.transform(X), DataFrame) + + +def test_feature_names(test_sample) -> None: + enc = OrdinalEncoder().fit(test_sample) + assert enc.feature_names_in_ == ["cat", "num"] + + +@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"]) +def test_handle_unknown(as_array: bool) -> None: + X = DataFrame({"data": [0, 1]}) + Y = DataFrame({"data": [3, 1]}) + + if as_array: + X = X.values + Y = Y.values + + enc = OrdinalEncoder(handle_unknown="error") + enc = enc.fit(X) + with pytest.raises(KeyError): + enc.transform(Y) + + enc = OrdinalEncoder(handle_unknown="ignore") + enc = enc.fit(X) + encoded = enc.transform(Y) + if as_array: + np.isnan(encoded[0, 0]) + else: + assert pd.isna(encoded.iloc[0, 0]) diff --git a/python/cuml/tests/test_pickle.py b/python/cuml/tests/test_pickle.py index 9d99aab208..e1cfc84609 100644 --- a/python/cuml/tests/test_pickle.py +++ b/python/cuml/tests/test_pickle.py @@ -705,9 +705,6 @@ def assert_second_model(pickled_model, X): def test_svc_pickle(tmpdir, datatype, params, multiclass, sparse): result = {} - if sparse and multiclass: - pytest.skip("Multiclass SVC does not support sparse input") - if sparse and params["probability"]: pytest.skip("Probabilistic SVC does not support sparse input") @@ -740,6 +737,43 @@ def assert_model(pickled_model, data): pickle_save_load(tmpdir, create_mod, assert_model) +@pytest.mark.parametrize("datatype", [np.float32, np.float64]) +@pytest.mark.parametrize( + "params", [{"probability": True}, {"probability": False}] +) +@pytest.mark.parametrize("multiclass", [True, False]) +def test_linear_svc_pickle(tmpdir, datatype, params, multiclass): + result = {} + + def create_mod(): + model = cuml.svm.LinearSVC(**params) + iris = load_iris() + iris_selection = np.random.RandomState(42).choice( + [True, False], 150, replace=True, p=[0.75, 0.25] + ) + X_train = iris.data[iris_selection] + y_train = iris.target[iris_selection] + if not multiclass: + y_train = (y_train > 0).astype(datatype) + data = [X_train, y_train] + result["model"] = model.fit(X_train, y_train) + return model, data + + def assert_model(pickled_model, data): + if result["model"].probability: + print("Comparing probabilistic LinearSVC") + compare_probabilistic_svm( + result["model"], pickled_model, data[0], data[1], 0, 0 + ) + else: + print("comparing base LinearSVC") + pred_before = result["model"].predict(data[0]) + pred_after = pickled_model.predict(data[0]) + assert array_equal(pred_before, pred_after) + + pickle_save_load(tmpdir, create_mod, assert_model) + + @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("nrows", [unit_param(500)]) @pytest.mark.parametrize("ncols", [unit_param(16)]) diff --git a/python/cuml/tests/test_svm.py b/python/cuml/tests/test_svm.py index a0f82f5dc5..5ae8895be1 100644 --- a/python/cuml/tests/test_svm.py +++ b/python/cuml/tests/test_svm.py @@ -41,8 +41,8 @@ np = cpu_only_import("numpy") cuda = gpu_only_import_from("numba", "cuda") - cudf = gpu_only_import("cudf") +scipy_sparse = cpu_only_import("scipy.sparse") IS_ARM = platform.processor() == "aarch64" @@ -176,13 +176,18 @@ def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols): @pytest.mark.parametrize("params", [{"kernel": "rbf", "C": 1, "gamma": 1}]) +@pytest.mark.parametrize("sparse", [True, False]) def test_svm_skl_cmp_multiclass( - params, dataset="classification2", n_rows=100, n_cols=6 + params, sparse, dataset="classification2", n_rows=100, n_cols=6 ): X_train, X_test, y_train, y_test = make_dataset( dataset, n_rows, n_cols, n_classes=3, n_informative=6 ) + if sparse: + X_train = scipy_sparse.csr_matrix(X_train) + X_test = scipy_sparse.csr_matrix(X_test) + # Default to numpy for testing with cuml.using_output_type("numpy"): diff --git a/python/pyproject.toml b/python/pyproject.toml index 32f1b7a59e..34cad4c705 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -48,7 +48,7 @@ filterwarnings = [ [project] name = "cuml" -version = "23.12.00" +dynamic = ["version"] description = "cuML - RAPIDS ML Algorithms" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -61,11 +61,10 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.12.*", "dask-cudf==23.12.*", - "dask==2023.9.2", - "distributed==2023.9.2", "joblib>=0.11", "numba>=0.57", "raft-dask==23.12.*", + "rapids-dask-dependency==23.12.*", "scipy>=1.8.0", "treelite==3.9.1", "treelite_runtime==3.9.1", @@ -79,7 +78,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "dask-glm @ git+https://github.com/dask/dask-glm@main", + "dask-glm==0.3.0", "dask-ml", "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master", "hypothesis>=6.0,<7", @@ -104,6 +103,9 @@ Documentation = "https://docs.rapids.ai/api/cuml/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cuml/VERSION"} + [tool.black] line-length = 79 target-version = ["py39"] diff --git a/python/setup.py b/python/setup.py index 8e6615a2ef..5a30d78201 100644 --- a/python/setup.py +++ b/python/setup.py @@ -109,6 +109,6 @@ def clean_folder(path): packages = find_packages(include=["cuml*"]) setup( packages=packages, - package_data={key: ["*.pxd"] for key in packages}, + package_data={key: ["VERSION", "*.pxd"] for key in packages}, zip_safe=False, )