From 02960433bfe597dca371f6e4a0d645fae041d132 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 3 Nov 2023 12:39:56 -0700
Subject: [PATCH 01/18] Use drop_duplicates instead of unique for cudf's
 pandas compatibility mode (#5639)

In pandas, `Series.unique` returns a numpy array (for non-extension
types) while `Series.drop_duplicates` returns a `Series`. The two
results should otherwise contain the same set of values. In cudf,
historically both methods returned a `Series`, and at these stages in
cuml's pipeline it knows that it is working with cudf objects. However,
if cudf has pandas compatibility mode enabled, then `unique` will return
an array to match pandas behavior. In this scenario, the method chaining
no longer works because cuml is calling methods on the result of
`unique` assuming that it will be a `Series`. To fix this, cuml needs to
call `drop_duplicates` instead.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Simon Adorf (https://github.com/csadorf)

URL: https://github.com/rapidsai/cuml/pull/5639
---
 python/cuml/feature_extraction/_vectorizers.py | 4 +++-
 python/cuml/preprocessing/LabelEncoder.py      | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cuml/feature_extraction/_vectorizers.py b/python/cuml/feature_extraction/_vectorizers.py
index 78172ec690..0133195b20 100644
--- a/python/cuml/feature_extraction/_vectorizers.py
+++ b/python/cuml/feature_extraction/_vectorizers.py
@@ -598,7 +598,9 @@ def fit_transform(self, raw_documents, y=None):
         if self._fixed_vocabulary:
             self.vocabulary_ = self.vocabulary
         else:
-            self.vocabulary_ = tokenized_df["token"].unique().sort_values()
+            self.vocabulary_ = (
+                tokenized_df["token"].drop_duplicates().sort_values()
+            )
 
         count_df = self._count_vocab(tokenized_df)
 
diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py
index 882e552511..c8221ff951 100644
--- a/python/cuml/preprocessing/LabelEncoder.py
+++ b/python/cuml/preprocessing/LabelEncoder.py
@@ -180,7 +180,7 @@ def fit(self, y, _classes=None):
         if _classes is not None:
             self.classes_ = _classes
         else:
-            self.classes_ = y.unique().sort_values(
+            self.classes_ = y.drop_duplicates().sort_values(
                 ignore_index=True
             )  # dedupe and sort

From b79c09f6ea59cf9dfe48da888281c4048db560e5 Mon Sep 17 00:00:00 2001
From: Simon Adorf
Date: Wed, 8 Nov 2023 21:15:06 +0100
Subject: [PATCH 02/18] CI: Fix expected ValueError and dask-glm
 incompatibility (#5644)

- Expect all supported types and shapes to work with host mem type.
- Pin dask-glm to 0.3.0.
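A minimal sketch of what the renamed array test now asserts: plain construction must succeed for every supported input type, shape, and mem type. The import path and values below are illustrative assumptions, not taken from this diff:

```python
import numpy as np
from cuml.internals.array import CumlArray  # import path assumed

# The test previously also asserted ValueError for mismatched dtype/shape
# overrides; now it only checks that supported inputs construct cleanly.
input_array = np.zeros((4, 3), dtype=np.float32)
array = CumlArray(input_array)
assert array.shape == input_array.shape
```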
Authors: - Simon Adorf (https://github.com/csadorf) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuml/pull/5644 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 ++---- python/cuml/tests/test_array.py | 13 +++---------- python/pyproject.toml | 2 +- 5 files changed, 8 insertions(+), 17 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index abbf54a4c2..8d4b9ab3ce 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -77,5 +77,5 @@ dependencies: - treelite==3.9.1 - umap-learn==0.5.3 - pip: - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d2c8747d35..af119ecb72 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -73,5 +73,5 @@ dependencies: - treelite==3.9.1 - umap-learn==0.5.3 - pip: - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 name: all_cuda-120_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 4387a85fdd..568781a45f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -346,15 +346,13 @@ dependencies: packages: - pip - pip: - # TODO: Figure out what to do with this dependency - # since the repo is now archived. - - git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 # TODO: remove pin once a release that includes fixes for the error # is released: https://github.com/rapidsai/cuml/issues/5514 - hdbscan<=0.8.30 - output_types: pyproject packages: - - dask-glm @ git+https://github.com/dask/dask-glm@main + - dask-glm==0.3.0 # TODO: Can we stop pulling from the master branch now that there was a release in October? - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master test_notebooks: diff --git a/python/cuml/tests/test_array.py b/python/cuml/tests/test_array.py index e2e02367c3..c4c479506c 100644 --- a/python/cuml/tests/test_array.py +++ b/python/cuml/tests/test_array.py @@ -201,10 +201,10 @@ def test_array_init_from_bytes(data_type, dtype, shape, order, mem_type): mem_type=cuml_array_mem_types(), ) @settings(deadline=None) -def test_array_init_bad(input_type, dtype, shape, order, mem_type): +def test_array_mem_type(input_type, dtype, shape, order, mem_type): """ - This test ensures that we assert on incorrect combinations of arguments - when creating CumlArray + Test whether we can create CumlArray from all supported types and array + shapes on all supported mem types. 
""" mem_type = MemoryType.from_str(mem_type) @@ -214,13 +214,6 @@ def test_array_init_bad(input_type, dtype, shape, order, mem_type): # Ensure the array is creatable array = CumlArray(input_array) - with pytest.raises(ValueError): - bad_dtype = np.float16 if dtype != np.float16 else np.float32 - CumlArray(input_array, dtype=bad_dtype) - - with pytest.raises(ValueError): - CumlArray(input_array, shape=(*array.shape, 1)) - input_mem_type = determine_array_memtype(input_array) if input_mem_type.is_device_accessible: joint_mem_type = input_mem_type diff --git a/python/pyproject.toml b/python/pyproject.toml index 346f270395..ed9b4fd45c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -79,7 +79,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "dask-glm @ git+https://github.com/dask/dask-glm@main", + "dask-glm==0.3.0", "dask-ml", "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master", "hypothesis>=6.0,<7", From 9fed69eb3ca676e607f656ec6831decfe0d816f0 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 9 Nov 2023 16:27:05 +0100 Subject: [PATCH 03/18] Add rich HTML representation to estimators (#5630) This adds a Jupyter (and other notebook) rich display hook that produces a HTML widget to represent an estimator in notebooks. This adds the basics of having estimators displayed as HTML widgets in notebooks and other editors that use the Jupyter notebook "rich display" system. Screenshot 2023-10-26 at 15 33 37 This doesn't yet contain the cool feature of changing colour depending on fit status or the link to the documentation. For that we'd have to depend on a newer version of scikit-learn (or vendor the logic). In this case "newer" actually means "the next version to be released". WDYT? Authors: - Tim Head (https://github.com/betatim) - Simon Adorf (https://github.com/csadorf) - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5630 --- python/cuml/internals/base.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cuml/internals/base.pyx b/python/cuml/internals/base.pyx index 4fb03fdac9..09dda66064 100644 --- a/python/cuml/internals/base.pyx +++ b/python/cuml/internals/base.pyx @@ -28,6 +28,8 @@ from cuml.internals.safe_imports import ( np = cpu_only_import('numpy') nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator) +from sklearn.utils import estimator_html_repr + import cuml import cuml.common import cuml.internals.logger as logger @@ -443,6 +445,12 @@ class Base(TagsMixin, return {'preserves_dtype': [self.dtype]} return {} + def _repr_mimebundle_(self, **kwargs): + """Prepare representations used by jupyter kernels to display estimator""" + output = {"text/plain": repr(self)} + output["text/html"] = estimator_html_repr(self) + return output + def set_nvtx_annotations(self): for func_name in ['fit', 'transform', 'predict', 'fit_transform', 'fit_predict']: From 6d5118595db3b9e6800937f1e664e3ce41feedb7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 9 Nov 2023 12:59:54 -0800 Subject: [PATCH 04/18] Flatten cupy array before feeding to cudf.Series (#5651) Previously it seems that cudf was silently flattening 2D arrays when passing them to the cudf.Series constructor, but that is no longer supported here so the test code needs to be updated. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5651 --- python/cuml/tests/test_input_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/tests/test_input_utils.py b/python/cuml/tests/test_input_utils.py index a3aef28446..fbbcde2105 100644 --- a/python/cuml/tests/test_input_utils.py +++ b/python/cuml/tests/test_input_utils.py @@ -396,7 +396,7 @@ def get_input( result = cudf.DataFrame(rand_mat, index=index) if type == "cudf-series": - result = cudf.Series(rand_mat, index=index) + result = cudf.Series(rand_mat.reshape(nrows), index=index) if type == "pandas": result = pdDF(cp.asnumpy(rand_mat), index=index) From 325a7e65e0b3dd9dd0e0e6fbd061fc0f11fcd87c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 13 Nov 2023 10:27:26 -0600 Subject: [PATCH 05/18] Use new `rapids-dask-dependency` metapackage for managing `dask` versions (#5649) Currently dask versions are pinned as part of every release cycle and then unpinned for the next development cycle across all of RAPIDS. This introduces a great deal of churn. To centralize the dependency, we have created a metapackage to manage the required dask version and this PR introduces that metapackage as a dependency of cuml. xref: https://github.com/rapidsai/cudf/pull/14364 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Simon Adorf (https://github.com/csadorf) Approvers: - Simon Adorf (https://github.com/csadorf) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cuml/pull/5649 --- ci/build_wheel.sh | 4 ++++ ci/release/update-version.sh | 11 ++++------- ci/test_wheel.sh | 3 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 +--- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 +--- conda/recipes/cuml/meta.yaml | 4 +--- dependencies.yaml | 6 +----- python/README.md | 7 +++---- python/pyproject.toml | 3 +-- 9 files changed, 16 insertions(+), 30 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 0231726b82..e4941ad1a8 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -38,6 +38,10 @@ for dep in cudf pylibraft raft-dask rmm; do sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} done +for dep in dask-cuda rapids-dask-dependency; do + sed -r -i "s/${dep}==(.*)\"/${dep}==\1${alpha_spec}\"/g" ${pyproject_file} +done + if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then sed -i "s/cuda-python[<=>\.,0-9]*/cuda-python>=12.0,<13.0/g" ${pyproject_file} sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9b07a58476..6e8f43cea7 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -73,6 +73,7 @@ DEPENDENCIES=( librmm pylibraft raft-dask + rapids-dask-dependency rmm ) for FILE in dependencies.yaml conda/environments/*.yaml; do @@ -81,17 +82,13 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do done done -sed_runner "s|/branch-.*?/|/branch-${NEXT_SHORT_TAG}/|g" README.md -sed_runner "s|/branch-.*?/|/branch-${NEXT_SHORT_TAG}/|g" python/README.md +sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" README.md +sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" python/README.md +sed_runner "/- rapids-dask-dependency==/ s/==.*/==${NEXT_SHORT_TAG}\.*/g" python/README.md # Wheel builds clone cumlprims_mg, update its branch sed_runner "s/extra-repo-sha: 
branch-.*/extra-repo-sha: branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml -# Wheel builds install dask-cuda from source, update its branch -for FILE in .github/workflows/*.yaml; do - sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; -done - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index f6c61eabac..d1cb6e8e27 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -12,9 +12,6 @@ if [[ "$(arch)" == "aarch64" ]]; then python -m pip install cmake fi -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cuml*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8d4b9ab3ce..b650ab412b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,12 +16,9 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - dask-ml -- dask>=2023.9.2 -- distributed>=2023.9.2 - doxygen=1.9.1 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -63,6 +60,7 @@ dependencies: - pytest-xdist - python>=3.9,<3.11 - raft-dask==23.12.* +- rapids-dask-dependency==23.12.* - recommonmark - rmm==23.12.* - scikit-build>=0.13.1 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index af119ecb72..ffe3e3d0ff 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -18,12 +18,9 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - dask-ml -- dask>=2023.9.2 -- distributed>=2023.9.2 - doxygen=1.9.1 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -59,6 +56,7 @@ dependencies: - pytest-xdist - python>=3.9,<3.11 - raft-dask==23.12.* +- rapids-dask-dependency==23.12.* - recommonmark - rmm==23.12.* - scikit-build>=0.13.1 diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml index d4767a00bc..bcafb63bb6 100644 --- a/conda/recipes/cuml/meta.yaml +++ b/conda/recipes/cuml/meta.yaml @@ -76,15 +76,13 @@ requirements: - cudf ={{ minor_version }} - cupy >=12.0.0 - dask-cudf ={{ minor_version }} - - dask >=2023.9.2 - - dask-core>=2023.9.2 - - distributed >=2023.9.2 - joblib >=0.11 - libcuml ={{ version }} - libcumlprims ={{ minor_version }} - pylibraft ={{ minor_version }} - python x.x - raft-dask ={{ minor_version }} + - rapids-dask-dependency ={{ minor_version }} - treelite {{ treelite_version }} tests: diff --git a/dependencies.yaml b/dependencies.yaml index 568781a45f..d6dfc19714 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -175,10 +175,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cudf==23.12.* - - dask>=2023.9.2 - dask-cuda==23.12.* - dask-cudf==23.12.* - - distributed>=2023.9.2 - joblib>=0.11 - numba>=0.57 # TODO: Is scipy really a hard dependency, or should @@ -186,13 +184,11 @@ dependencies: # installation/run_constrained for conda)? 
- scipy>=1.8.0 - raft-dask==23.12.* + - rapids-dask-dependency==23.12.* - *treelite - output_types: [conda, requirements] packages: - cupy>=12.0.0 - - output_types: conda - packages: - - dask-core>=2023.9.2 - output_types: pyproject packages: - *treelite_runtime diff --git a/python/README.md b/python/README.md index 342ad177ab..4a19e16e99 100644 --- a/python/README.md +++ b/python/README.md @@ -38,7 +38,7 @@ example `setup.py --singlegpu`) are: RAFT's Python and Cython is located in the [RAFT repository](https://github.com/rapidsai/raft/python). It was designed to be included in projects as opposed to be distributed by itself, so at build time, **setup.py creates a symlink from cuML, located in `/python/cuml/raft/` to the Python folder of RAFT**. -For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-23.04/BUILD.md#developer-guide) for recommendations. +For developers that need to modify RAFT code, please refer to the [RAFT Developer Guide](https://github.com/rapidsai/raft/blob/branch-23.12/docs/source/build.md) for recommendations. To configure RAFT at build time: @@ -50,7 +50,7 @@ The RAFT Python code gets included in the cuML build and distributable artifacts ### Build Requirements -cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-23.04/environments) includes all dependencies required to build cuML. +cuML's convenience [development yaml files](https://github.com/rapidsai/cuml/tree/branch-23.12/environments) includes all dependencies required to build cuML. To build cuML's Python package, the following dependencies are required: @@ -70,8 +70,7 @@ Packages required for multigpu algorithms*: - ucx-py version matching the cuML version - dask-cudf version matching the cuML version - nccl>=2.5 -- dask>=2023.9.2 -- distributed>=2023.9.2 +- rapids-dask-dependency==23.12.* * this can be avoided with `--singlegpu` argument flag. diff --git a/python/pyproject.toml b/python/pyproject.toml index ed9b4fd45c..34cad4c705 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -61,11 +61,10 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.12.*", "dask-cudf==23.12.*", - "dask>=2023.9.2", - "distributed>=2023.9.2", "joblib>=0.11", "numba>=0.57", "raft-dask==23.12.*", + "rapids-dask-dependency==23.12.*", "scipy>=1.8.0", "treelite==3.9.1", "treelite_runtime==3.9.1", From a5b839f8f2e2ca6647947286b98e39c2d7399ab9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 14 Nov 2023 09:03:22 -0800 Subject: [PATCH 06/18] Simplify some logic in LabelEncoder (#5648) I accidentally committed but forgot to push some changes requested by @csadorf in https://github.com/rapidsai/cuml/pull/5639. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Simon Adorf (https://github.com/csadorf) Approvers: - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5648 --- python/cuml/preprocessing/LabelEncoder.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index c8221ff951..aceed2766a 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -171,19 +171,19 @@ def fit(self, y, _classes=None): A fitted instance of itself to allow method chaining """ - if _classes is None: - y = self._to_cudf_series(y) - self._validate_keywords() - self.dtype = y.dtype if y.dtype != cp.dtype("O") else str - if _classes is not None: - self.classes_ = _classes - else: - self.classes_ = y.drop_duplicates().sort_values( - ignore_index=True + if _classes is None: + y = ( + self._to_cudf_series(y) + .drop_duplicates() + .sort_values(ignore_index=True) ) # dedupe and sort + self.classes_ = y + else: + self.classes_ = _classes + self.dtype = y.dtype if y.dtype != cp.dtype("O") else str self._fitted = True return self From f2e9459ad41e068adc1090fd4ce4849c6284650b Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 14 Nov 2023 20:58:00 +0100 Subject: [PATCH 07/18] Adjust assumption regarding valid cudf.Series dimensional input. (#5654) cudf.Series does not accept any multi-dimensional input anymore. Authors: - Simon Adorf (https://github.com/csadorf) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - William Hicks (https://github.com/wphicks) URL: https://github.com/rapidsai/cuml/pull/5654 --- python/cuml/testing/strategies.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cuml/testing/strategies.py b/python/cuml/testing/strategies.py index 39b2068e5d..a8f849866a 100644 --- a/python/cuml/testing/strategies.py +++ b/python/cuml/testing/strategies.py @@ -187,9 +187,7 @@ def create_cuml_array_input(input_type, dtype, shape, order): input_type = "cupy" if input_type is None else input_type - multidimensional = ( - isinstance(shape, tuple) and len([d for d in shape if d > 1]) > 1 - ) + multidimensional = isinstance(shape, tuple) and len(shape) > 1 assume( not ( input_type == "series" From 8b07b000b550ef2af88910355943d728d0489b8b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 17 Nov 2023 10:36:46 -0600 Subject: [PATCH 08/18] Enable build concurrency for nightly and merge triggers. (#5658) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 776c7ae761..63bc954711 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From c48eaa18abf6f083cf8b347c3061cf7492616af6 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Sat, 18 Nov 2023 10:10:50 +0100 Subject: [PATCH 09/18] CI: Pin clang-tidy to 15.0.7. (#5661) Most recent supported version by libcudacxx. Compilation introduced as a transitive dependency from rmm. 
Authors: - Simon Adorf (https://github.com/csadorf) Approvers: - Bradley Dice (https://github.com/bdice) - Dante Gama Dessavre (https://github.com/dantegd) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cuml/pull/5661 --- conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml | 4 ++-- cpp/scripts/run-clang-tidy.py | 2 +- dependencies.yaml | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml index 515abd8929..3f63d4b3f6 100644 --- a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml +++ b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml @@ -8,8 +8,8 @@ channels: - nvidia dependencies: - c-compiler -- clang-tools==16.0.6 -- clang==16.0.6 +- clang-tools==15.0.7 +- clang==15.0.7 - cmake>=3.26.4 - cuda-version=11.8 - cudatoolkit diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py index 678534b899..67189573f9 100755 --- a/cpp/scripts/run-clang-tidy.py +++ b/cpp/scripts/run-clang-tidy.py @@ -25,7 +25,7 @@ import tomli -EXPECTED_VERSION = "16.0.6" +EXPECTED_VERSION = "15.0.7" VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") SPACES = re.compile(r"\s+") diff --git a/dependencies.yaml b/dependencies.yaml index d6dfc19714..1a875c75ec 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -95,8 +95,9 @@ dependencies: common: - output_types: [conda, requirements] packages: - - clang==16.0.6 - - clang-tools==16.0.6 + # clang 15 required by libcudacxx. + - clang==15.0.7 + - clang-tools==15.0.7 - ninja - tomli common_build: From f79d40fe92560033be518d1a2a12d35e42cba9c8 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 21 Nov 2023 16:20:50 +0100 Subject: [PATCH 10/18] Avoid hard import of sklearn in base module. (#5663) Fixes #5662. 
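With the guarded import in the diff below, the rich repr simply degrades when scikit-learn is absent. A rough sketch of the intended behavior (estimator chosen arbitrarily):

```python
import cuml

model = cuml.LinearRegression()
bundle = model._repr_mimebundle_()
# With scikit-learn installed: {'text/plain': '...', 'text/html': '...'}
# Without it: None, so Jupyter falls back to the plain repr()
```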
Authors: - Simon Adorf (https://github.com/csadorf) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuml/pull/5663 --- python/cuml/internals/base.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cuml/internals/base.pyx b/python/cuml/internals/base.pyx index 09dda66064..c00ed17f98 100644 --- a/python/cuml/internals/base.pyx +++ b/python/cuml/internals/base.pyx @@ -28,7 +28,11 @@ from cuml.internals.safe_imports import ( np = cpu_only_import('numpy') nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator) -from sklearn.utils import estimator_html_repr +try: + from sklearn.utils import estimator_html_repr +except ImportError: + estimator_html_repr = None + import cuml import cuml.common @@ -447,9 +451,10 @@ class Base(TagsMixin, def _repr_mimebundle_(self, **kwargs): """Prepare representations used by jupyter kernels to display estimator""" - output = {"text/plain": repr(self)} - output["text/html"] = estimator_html_repr(self) - return output + if estimator_html_repr is not None: + output = {"text/plain": repr(self)} + output["text/html"] = estimator_html_repr(self) + return output def set_nvtx_annotations(self): for func_name in ['fit', 'transform', 'predict', 'fit_transform', From 1570ed736402e0edaca479bfb3135e41bca1269a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Tue, 21 Nov 2023 22:19:39 +0100 Subject: [PATCH 11/18] Enable multiclass svm for sparse input (#5588) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit enables multiclass SVM for sparse input. Previously this was deactivated as the `input_to_host_array` functionality does not support sparse arrays, but the data has to be piped through sklearn classes which requires host data. @dantegd , this is a local workaround to enable a `input_to_host_array` for sparse data without the complexity of providing the whole functionality of that function. Please have a look whether this is an acceptable solution for this use case. 
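Roughly, the new helper routes sparse device inputs through SciPy on the host (a sketch; shapes and density are arbitrary):

```python
import numpy as np
import cupyx.scipy.sparse as cpx_sparse
from cuml.common import input_to_host_array_with_sparse_support

X_gpu = cpx_sparse.random(100, 6, density=0.1, format="csr", dtype=np.float32)
X_host = input_to_host_array_with_sparse_support(X_gpu)  # scipy.sparse CSR on the host
# Dense and host inputs fall through to the existing input_to_host_array path.
```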
FYI, @tfeher Authors: - Malte Fรถrster (https://github.com/mfoerste4) - Tamas Bela Feher (https://github.com/tfeher) - Simon Adorf (https://github.com/csadorf) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5588 --- cpp/src/svm/kernelcache.cuh | 12 ++++++++++-- python/cuml/common/__init__.py | 4 +++- python/cuml/internals/input_utils.py | 14 ++++++++++++++ python/cuml/multiclass/multiclass.py | 19 +++++++++++++++---- python/cuml/svm/svc.pyx | 10 ++++------ python/cuml/tests/test_pickle.py | 3 --- python/cuml/tests/test_svm.py | 9 +++++++-- 7 files changed, 53 insertions(+), 18 deletions(-) diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh index e20c4d5d12..4583581d2d 100644 --- a/cpp/src/svm/kernelcache.cuh +++ b/cpp/src/svm/kernelcache.cuh @@ -130,8 +130,16 @@ class BatchCache : public raft::cache::Cache { RAFT_CUDA_TRY(cudaMemsetAsync(tmp_buffer, 0, n_ws * 2 * sizeof(int), stream)); // Init cub buffers - cub::DeviceRadixSort::SortKeys( - NULL, d_temp_storage_size, tmp_buffer, tmp_buffer, n_ws, 0, sizeof(int) * 8, stream); + cub::DeviceRadixSort::SortPairs(NULL, + d_temp_storage_size, + tmp_buffer, + tmp_buffer, + tmp_buffer, + tmp_buffer, + n_ws, + 0, + sizeof(int) * 8, + stream); d_temp_storage.resize(d_temp_storage_size, stream); } diff --git a/python/cuml/common/__init__.py b/python/cuml/common/__init__.py index 6a46462878..e267bf668b 100644 --- a/python/cuml/common/__init__.py +++ b/python/cuml/common/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -31,6 +31,7 @@ from cuml.internals.input_utils import input_to_cuml_array from cuml.internals.input_utils import input_to_host_array +from cuml.internals.input_utils import input_to_host_array_with_sparse_support from cuml.internals.memory_utils import rmm_cupy_ary from cuml.internals.memory_utils import set_global_output_type @@ -59,6 +60,7 @@ "has_scipy", "input_to_cuml_array", "input_to_host_array", + "input_to_host_array_with_sparse_support", "rmm_cupy_ary", "set_global_output_type", "using_device_type", diff --git a/python/cuml/internals/input_utils.py b/python/cuml/internals/input_utils.py index bb9e8bc3e3..edcbffabaa 100644 --- a/python/cuml/internals/input_utils.py +++ b/python/cuml/internals/input_utils.py @@ -497,6 +497,20 @@ def input_to_host_array( return out_data._replace(array=out_data.array.to_output("numpy")) +def input_to_host_array_with_sparse_support(X): + _array_type, is_sparse = determine_array_type_full(X) + if is_sparse: + if _array_type == "cupy": + return SparseCumlArray(X).to_output(output_type="scipy") + elif _array_type == "cuml": + return X.to_output(output_type="scipy") + elif _array_type == "numpy": + return X + else: + raise ValueError(f"Unsupported sparse array type: {_array_type}.") + return input_to_host_array(X).array + + def convert_dtype(X, to_dtype=np.float32, legacy=True, safe_dtype=True): """ Convert X to be of dtype `dtype`, raising a TypeError diff --git a/python/cuml/multiclass/multiclass.py b/python/cuml/multiclass/multiclass.py index e97de7256b..65b378a17b 100644 --- a/python/cuml/multiclass/multiclass.py +++ b/python/cuml/multiclass/multiclass.py @@ -20,7 +20,15 @@ from cuml.internals.import_utils import has_sklearn from cuml.internals.mixins import ClassifierMixin from cuml.common.doc_utils import generate_docstring -from cuml.common import input_to_host_array +from cuml.common import ( + input_to_host_array, + input_to_host_array_with_sparse_support, +) +from cuml.internals.input_utils import ( + input_to_cupy_array, + determine_array_type_full, +) +from cuml.internals.array_sparse import SparseCumlArray from cuml.internals import _deprecate_pos_args @@ -142,7 +150,9 @@ def fit(self, X, y) -> "MulticlassClassifier": + ", must be one of " '{"ovr", "ovo"}' ) - X = input_to_host_array(X).array + + X = input_to_host_array_with_sparse_support(X) + y = input_to_host_array(y).array with cuml.internals.exit_internal_api(): self.multiclass_estimator.fit(X, y) @@ -160,7 +170,8 @@ def predict(self, X) -> CumlArray: """ Predict using multi class classifier. """ - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) + with cuml.internals.exit_internal_api(): return self.multiclass_estimator.predict(X) @@ -177,7 +188,7 @@ def decision_function(self, X) -> CumlArray: """ Calculate the decision function. 
""" - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) with cuml.internals.exit_internal_api(): return self.multiclass_estimator.decision_function(X) diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx index 2fff2672fd..d5d5c35e3e 100644 --- a/python/cuml/svm/svc.pyx +++ b/python/cuml/svm/svc.pyx @@ -35,7 +35,7 @@ from cuml.common.doc_utils import generate_docstring from cuml.internals.logger import warn from pylibraft.common.handle cimport handle_t from pylibraft.common.interruptible import cuda_interruptible -from cuml.common import input_to_cuml_array, input_to_host_array +from cuml.common import input_to_cuml_array, input_to_host_array, input_to_host_array_with_sparse_support from cuml.internals.input_utils import input_to_cupy_array, determine_array_type_full from cuml.preprocessing import LabelEncoder from libcpp cimport nullptr @@ -449,7 +449,7 @@ class SVC(SVMBase, # Currently CalibratedClassifierCV expects data on the host, see # https://github.com/rapidsai/cuml/issues/2608 - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) y = input_to_host_array(y).array if not has_sklearn(): @@ -485,8 +485,6 @@ class SVC(SVMBase, return self._fit_proba(X, y, sample_weight) if self.n_classes_ > 2: - if is_sparse: - raise ValueError("Multiclass SVM does not support sparse input.") return self._fit_multiclass(X, y, sample_weight) if is_sparse: @@ -594,7 +592,7 @@ class SVC(SVMBase, if self.probability: self._check_is_fitted('prob_svc') - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) with cuml.internals.exit_internal_api(): preds = self.prob_svc.predict(X) @@ -628,7 +626,7 @@ class SVC(SVMBase, if self.probability: self._check_is_fitted('prob_svc') - X = input_to_host_array(X).array + X = input_to_host_array_with_sparse_support(X) # Exit the internal API when calling sklearn code (forces numpy # conversion) diff --git a/python/cuml/tests/test_pickle.py b/python/cuml/tests/test_pickle.py index 950b00a612..e1cfc84609 100644 --- a/python/cuml/tests/test_pickle.py +++ b/python/cuml/tests/test_pickle.py @@ -705,9 +705,6 @@ def assert_second_model(pickled_model, X): def test_svc_pickle(tmpdir, datatype, params, multiclass, sparse): result = {} - if sparse and multiclass: - pytest.skip("Multiclass SVC does not support sparse input") - if sparse and params["probability"]: pytest.skip("Probabilistic SVC does not support sparse input") diff --git a/python/cuml/tests/test_svm.py b/python/cuml/tests/test_svm.py index a0f82f5dc5..5ae8895be1 100644 --- a/python/cuml/tests/test_svm.py +++ b/python/cuml/tests/test_svm.py @@ -41,8 +41,8 @@ np = cpu_only_import("numpy") cuda = gpu_only_import_from("numba", "cuda") - cudf = gpu_only_import("cudf") +scipy_sparse = cpu_only_import("scipy.sparse") IS_ARM = platform.processor() == "aarch64" @@ -176,13 +176,18 @@ def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols): @pytest.mark.parametrize("params", [{"kernel": "rbf", "C": 1, "gamma": 1}]) +@pytest.mark.parametrize("sparse", [True, False]) def test_svm_skl_cmp_multiclass( - params, dataset="classification2", n_rows=100, n_cols=6 + params, sparse, dataset="classification2", n_rows=100, n_cols=6 ): X_train, X_test, y_train, y_test = make_dataset( dataset, n_rows, n_cols, n_classes=3, n_informative=6 ) + if sparse: + X_train = scipy_sparse.csr_matrix(X_train) + X_test = scipy_sparse.csr_matrix(X_test) + # Default to numpy for testing with cuml.using_output_type("numpy"): From 
21fbf04d3c60aa0e0958b22c1e1130b24bb31ef9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 22 Nov 2023 05:32:44 +0800 Subject: [PATCH 12/18] Basic implementation of `OrdinalEncoder`. (#5646) - Implement `OrdinalEncoder`. - Implement dask version. - Fix dask transformers with DataFrame input by using `dask_cudf` to construct return df. Some other scikit-learn features are not available yet, for instance, `encoded_missing_value`, `min_frequency`, and `max_categories`. The implementation is mostly based on the existing one hot encoder and label encoder. I'm a bit confused by the `output_type` parameter and not sure how strictly it's enforced. I looked around, it seems some estimators can ignore this parameter in their returns. Would be great if there's a guideline on how to handle this parameter, along with https://github.com/rapidsai/cuml/issues/5645 . Close https://github.com/rapidsai/cuml/issues/4456 . Authors: - Jiaming Yuan (https://github.com/trivialfis) - Simon Adorf (https://github.com/csadorf) Approvers: - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5646 --- python/cuml/common/doc_utils.py | 3 +- python/cuml/dask/common/base.py | 3 +- python/cuml/dask/preprocessing/__init__.py | 3 +- python/cuml/dask/preprocessing/encoders.py | 173 +++++-- python/cuml/preprocessing/__init__.py | 3 +- python/cuml/preprocessing/encoders.py | 465 +++++++++++++----- .../cuml/preprocessing/ordinalencoder_mg.py | 49 ++ .../tests/dask/test_dask_ordinal_encoder.py | 117 +++++ python/cuml/tests/test_ordinal_encoder.py | 133 +++++ 9 files changed, 777 insertions(+), 172 deletions(-) create mode 100644 python/cuml/preprocessing/ordinalencoder_mg.py create mode 100644 python/cuml/tests/dask/test_dask_ordinal_encoder.py create mode 100644 python/cuml/tests/test_ordinal_encoder.py diff --git a/python/cuml/common/doc_utils.py b/python/cuml/common/doc_utils.py index 5421bbb6d3..03054f0664 100644 --- a/python/cuml/common/doc_utils.py +++ b/python/cuml/common/doc_utils.py @@ -94,6 +94,8 @@ " Ignored when return_sparse=False.\n" " If True, values in the inverse transform below this parameter\n" " are clipped to 0.", + None: "{name} : None\n" + " Ignored. This parameter exists for compatibility only.", } _parameter_possible_values = [ @@ -222,7 +224,6 @@ def deco(func): if ( "X" in params or "y" in params or parameters ) and not skip_parameters_heading: - func.__doc__ += "\nParameters\n----------\n" # Check if we want to prepend the parameters diff --git a/python/cuml/dask/common/base.py b/python/cuml/dask/common/base.py index 718056e01c..a9949310be 100644 --- a/python/cuml/dask/common/base.py +++ b/python/cuml/dask/common/base.py @@ -36,6 +36,7 @@ np = cpu_only_import("numpy") +dask_cudf = gpu_only_import("dask_cudf") dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") @@ -343,7 +344,7 @@ def _run_parallel_func( if output_futures: return self.client.compute(preds) else: - output = dask.dataframe.from_delayed(preds) + output = dask_cudf.from_delayed(preds) return output if delayed else output.persist() else: raise ValueError( diff --git a/python/cuml/dask/preprocessing/__init__.py b/python/cuml/dask/preprocessing/__init__.py index 17380238ef..f5959467ae 100644 --- a/python/cuml/dask/preprocessing/__init__.py +++ b/python/cuml/dask/preprocessing/__init__.py @@ -13,12 +13,13 @@ # limitations under the License. 
# +from cuml.dask.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.dask.preprocessing.label import LabelBinarizer -from cuml.dask.preprocessing.encoders import OneHotEncoder from cuml.dask.preprocessing.LabelEncoder import LabelEncoder __all__ = [ "LabelBinarizer", "OneHotEncoder", + "OrdinalEncoder", "LabelEncoder", ] diff --git a/python/cuml/dask/preprocessing/encoders.py b/python/cuml/dask/preprocessing/encoders.py index 0033f89eca..8bf2503578 100644 --- a/python/cuml/dask/preprocessing/encoders.py +++ b/python/cuml/dask/preprocessing/encoders.py @@ -12,23 +12,46 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from dask_cudf.core import Series as daskSeries +from collections.abc import Sequence + from cuml.common import with_cupy_rmm +from cuml.dask.common.base import ( + BaseEstimator, + DelayedInverseTransformMixin, + DelayedTransformMixin, +) +from cuml.internals.safe_imports import gpu_only_import_from, gpu_only_import +from dask_cudf.core import Series as daskSeries +from toolz import first -from cuml.dask.common.base import BaseEstimator -from cuml.dask.common.base import DelayedTransformMixin -from cuml.dask.common.base import DelayedInverseTransformMixin +dask_cudf = gpu_only_import("dask_cudf") +dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") -from toolz import first -from collections.abc import Sequence -from cuml.internals.safe_imports import gpu_only_import_from +class DelayedFitTransformMixin: + def fit_transform(self, X, delayed=True): + """Fit the encoder to X, then transform X. Equivalent to fit(X).transform(X). -dcDataFrame = gpu_only_import_from("dask_cudf.core", "DataFrame") + Parameters + ---------- + X : Dask cuDF DataFrame or CuPy backed Dask Array + The data to encode. + delayed : bool (default = True) + Whether to execute as a delayed task or eager. + + Returns + ------- + out : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the transformed data + """ + return self.fit(X).transform(X, delayed=delayed) class OneHotEncoder( - BaseEstimator, DelayedTransformMixin, DelayedInverseTransformMixin + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, ): """ Encode categorical features as a one-hot numeric array. @@ -83,13 +106,9 @@ class OneHotEncoder( will be denoted as None. """ - def __init__(self, *, client=None, verbose=False, **kwargs): - super().__init__(client=client, verbose=verbose, **kwargs) - @with_cupy_rmm def fit(self, X): - """ - Fit a multi-node multi-gpu OneHotEncoder to X. + """Fit a multi-node multi-gpu OneHotEncoder to X. Parameters ---------- @@ -111,10 +130,9 @@ def fit(self, X): return self - def fit_transform(self, X, delayed=True): - """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). + @with_cupy_rmm + def transform(self, X, delayed=True): + """Transform X using one-hot encoding. Parameters ---------- @@ -126,52 +144,137 @@ def fit_transform(self, X, delayed=True): Returns ------- out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed data + Distributed object containing the transformed input. """ - return self.fit(X).transform(X, delayed=delayed) + return self._transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=self._get_internal_model().dtype, + output_collection_type="cupy", + ) @with_cupy_rmm - def transform(self, X, delayed=True): - """ - Transform X using one-hot encoding. 
+ def inverse_transform(self, X, delayed=True): + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. Parameters ---------- - X : Dask cuDF DataFrame or CuPy backed Dask Array - The data to encode. + X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] + The transformed data. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - out : Dask cuDF DataFrame or CuPy backed Dask Array - Distributed object containing the transformed input. + X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + Distributed object containing the inverse transformed array. + """ + dtype = self._get_internal_model().dtype + return self._inverse_transform( + X, + n_dims=2, + delayed=delayed, + output_dtype=dtype, + output_collection_type=self.datatype, + ) + + +class OrdinalEncoder( + BaseEstimator, + DelayedTransformMixin, + DelayedInverseTransformMixin, + DelayedFitTransformMixin, +): + """Encode categorical features as an integer array. + + The input to this transformer should be an :py:class:`dask_cudf.DataFrame` or a + :py:class:`dask.array.Array` backed by cupy, denoting the unique values taken on by + categorical (discrete) features. The features are converted to ordinal + integers. This results in a single column of integers (0 to n_categories - 1) per + feature. + + Parameters + ---------- + categories : :py:class:`cupy.ndarray` or :py:class`cudf.DataFrameq, default='auto' + Categories (unique values) per feature. All categories are expected to + fit on one GPU. + - 'auto' : Determine categories automatically from the training data. + - DataFrame/ndarray : ``categories[col]`` holds the categories expected + in the feature col. + handle_unknown : {'error', 'ignore'}, default='error' + Whether to raise an error or ignore if an unknown categorical feature is + present during transform (default is to raise). When this parameter is set + to 'ignore' and an unknown category is encountered during transform, the + resulting encoded value would be null when output type is cudf + dataframe. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. See + :ref:`verbosity-levels` for more info. + """ + + @with_cupy_rmm + def fit(self, X): + """Fit Ordinal to X. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or a CuPy backed :py:class:`dask.array.Array`. + shape = (n_samples, n_features) The data to determine the categories of each + feature. + + Returns + ------- + self + """ + from cuml.preprocessing.ordinalencoder_mg import OrdinalEncoderMG + + el = first(X) if isinstance(X, Sequence) else X + self.datatype = ( + "cudf" if isinstance(el, (dcDataFrame, daskSeries)) else "cupy" + ) + + self._set_internal_model(OrdinalEncoderMG(**self.kwargs).fit(X)) + + return self + + @with_cupy_rmm + def transform(self, X, delayed=True): + """Transform X using ordinal encoding. + + Parameters + ---------- + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. The data to + encode. + + Returns + ------- + X_out : + Transformed input. """ return self._transform( X, n_dims=2, delayed=delayed, output_dtype=self._get_internal_model().dtype, - output_collection_type="cupy", + output_collection_type=self.datatype, ) @with_cupy_rmm def inverse_transform(self, X, delayed=True): - """ - Convert the data back to the original representation. 
- In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. Parameters ---------- - X : CuPy backed Dask Array, shape [n_samples, n_encoded_features] - The transformed data. + X : :py:class:`dask_cudf.DataFrame` or cupy backed dask array. delayed : bool (default = True) Whether to execute as a delayed task or eager. Returns ------- - X_tr : Dask cuDF DataFrame or CuPy backed Dask Array + X_tr : Distributed object containing the inverse transformed array. """ dtype = self._get_internal_model().dtype diff --git a/python/cuml/preprocessing/__init__.py b/python/cuml/preprocessing/__init__.py index 368c570b09..fc07aba50c 100644 --- a/python/cuml/preprocessing/__init__.py +++ b/python/cuml/preprocessing/__init__.py @@ -16,7 +16,7 @@ from cuml.model_selection import train_test_split from cuml.preprocessing.LabelEncoder import LabelEncoder from cuml.preprocessing.label import LabelBinarizer, label_binarize -from cuml.preprocessing.encoders import OneHotEncoder +from cuml.preprocessing.encoders import OneHotEncoder, OrdinalEncoder from cuml.preprocessing.TargetEncoder import TargetEncoder from cuml.preprocessing import text @@ -63,6 +63,7 @@ "MissingIndicator", "Normalizer", "OneHotEncoder", + "OrdinalEncoder", "PolynomialFeatures", "PowerTransformer", "QuantileTransformer", diff --git a/python/cuml/preprocessing/encoders.py b/python/cuml/preprocessing/encoders.py index 32a8defc69..272655b552 100644 --- a/python/cuml/preprocessing/encoders.py +++ b/python/cuml/preprocessing/encoders.py @@ -13,31 +13,165 @@ # limitations under the License. # import warnings +from typing import List, Optional, TypeVar + import cuml.internals.logger as logger -from cuml.internals.safe_imports import gpu_only_import_from from cudf import DataFrame, Series -from cuml.preprocessing import LabelEncoder from cuml import Base +from cuml.common.doc_utils import generate_docstring from cuml.common.exceptions import NotFittedError -from cuml.internals.safe_imports import gpu_only_import -from cuml.internals.safe_imports import cpu_only_import +from cuml.internals.safe_imports import ( + cpu_only_import, + gpu_only_import, + gpu_only_import_from, +) +from cuml.preprocessing import LabelEncoder np = cpu_only_import("numpy") +cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") cupyx = gpu_only_import("cupyx") GenericIndex = gpu_only_import_from("cudf", "GenericIndex") -class OneHotEncoder(Base): +class CheckFeaturesMixIn: + def _check_n_features(self, X, reset: bool = False): + n_features = X.shape[1] + if reset: + self.n_features_in_ = n_features + if hasattr(X, "columns"): + self.feature_names_in_ = [str(c) for c in X.columns] + else: + if not hasattr(self, "n_features_in_"): + raise RuntimeError( + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" + ) + if n_features != self.n_features_in_: + raise ValueError( + "X has {} features, but this {} is expecting {} features " + "as input.".format( + n_features, + self.__class__.__name__, + self.n_features_in_, + ) + ) + + +class BaseEncoder(Base, CheckFeaturesMixIn): + """Base implementation for encoding categorical values, uses + :py:class:`~cuml.preprocessing.LabelEncoder` for obtaining unique values. + + Parameters + ---------- + + handle : cuml.Handle + Specifies the cuml.handle that holds internal CUDA state for + computations in this model. 
Most importantly, this specifies the CUDA + stream that will be used for the model's computations, so users can + run different models concurrently in different streams by creating + handles in several streams. + If it is None, a new one is created. + verbose : int or boolean, default=False + Sets logging level. It must be one of `cuml.common.logger.level_*`. + See :ref:`verbosity-levels` for more info. + output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \ + 'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None + Return results and set estimator attributes to the indicated output + type. If None, the output type set at the module level + (`cuml.global_settings.output_type`) will be used. See + :ref:`output-data-type-configuration` for more info. + """ + + def _set_input_type(self, value): + if self.input_type is None: + self.input_type = value + + def _check_input(self, X, is_categories=False): + """If input is cupy, convert it to a DataFrame with 0 copies.""" + if isinstance(X, cp.ndarray): + self._set_input_type("array") + if is_categories: + X = X.transpose() + return DataFrame(X) + else: + self._set_input_type("df") + return X + + def _check_input_fit(self, X, is_categories=False): + """Helper function used in fit, can be overridden in subclasses.""" + self._check_n_features(X, reset=True) + return self._check_input(X, is_categories=is_categories) + + def _unique(self, inp): + """Helper function used in fit. Can be overridden in subclasses.""" + + # Default implementation passes input through directly since this is + # performed in `LabelEncoder.fit()` + return inp + + def _fit(self, X, need_drop: bool): + X = self._check_input_fit(X) + if type(self.categories) is str and self.categories == "auto": + self._features = X.columns + self._encoders = { + feature: LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ).fit(self._unique(X[feature])) + for feature in self._features + } + else: + self.categories = self._check_input_fit(self.categories, True) + self._features = self.categories.columns + if len(self._features) != X.shape[1]: + raise ValueError( + "Shape mismatch: if categories is not 'auto'," + " it has to be of shape (n_features, _)." + ) + self._encoders = dict() + for feature in self._features: + le = LabelEncoder( + handle=self.handle, + verbose=self.verbose, + output_type=self.output_type, + handle_unknown=self.handle_unknown, + ) + + self._encoders[feature] = le.fit(self.categories[feature]) + + if self.handle_unknown == "error": + if self._has_unknown( + X[feature], self._encoders[feature].classes_ + ): + msg = ( + "Found unknown categories in column {0}" + " during fit".format(feature) + ) + raise KeyError(msg) + + if need_drop: + self.drop_idx_ = self._compute_drop_idx() + self._fitted = True + + @property + def categories_(self): + """Returns categories used for the one hot encoding in the correct order.""" + return [self._encoders[f].classes_ for f in self._features] + + +class OneHotEncoder(BaseEncoder): """ Encode categorical features as a one-hot numeric array. - The input to this estimator should be a cuDF.DataFrame or a cupy.ndarray, - denoting the unique values taken on by categorical (discrete) features. - The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') - encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array (depending on the ``sparse`` - parameter). 
+ The input to this estimator should be a :py:class:`cuDF.DataFrame` or a + :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical + (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or + 'dummy') encoding scheme. This creates a binary column for each category and returns + a sparse matrix or dense array (depending on the ``sparse`` parameter). + By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. @@ -105,7 +239,6 @@ class OneHotEncoder(Base): ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to be dropped for each feature. None if all the transformed features will be retained. - """ def __init__( @@ -165,7 +298,7 @@ def _check_is_fitted(self): raise NotFittedError(msg) def _compute_drop_idx(self): - """Helper to compute indices to drop from category to drop""" + """Helper to compute indices to drop from category to drop.""" if self.drop is None: return None elif isinstance(self.drop, str) and self.drop == "first": @@ -209,141 +342,46 @@ def _compute_drop_idx(self): ) raise ValueError(msg.format(type(self.drop))) - @property - def categories_(self): - """ - Returns categories used for the one hot encoding in the correct order. - """ - return [self._encoders[f].classes_ for f in self._features] - - def _set_input_type(self, value): - if self.input_type is None: - self.input_type = value - - def _check_input(self, X, is_categories=False): - """ - If input is cupy, convert it to a DataFrame with 0 copies - """ - if isinstance(X, cp.ndarray): - self._set_input_type("array") - if is_categories: - X = X.transpose() - return DataFrame(X) - else: - self._set_input_type("df") - return X - def _check_input_fit(self, X, is_categories=False): """Helper function used in fit. Can be overridden in subclasses.""" return self._check_input(X, is_categories=is_categories) - def _unique(self, inp): - """Helper function used in fit. Can be overridden in subclasses.""" - - # Default implementation passes input through directly since this is - # performed in `LabelEncoder.fit()` - return inp - def _has_unknown(self, X_cat, encoder_cat): - """Check if X_cat has categories that are not present in encoder_cat""" + """Check if X_cat has categories that are not present in encoder_cat.""" return not X_cat.isin(encoder_cat).all() + @generate_docstring(y=None) def fit(self, X, y=None): - """ - Fit OneHotEncoder to X. - - Parameters - ---------- - X : cuDF.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to determine the categories of each feature. - y : None - Ignored. This parameter exists for compatibility only. - - Returns - ------- - self - - """ + """Fit OneHotEncoder to X.""" self._validate_keywords() - X = self._check_input_fit(X) - if type(self.categories) is str and self.categories == "auto": - self._features = X.columns - self._encoders = { - feature: LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ).fit(self._unique(X[feature])) - for feature in self._features - } - else: - self.categories = self._check_input_fit(self.categories, True) - self._features = self.categories.columns - if len(self._features) != X.shape[1]: - raise ValueError( - "Shape mismatch: if categories is not 'auto'," - " it has to be of shape (n_features, _)." 
- ) - self._encoders = dict() - for feature in self._features: - - le = LabelEncoder( - handle=self.handle, - verbose=self.verbose, - output_type=self.output_type, - handle_unknown=self.handle_unknown, - ) - - self._encoders[feature] = le.fit(self.categories[feature]) - - if self.handle_unknown == "error": - if self._has_unknown( - X[feature], self._encoders[feature].classes_ - ): - msg = ( - "Found unknown categories in column {0}" - " during fit".format(feature) - ) - raise KeyError(msg) - - self.drop_idx_ = self._compute_drop_idx() - self._fitted = True + self._fit(X, True) return self + @generate_docstring( + y=None, + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + }, + ) def fit_transform(self, X, y=None): """ - Fit OneHotEncoder to X, then transform X. - Equivalent to fit(X).transform(X). - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray, shape = (n_samples, n_features) - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. + Fit OneHotEncoder to X, then transform X. Equivalent to fit(X).transform(X). """ X = self._check_input(X) return self.fit(X).transform(X) + @generate_docstring( + return_values={ + "name": "X_out", + "description": "Transformed input.", + "type": "sparse matrix if sparse=True else a 2-d array", + } + ) def transform(self, X): - """ - Transform X using one-hot encoding. - - Parameters - ---------- - X : cudf.DataFrame or cupy.ndarray - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ + """Transform X using one-hot encoding.""" self._check_is_fitted() X = self._check_input(X) @@ -425,10 +463,9 @@ def transform(self, X): ) def inverse_transform(self, X): - """ - Convert the data back to the original representation. - In case unknown categories are encountered (all zeros in the - one-hot encoding), ``None`` is used to represent this category. + """Convert the data back to the original representation. In case unknown + categories are encountered (all zeros in the one-hot encoding), ``None`` is used + to represent this category. The return type is the same as the type of the input used by the first call to fit on this estimator instance. @@ -544,3 +581,165 @@ def get_param_names(self): "dtype", "handle_unknown", ] + + +def _slice_feat(X, i): + if hasattr(X, "iloc"): + return X[i] + return X[:, i] + + +def _get_output( + output_type: Optional[str], + input_type: Optional[str], + out: DataFrame, + dtype, +): + if output_type == "input": + if input_type == "array": + output_type = "cupy" + elif input_type == "df": + output_type = "cudf" + + if output_type is None: + output_type = "cupy" + + if output_type == "cudf": + return out + elif output_type == "cupy": + return out.astype(dtype).to_cupy(na_value=np.nan) + elif output_type == "numpy": + return cp.asnumpy(out.to_cupy(na_value=np.nan, dtype=dtype)) + elif output_type == "pandas": + return out.to_pandas() + else: + raise ValueError("Unsupported output type.") + + +class OrdinalEncoder(BaseEncoder): + def __init__( + self, + *, + categories="auto", + dtype=np.float64, + handle_unknown="error", + handle=None, + verbose=False, + output_type=None, + ) -> None: + """Encode categorical features as an integer array. 
+
+        The input to this transformer should be a :py:class:`cudf.DataFrame` or a
+        :py:class:`cupy.ndarray`, denoting the unique values taken on by categorical
+        (discrete) features. The features are converted to ordinal integers. This
+        results in a single column of integers (0 to n_categories - 1) per feature.
+
+        Parameters
+        ----------
+        categories : 'auto', a cupy.ndarray or a cudf.DataFrame, default='auto'
+            Categories (unique values) per feature:
+            - 'auto' : Determine categories automatically from the training data.
+            - DataFrame/ndarray : ``categories[col]`` holds the categories expected
+              in the feature col.
+        handle_unknown : {'error', 'ignore'}, default='error'
+            Whether to raise an error or ignore if an unknown categorical feature is
+            present during transform (default is to raise). When this parameter is set
+            to 'ignore' and an unknown category is encountered during transform, the
+            resulting encoded value will be null when the output type is a cudf
+            dataframe.
+        handle : cuml.Handle
+            Specifies the cuml.handle that holds internal CUDA state for computations in
+            this model. Most importantly, this specifies the CUDA stream that will be
+            used for the model's computations, so users can run different models
+            concurrently in different streams by creating handles in several streams.
+
+            If it is None, a new one is created.
+        verbose : int or boolean, default=False
+            Sets logging level. It must be one of `cuml.common.logger.level_*`. See
+            :ref:`verbosity-levels` for more info.
+        output_type : {'input', 'array', 'dataframe', 'series', 'df_obj', \
+            'numba', 'cupy', 'numpy', 'cudf', 'pandas'}, default=None
+            Return results and set estimator attributes to the indicated output
+            type. If None, the output type set at the module level
+            (`cuml.global_settings.output_type`) will be used. See
+            :ref:`output-data-type-configuration` for more info.
+        """
+        super().__init__(
+            handle=handle, verbose=verbose, output_type=output_type
+        )
+
+        self.categories = categories
+        self.dtype = dtype
+        self.handle_unknown = handle_unknown
+
+        self.input_type = None
+
+    @generate_docstring(y=None)
+    def fit(self, X, y=None) -> "OrdinalEncoder":
+        """Fit OrdinalEncoder to X."""
+        self._fit(X, need_drop=False)
+        return self
+
+    @generate_docstring(
+        return_values={
+            "name": "X_out",
+            "description": "Transformed input.",
+            "type": "Type is specified by the `output_type` parameter.",
+        }
+    )
+    def transform(self, X):
+        """Transform X using ordinal encoding."""
+        self._check_n_features(X, reset=False)
+
+        result = {}
+        for feature in self._features:
+            Xi = _slice_feat(X, feature)
+            col_idx = self._encoders[feature].transform(Xi)
+            result[feature] = col_idx
+
+        r = DataFrame(result)
+        return _get_output(self.output_type, self.input_type, r, self.dtype)
+
+    @generate_docstring(
+        y=None,
+        return_values={
+            "name": "X_out",
+            "description": "Transformed input.",
+            "type": "Type is specified by the `output_type` parameter.",
+        },
+    )
+    def fit_transform(self, X, y=None):
+        """Fit OrdinalEncoder to X, then transform X. Equivalent to fit(X).transform(X)."""
+        X = self._check_input(X)
+        return self.fit(X).transform(X)
+
+    def inverse_transform(self, X):
+        """Convert the data back to the original representation.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
+            The transformed data.
+
+        Returns
+        -------
+        X_tr : Type is specified by the `output_type` parameter.
+            Inverse transformed array. 
+ """ + self._check_n_features(X, reset=False) + + result = {} + for feature in self._features: + Xi = _slice_feat(X, feature) + inv = self._encoders[feature].inverse_transform(Xi) + result[feature] = inv + + r = DataFrame(result) + return _get_output(self.output_type, self.input_type, r, self.dtype) + + def get_param_names(self): + return super().get_param_names() + [ + "categories", + "dtype", + "handle_unknown", + ] diff --git a/python/cuml/preprocessing/ordinalencoder_mg.py b/python/cuml/preprocessing/ordinalencoder_mg.py new file mode 100644 index 0000000000..8b47f67819 --- /dev/null +++ b/python/cuml/preprocessing/ordinalencoder_mg.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import cupy as cp +import dask +from cuml.dask.common.dask_arr_utils import to_dask_cudf +from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from +from cuml.preprocessing.encoders import OrdinalEncoder + +cp = gpu_only_import("cupy") +DataFrame = gpu_only_import_from("cudf", "DataFrame") + + +class OrdinalEncoderMG(OrdinalEncoder): + def __init__(self, *, client=None, **kwargs): + super().__init__(**kwargs) + self.client = client + + def _check_input_fit(self, X, is_categories=False): + """Helper function to check input of fit within the multi-gpu model""" + if isinstance(X, (dask.array.core.Array, cp.ndarray)): + self._set_input_type("array") + if is_categories: + X = X.transpose() + if isinstance(X, cp.ndarray): + return DataFrame(X) + else: + return to_dask_cudf(X, client=self.client) + else: + self._set_input_type("df") + return X + + def _unique(self, inp): + return inp.unique().compute() + + def _has_unknown(self, X_cat, encoder_cat): + return not X_cat.isin(encoder_cat).all().compute() diff --git a/python/cuml/tests/dask/test_dask_ordinal_encoder.py b/python/cuml/tests/dask/test_dask_ordinal_encoder.py new file mode 100644 index 0000000000..36b5fa92d3 --- /dev/null +++ b/python/cuml/tests/dask/test_dask_ordinal_encoder.py @@ -0,0 +1,117 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+import cupy as cp
+import dask_cudf
+import numpy as np
+import pandas as pd
+import pytest
+from cudf import DataFrame
+from cuml.dask.preprocessing import OrdinalEncoder
+from distributed import Client
+
+
+@pytest.mark.mg
+def test_ordinal_encoder_df(client: Client) -> None:
+    X = DataFrame({"cat": ["M", "F", "F"], "int": [1, 3, 2]})
+    X = dask_cudf.from_cudf(X, npartitions=2)
+
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    Xt = enc.transform(X)
+
+    X_1 = DataFrame({"cat": ["F", "F"], "int": [1, 2]})
+    X_1 = dask_cudf.from_cudf(X_1, npartitions=2)
+
+    enc = OrdinalEncoder(client=client)
+    enc.fit(X)
+    Xt_1 = enc.transform(X_1)
+
+    Xt_r = Xt.compute()
+    Xt_1_r = Xt_1.compute()
+    assert Xt_1_r.iloc[0, 0] == Xt_r.iloc[1, 0]
+    assert Xt_1_r.iloc[1, 0] == Xt_r.iloc[1, 0]
+    assert Xt_1_r.iloc[0, 1] == Xt_r.iloc[0, 1]
+    assert Xt_1_r.iloc[1, 1] == Xt_r.iloc[2, 1]
+
+    # Turn Int64Index to RangeIndex for testing equality
+    inv_Xt = enc.inverse_transform(Xt).compute().reset_index(drop=True)
+    inv_Xt_1 = enc.inverse_transform(Xt_1).compute().reset_index(drop=True)
+
+    X_r = X.compute()
+    X_1_r = X_1.compute()
+
+    assert inv_Xt.equals(X_r)
+    assert inv_Xt_1.equals(X_1_r)
+
+    assert enc.n_features_in_ == 2
+
+
+@pytest.mark.mg
+def test_ordinal_encoder_array(client: Client) -> None:
+    X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]})
+    X = dask_cudf.from_cudf(X, npartitions=2).values
+
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    Xt = enc.transform(X)
+
+    X_1 = DataFrame({"A": [1, 1], "B": [1, 2]})
+    X_1 = dask_cudf.from_cudf(X_1, npartitions=2).values
+
+    enc = OrdinalEncoder(client=client)
+    enc.fit(X)
+    Xt_1 = enc.transform(X_1)
+
+    Xt_r = Xt.compute()
+    Xt_1_r = Xt_1.compute()
+    assert Xt_1_r[0, 0] == Xt_r[1, 0]
+    assert Xt_1_r[1, 0] == Xt_r[1, 0]
+    assert Xt_1_r[0, 1] == Xt_r[0, 1]
+    assert Xt_1_r[1, 1] == Xt_r[2, 1]
+
+    inv_Xt = enc.inverse_transform(Xt)
+    inv_Xt_1 = enc.inverse_transform(Xt_1)
+
+    cp.testing.assert_allclose(X.compute(), inv_Xt.compute())
+    cp.testing.assert_allclose(X_1.compute(), inv_Xt_1.compute())
+
+    assert enc.n_features_in_ == 2
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"])
+def test_handle_unknown(client, as_array: bool) -> None:
+    X = DataFrame({"data": [0, 1]})
+    Y = DataFrame({"data": [3, 1]})
+
+    X = dask_cudf.from_cudf(X, npartitions=2)
+    Y = dask_cudf.from_cudf(Y, npartitions=2)
+
+    if as_array:
+        X = X.values
+        Y = Y.values
+
+    enc = OrdinalEncoder(handle_unknown="error")
+    enc = enc.fit(X)
+    with pytest.raises(KeyError):
+        enc.transform(Y).compute()
+
+    enc = OrdinalEncoder(handle_unknown="ignore")
+    enc = enc.fit(X)
+    encoded = enc.transform(Y).compute()
+    if as_array:
+        assert np.isnan(encoded[0, 0])
+    else:
+        assert pd.isna(encoded.iloc[0, 0])
diff --git a/python/cuml/tests/test_ordinal_encoder.py b/python/cuml/tests/test_ordinal_encoder.py
new file mode 100644
index 0000000000..c9379a43be
--- /dev/null
+++ b/python/cuml/tests/test_ordinal_encoder.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cupy as cp
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.preprocessing import OrdinalEncoder as skOrdinalEncoder
+
+from cuml.internals.safe_imports import gpu_only_import_from
+from cuml.preprocessing import OrdinalEncoder
+
+DataFrame = gpu_only_import_from("cudf", "DataFrame")
+
+
+@pytest.fixture
+def test_sample():
+    X = DataFrame({"cat": ["M", "F", "F"], "num": [1, 3, 2]})
+    return X
+
+
+def test_ordinal_encoder_df(test_sample) -> None:
+    X = test_sample
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    Xt = enc.transform(X)
+
+    X_1 = DataFrame({"cat": ["F", "F"], "num": [1, 2]})
+    Xt_1 = enc.transform(X_1)
+
+    assert Xt_1.iloc[0, 0] == Xt.iloc[1, 0]
+    assert Xt_1.iloc[1, 0] == Xt.iloc[1, 0]
+    assert Xt_1.iloc[0, 1] == Xt.iloc[0, 1]
+    assert Xt_1.iloc[1, 1] == Xt.iloc[2, 1]
+
+    inv_Xt = enc.inverse_transform(Xt)
+    inv_Xt_1 = enc.inverse_transform(Xt_1)
+
+    assert inv_Xt.equals(X)
+    assert inv_Xt_1.equals(X_1)
+
+    assert enc.n_features_in_ == 2
+
+
+def test_ordinal_encoder_array() -> None:
+    X = DataFrame({"A": [4, 1, 1], "B": [1, 3, 2]}).values
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    Xt = enc.transform(X)
+
+    X_1 = DataFrame({"A": [1, 1], "B": [1, 2]}).values
+    Xt_1 = enc.transform(X_1)
+
+    assert Xt_1[0, 0] == Xt[1, 0]
+    assert Xt_1[1, 0] == Xt[1, 0]
+    assert Xt_1[0, 1] == Xt[0, 1]
+    assert Xt_1[1, 1] == Xt[2, 1]
+
+    inv_Xt = enc.inverse_transform(Xt)
+    inv_Xt_1 = enc.inverse_transform(Xt_1)
+
+    cp.testing.assert_allclose(X, inv_Xt)
+    cp.testing.assert_allclose(X_1, inv_Xt_1)
+
+    assert enc.n_features_in_ == 2
+
+
+def test_ordinal_array() -> None:
+    X = cp.arange(32).reshape(16, 2)
+
+    enc = OrdinalEncoder()
+    enc.fit(X)
+    Xt = enc.transform(X)
+
+    Xh = cp.asnumpy(X)
+    skenc = skOrdinalEncoder()
+    skenc.fit(Xh)
+    Xt_sk = skenc.transform(Xh)
+
+    cp.testing.assert_allclose(Xt, Xt_sk)
+
+
+def test_output_type(test_sample) -> None:
+    X = test_sample
+    enc = OrdinalEncoder(output_type="cupy").fit(X)
+    assert isinstance(enc.transform(X), cp.ndarray)
+    enc = OrdinalEncoder(output_type="cudf").fit(X)
+    assert isinstance(enc.transform(X), DataFrame)
+    enc = OrdinalEncoder(output_type="pandas").fit(X)
+    assert isinstance(enc.transform(X), pd.DataFrame)
+    enc = OrdinalEncoder(output_type="numpy").fit(X)
+    assert isinstance(enc.transform(X), np.ndarray)
+    # output_type == "input"
+    enc = OrdinalEncoder().fit(X)
+    assert isinstance(enc.transform(X), DataFrame)
+
+
+def test_feature_names(test_sample) -> None:
+    enc = OrdinalEncoder().fit(test_sample)
+    assert enc.feature_names_in_ == ["cat", "num"]
+
+
+@pytest.mark.parametrize("as_array", [True, False], ids=["cupy", "cudf"])
+def test_handle_unknown(as_array: bool) -> None:
+    X = DataFrame({"data": [0, 1]})
+    Y = DataFrame({"data": [3, 1]})
+
+    if as_array:
+        X = X.values
+        Y = Y.values
+
+    enc = OrdinalEncoder(handle_unknown="error")
+    enc = enc.fit(X)
+    with pytest.raises(KeyError):
+        enc.transform(Y)
+
+    enc = OrdinalEncoder(handle_unknown="ignore")
+    enc = enc.fit(X)
+    encoded = enc.transform(Y)
+    if as_array:
+        assert np.isnan(encoded[0, 0])
+    else:
+        assert pd.isna(encoded.iloc[0, 0])

From 197d4f3f8183502752a5966a60f02d0a8be546ee Mon Sep 17 00:00:00 2001
From: Jinfeng Li
Date: Tue, 28 Nov 2023 13:01:46 -0800
Subject: [PATCH 13/18] [LogisticRegressionMG] Support sparse vectors (#5632)

Authors:
  - Jinfeng Li (https://github.com/lijinf2)
  - Dante Gama Dessavre (https://github.com/dantegd)

Approvers:
  - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuml/pull/5632 --- cpp/include/cuml/linear_model/qn_mg.hpp | 31 ++++ cpp/src/glm/qn/mg/glm_base_mg.cuh | 21 ++- cpp/src/glm/qn_mg.cu | 73 +++++++++ .../dask/linear_model/logistic_regression.py | 18 ++- python/cuml/linear_model/base_mg.pyx | 55 +++++-- .../linear_model/logistic_regression_mg.pyx | 61 ++++++-- .../dask/test_dask_logistic_regression.py | 140 +++++++++++++++++- 7 files changed, 365 insertions(+), 34 deletions(-) diff --git a/cpp/include/cuml/linear_model/qn_mg.hpp b/cpp/include/cuml/linear_model/qn_mg.hpp index f70fd833e9..21d35584be 100644 --- a/cpp/include/cuml/linear_model/qn_mg.hpp +++ b/cpp/include/cuml/linear_model/qn_mg.hpp @@ -63,6 +63,37 @@ void qnFit(raft::handle_t& handle, float* f, int* num_iters); +/** + * @brief support sparse vectors (Compressed Sparse Row format) for MNMG logistic regression fit + * using quasi newton methods + * @param[in] handle: the internal cuml handle object + * @param[in] input_values: vector holding non-zero values of all partitions for that rank + * @param[in] input_cols: vector holding column indices of non-zero values of all partitions for + * that rank + * @param[in] input_row_ids: vector holding row pointers of non-zero values of all partitions for + * that rank + * @param[in] X_nnz: the number of non-zero values of that rank + * @param[in] input_desc: PartDescriptor object for the input + * @param[in] labels: labels data + * @param[out] coef: learned coefficients + * @param[in] pams: model parameters + * @param[in] n_classes: number of outputs (number of classes or `1` for regression) + * @param[out] f: host pointer holding the final objective value + * @param[out] num_iters: host pointer holding the actual number of iterations taken + */ +void qnFitSparse(raft::handle_t& handle, + std::vector*>& input_values, + int* input_cols, + int* input_row_ids, + int X_nnz, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + float* coef, + const qn_params& pams, + int n_classes, + float* f, + int* num_iters); + }; // namespace opg }; // namespace GLM }; // namespace ML diff --git a/cpp/src/glm/qn/mg/glm_base_mg.cuh b/cpp/src/glm/qn/mg/glm_base_mg.cuh index 977e79f0f4..094d7197b6 100644 --- a/cpp/src/glm/qn/mg/glm_base_mg.cuh +++ b/cpp/src/glm/qn/mg/glm_base_mg.cuh @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -112,34 +113,42 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData { T* dev_scalar, cudaStream_t stream) { + raft::comms::comms_t const& communicator = raft::resource::get_comms(*(this->handle_p)); SimpleDenseMat W(wFlat.data, this->C, this->dims); SimpleDenseMat G(gradFlat.data, this->C, this->dims); SimpleVec lossVal(dev_scalar, 1); + // Ensure the same coefficients on all GPU + communicator.bcast(wFlat.data, this->C * this->dims, 0, stream); + communicator.sync_stream(stream); + // apply regularization auto regularizer_obj = this->objective; auto lossFunc = regularizer_obj->loss; auto reg = regularizer_obj->reg; G.fill(0, stream); - float reg_host = 0; + T reg_host = 0; if (reg->l2_penalty != 0) { reg->reg_grad(dev_scalar, G, W, lossFunc->fit_intercept, stream); raft::update_host(®_host, dev_scalar, 1, stream); - // note: avoid syncing here because there's a sync before reg_host is used. 
+ raft::resource::sync_stream(*(this->handle_p)); } // apply linearFwd, getLossAndDz, linearBwd ML::GLM::detail::linearFwd( lossFunc->handle, *(this->Z), *(this->X), W); // linear part: forward pass - raft::comms::comms_t const& communicator = raft::resource::get_comms(*(this->handle_p)); - lossFunc->getLossAndDZ(dev_scalar, *(this->Z), *(this->y), stream); // loss specific part // normalize local loss before allreduce sum T factor = 1.0 * (*this->y).len / this->n_samples; raft::linalg::multiplyScalar(dev_scalar, dev_scalar, factor, 1, stream); + // GPUs calculates reg_host independently and may get values that show tiny divergence. + // Take the averaged reg_host to avoid the divergence. + T reg_factor = reg_host / this->n_ranks; + raft::linalg::addScalar(dev_scalar, dev_scalar, reg_factor, 1, stream); + communicator.allreduce(dev_scalar, dev_scalar, 1, raft::comms::op_t::SUM, stream); communicator.sync_stream(stream); @@ -154,11 +163,9 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData { communicator.allreduce(G.data, G.data, this->C * this->dims, raft::comms::op_t::SUM, stream); communicator.sync_stream(stream); - float loss_host; + T loss_host; raft::update_host(&loss_host, dev_scalar, 1, stream); raft::resource::sync_stream(*(this->handle_p)); - loss_host += reg_host; - lossVal.fill(loss_host + reg_host, stream); return loss_host; } diff --git a/cpp/src/glm/qn_mg.cu b/cpp/src/glm/qn_mg.cu index 5a60c01f79..ee75316a18 100644 --- a/cpp/src/glm/qn_mg.cu +++ b/cpp/src/glm/qn_mg.cu @@ -29,6 +29,8 @@ #include using namespace MLCommon; +#include + namespace ML { namespace GLM { namespace opg { @@ -172,6 +174,77 @@ void qnFit(raft::handle_t& handle, handle, input_data, input_desc, labels, coef, pams, X_col_major, n_classes, f, num_iters); } +template +void qnFitSparse_impl(const raft::handle_t& handle, + const qn_params& pams, + T* X_values, + I* X_cols, + I* X_row_ids, + I X_nnz, + T* y, + size_t N, + size_t D, + size_t C, + T* w0, + T* f, + int* num_iters, + size_t n_samples, + int rank, + int n_ranks) +{ + auto X_simple = SimpleSparseMat(X_values, X_cols, X_row_ids, X_nnz, N, D); + + ML::GLM::opg::qn_fit_x_mg(handle, + pams, + X_simple, + y, + C, + w0, + f, + num_iters, + n_samples, + rank, + n_ranks); // ignore sample_weight, svr_eps + return; +} + +void qnFitSparse(raft::handle_t& handle, + std::vector*>& input_values, + int* input_cols, + int* input_row_ids, + int X_nnz, + Matrix::PartDescriptor& input_desc, + std::vector*>& labels, + float* coef, + const qn_params& pams, + int n_classes, + float* f, + int* num_iters) +{ + RAFT_EXPECTS(input_values.size() == 1, + "qn_mg.cu currently does not accept more than one input matrix"); + + auto data_input_values = input_values[0]; + auto data_y = labels[0]; + + qnFitSparse_impl(handle, + pams, + data_input_values->ptr, + input_cols, + input_row_ids, + X_nnz, + data_y->ptr, + input_desc.totalElementsOwnedBy(input_desc.rank), + input_desc.N, + n_classes, + coef, + f, + num_iters, + input_desc.M, + input_desc.rank, + input_desc.uniqueRanks().size()); +} + }; // namespace opg }; // namespace GLM }; // namespace ML diff --git a/python/cuml/dask/linear_model/logistic_regression.py b/python/cuml/dask/linear_model/logistic_regression.py index 38366a1b50..af53d509b1 100644 --- a/python/cuml/dask/linear_model/logistic_regression.py +++ b/python/cuml/dask/linear_model/logistic_regression.py @@ -21,6 +21,7 @@ from raft_dask.common.comms import get_raft_comm_state from dask.distributed import get_worker +from cuml.common.sparse_utils import is_sparse, 
has_scipy from cuml.dask.common import parts_to_ranks from cuml.dask.common.input_utils import DistributedDataHandler, concatenate from raft_dask.common.comms import Comms @@ -29,7 +30,9 @@ from cuml.internals.safe_imports import gpu_only_import cp = gpu_only_import("cupy") +cupyx = gpu_only_import("cupyx") np = cpu_only_import("numpy") +scipy = cpu_only_import("scipy") class LogisticRegression(LinearRegression): @@ -172,7 +175,20 @@ def _create_model(sessionId, datatype, **kwargs): @staticmethod def _func_fit(f, data, n_rows, n_cols, partsToSizes, rank): - inp_X = concatenate([X for X, _ in data]) + if is_sparse(data[0][0]) is False: + inp_X = concatenate([X for X, _ in data]) + + elif has_scipy() and scipy.sparse.isspmatrix(data[0][0]): + inp_X = scipy.sparse.vstack([X for X, _ in data]) + + elif cupyx.scipy.sparse.isspmatrix(data[0][0]): + inp_X = cupyx.scipy.sparse.vstack([X for X, _ in data]) + + else: + raise ValueError( + "input matrix must be dense, scipy sparse, or cupy sparse" + ) + inp_y = concatenate([y for _, y in data]) n_ranks = max([p[0] for p in partsToSizes]) + 1 aggregated_partsToSizes = [[i, 0] for i in range(n_ranks)] diff --git a/python/cuml/linear_model/base_mg.pyx b/python/cuml/linear_model/base_mg.pyx index c13d0d2de1..3dddb74f6c 100644 --- a/python/cuml/linear_model/base_mg.pyx +++ b/python/cuml/linear_model/base_mg.pyx @@ -30,6 +30,9 @@ from cuml.common.opg_data_utils_mg cimport * from cuml.internals.input_utils import input_to_cuml_array from cuml.decomposition.utils cimport * +from cuml.common.sparse_utils import is_sparse +from cuml.internals.array_sparse import SparseCumlArray + class MGFitMixin(object): @@ -45,8 +48,10 @@ class MGFitMixin(object): :param partsToSizes: array of tuples in the format: [(rank,size)] :return: self """ + self._set_output_type(input_data[0][0]) self._set_n_features_in(n_cols) + sparse_input = is_sparse(input_data[0][0]) X_arys = [] y_arys = [] @@ -57,8 +62,14 @@ class MGFitMixin(object): else: check_dtype = self.dtype - X_m, _, self.n_cols, _ = \ - input_to_cuml_array(input_data[i][0], check_dtype=check_dtype, order=order) + if sparse_input: + + X_m = SparseCumlArray(input_data[i][0], convert_index=np.int32) + _, self.n_cols = X_m.shape + else: + X_m, _, self.n_cols, _ = \ + input_to_cuml_array(input_data[i][0], check_dtype=check_dtype, order=order) + X_arys.append(X_m) if i == 0: @@ -81,18 +92,42 @@ class MGFitMixin(object): rank_to_sizes, rank) - cdef uintptr_t X_arg = opg.build_data_t(X_arys) + cdef uintptr_t X_arg cdef uintptr_t y_arg = opg.build_data_t(y_arys) - # call inheriting class _fit that does all cython pointers and calls - self._fit(X=X_arg, - y=y_arg, - coef_ptr=coef_ptr_arg, - input_desc=part_desc) + cdef uintptr_t X_cols + cdef uintptr_t X_row_ids + + if sparse_input is False: + + X_arg = opg.build_data_t(X_arys) + + # call inheriting class _fit that does all cython pointers and calls + self._fit(X=X_arg, + y=y_arg, + coef_ptr=coef_ptr_arg, + input_desc=part_desc) + + opg.free_data_t(X_arg, self.dtype) + + else: + + assert len(X_arys) == 1, "does not support more than one sparse input matrix" + X_arg = opg.build_data_t([x.data for x in X_arys]) + X_cols = X_arys[0].indices.ptr + X_row_ids = X_arys[0].indptr.ptr + X_nnz = sum([x.nnz for x in X_arys]) + + # call inheriting class _fit that does all cython pointers and calls + self._fit(X=[X_arg, X_cols, X_row_ids, X_nnz], + y=y_arg, + coef_ptr=coef_ptr_arg, + input_desc=part_desc) + + for ary in X_arys: + del ary opg.free_rank_size_pair(rank_to_sizes) 
opg.free_part_descriptor(part_desc) - opg.free_data_t(X_arg, self.dtype) opg.free_data_t(y_arg, self.dtype) - return self diff --git a/python/cuml/linear_model/logistic_regression_mg.pyx b/python/cuml/linear_model/logistic_regression_mg.pyx index 3330541b32..2e96851dfa 100644 --- a/python/cuml/linear_model/logistic_regression_mg.pyx +++ b/python/cuml/linear_model/logistic_regression_mg.pyx @@ -84,6 +84,20 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil: PartDescriptor &input_desc, vector[floatData_t*] labels) except+ + cdef void qnFitSparse( + handle_t& handle, + vector[floatData_t *] input_values, + int *input_cols, + int *input_row_ids, + int X_nnz, + PartDescriptor &input_desc, + vector[floatData_t *] labels, + float *coef, + const qn_params& pams, + int n_classes, + float *f, + int *num_iters) except + + class LogisticRegressionMG(MGFitMixin, LogisticRegression): @@ -171,6 +185,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): def fit(self, input_data, n_rows, n_cols, parts_rank_size, rank, convert_dtype=False): + self.rank = rank assert len(input_data) == 1, f"Currently support only one (X, y) pair in the list. Received {len(input_data)} pairs." self.is_col_major = False order = 'F' if self.is_col_major else 'C' @@ -196,18 +211,42 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): cdef qn_params qnpams = self.solver_model.qnparams.params + sparse_input = True if isinstance(X, list) else False + if self.dtype == np.float32: - qnFit( - handle_[0], - deref(X), - deref(input_desc), - deref(y), - mat_coef_ptr, - qnpams, - self.is_col_major, - self._num_classes, - &objective32, - &num_iters) + if sparse_input is False: + qnFit( + handle_[0], + deref(X), + deref(input_desc), + deref(y), + mat_coef_ptr, + qnpams, + self.is_col_major, + self._num_classes, + &objective32, + &num_iters) + + else: + assert len(X) == 4 + X_values = X[0] + X_cols = X[1] + X_row_ids = X[2] + X_nnz = X[3] + + qnFitSparse( + handle_[0], + deref(X_values), + X_cols, + X_row_ids, + X_nnz, + deref(input_desc), + deref(y), + mat_coef_ptr, + qnpams, + self._num_classes, + &objective32, + &num_iters) self.solver_model.objective = objective32 diff --git a/python/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/tests/dask/test_dask_logistic_regression.py index 4f0cd7408b..d9e27f63af 100644 --- a/python/cuml/tests/dask/test_dask_logistic_regression.py +++ b/python/cuml/tests/dask/test_dask_logistic_regression.py @@ -21,6 +21,7 @@ from sklearn.linear_model import LogisticRegression as skLR from cuml.internals.safe_imports import cpu_only_import from cuml.testing.utils import array_equal +from scipy.sparse import csr_matrix pd = cpu_only_import("pandas") np = cpu_only_import("numpy") @@ -48,6 +49,38 @@ def _prep_training_data(c, X_train, y_train, partitions_per_worker): return X_train_df, y_train_df +def _prep_training_data_sparse(c, X_train, y_train, partitions_per_worker): + "The implementation follows test_dask_tfidf.create_cp_sparse_dask_array" + import dask.array as da + + workers = c.has_what().keys() + target_n_partitions = partitions_per_worker * len(workers) + + def cal_chunks(dataset, n_partitions): + + n_samples = dataset.shape[0] + n_samples_per_part = int(n_samples / n_partitions) + chunk_sizes = [n_samples_per_part] * n_partitions + samples_last_row = n_samples - ( + (n_partitions - 1) * n_samples_per_part + ) + chunk_sizes[-1] = samples_last_row + return tuple(chunk_sizes) + + assert ( + X_train.shape[0] == y_train.shape[0] + ), "the number 
of data records is not equal to the number of labels"
+    target_chunk_sizes = cal_chunks(X_train, target_n_partitions)
+
+    X_da = da.from_array(X_train, chunks=(target_chunk_sizes, -1))
+    y_da = da.from_array(y_train, chunks=target_chunk_sizes)
+
+    X_da, y_da = dask_utils.persist_across_workers(
+        c, [X_da, y_da], workers=workers
+    )
+    return X_da, y_da
+
+
 def make_classification_dataset(datatype, nrows, ncols, n_info, n_classes=2):
     X, y = make_classification(
         n_samples=nrows,
@@ -285,6 +318,7 @@ def test_lbfgs(
     l1_ratio=None,
     C=1.0,
     n_classes=2,
+    convert_to_sparse=False,
 ):
     tolerance = 0.005
@@ -305,7 +339,12 @@ def imp():
         datatype, nrows, ncols, n_info, n_classes=n_classes
     )

-    X_df, y_df = _prep_training_data(client, X, y, n_parts)
+    if convert_to_sparse is False:
+        # X_dask and y_dask are dask cudf
+        X_dask, y_dask = _prep_training_data(client, X, y, n_parts)
+    else:
+        # X_dask and y_dask are dask array
+        X_dask, y_dask = _prep_training_data_sparse(client, X, y, n_parts)

     lr = cumlLBFGS_dask(
         solver="qn",
@@ -315,9 +354,19 @@ def imp():
         C=C,
         verbose=True,
     )
-    lr.fit(X_df, y_df)
-    lr_coef = lr.coef_.to_numpy()
-    lr_intercept = lr.intercept_.to_numpy()
+    lr.fit(X_dask, y_dask)
+
+    def array_to_numpy(ary):
+        if isinstance(ary, cp.ndarray):
+            return cp.asnumpy(ary)
+        elif isinstance(ary, cudf.DataFrame) or isinstance(ary, cudf.Series):
+            return ary.to_numpy()
+        else:
+            assert isinstance(ary, np.ndarray)
+            return ary
+
+    lr_coef = array_to_numpy(lr.coef_)
+    lr_intercept = array_to_numpy(lr.intercept_)

     if penalty == "l2" or penalty == "none":
         sk_solver = "lbfgs"
@@ -345,7 +394,11 @@ def imp():
     )

     # test predict
-    cu_preds = lr.predict(X_df, delayed=delayed).compute().to_numpy()
+    cu_preds = lr.predict(X_dask, delayed=delayed).compute()
+    if isinstance(cu_preds, cp.ndarray):
+        cu_preds = cp.asnumpy(cu_preds)
+    if not isinstance(cu_preds, np.ndarray):
+        cu_preds = cu_preds.to_numpy()

     accuracy_cuml = accuracy_score(y, cu_preds)

     sk_preds = sk_model.predict(X)
@@ -491,3 +544,80 @@ def test_elasticnet(
         strength = 1.0 / lr.C
         assert l1_strength == lr.l1_ratio * strength
         assert l2_strength == (1.0 - lr.l1_ratio) * strength
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize(
+    "regularization",
+    [
+        ("none", 1.0, None),
+        ("l2", 2.0, None),
+        ("l1", 2.0, None),
+        ("elasticnet", 2.0, 0.2),
+    ],
+)
+@pytest.mark.parametrize("datatype", [np.float32])
+@pytest.mark.parametrize("delayed", [True])
+@pytest.mark.parametrize("n_classes", [2, 8])
+def test_sparse_from_dense(
+    fit_intercept, regularization, datatype, delayed, n_classes, client
+):
+    penalty = regularization[0]
+    C = regularization[1]
+    l1_ratio = regularization[2]
+
+    test_lbfgs(
+        nrows=1e5,
+        ncols=20,
+        n_parts=2,
+        fit_intercept=fit_intercept,
+        datatype=datatype,
+        delayed=delayed,
+        client=client,
+        penalty=penalty,
+        n_classes=n_classes,
+        C=C,
+        l1_ratio=l1_ratio,
+        convert_to_sparse=True,
+    )
+
+
+@pytest.mark.parametrize("dtype", [np.float32])
+def test_sparse_nlp20news(dtype, nlp_20news, client):
+
+    X, y = nlp_20news
+    n_parts = 2  # partitions_per_worker
+
+    from scipy.sparse import csr_matrix
+    from sklearn.model_selection import train_test_split
+
+    X = X.astype(dtype)
+
+    X = csr_matrix(X)
+    y = y.get().astype(dtype)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
+    from cuml.dask.linear_model import LogisticRegression as MG
+
+    X_train_da, y_train_da = _prep_training_data_sparse(
+        client, X_train, y_train, partitions_per_worker=n_parts
+    )
+    
X_test_da, _ = _prep_training_data_sparse( + client, X_test, y_test, partitions_per_worker=n_parts + ) + + cumg = MG(verbose=6, C=20.0) + cumg.fit(X_train_da, y_train_da) + + preds = cumg.predict(X_test_da).compute() + cuml_score = accuracy_score(y_test, preds.tolist()) + + from sklearn.linear_model import LogisticRegression as CPULR + + cpu = CPULR(C=20.0) + cpu.fit(X_train, y_train) + cpu_preds = cpu.predict(X_test) + cpu_score = accuracy_score(y_test, cpu_preds.tolist()) + assert cuml_score >= cpu_score or np.abs(cuml_score - cpu_score) < 1e-3 From 97b6fa3db628cfaa1b05446b5a202f91e602789b Mon Sep 17 00:00:00 2001 From: Jinfeng Li Date: Tue, 28 Nov 2023 18:47:40 -0800 Subject: [PATCH 14/18] [LogisticRegressionMG][FEA] Support training when dataset contains only one class (#5655) This pull request introduces functionality for C++ training on datasets with a single label. It helps Spark Rapids ML match Spark's behavior. Additionally, it updates the Dask class to generate an error message, consistent with Scikit-learn's behavior. This PR depends on https://github.com/rapidsai/cuml/pull/5632 Authors: - Jinfeng Li (https://github.com/lijinf2) Approvers: - Simon Adorf (https://github.com/csadorf) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuml/pull/5655 --- cpp/src/glm/qn/mg/qn_mg.cuh | 2 +- .../dask/linear_model/logistic_regression.py | 9 ++- python/cuml/linear_model/base_mg.pyx | 1 - .../linear_model/logistic_regression_mg.pyx | 6 +- .../dask/test_dask_logistic_regression.py | 56 +++++++++++++++---- 5 files changed, 56 insertions(+), 18 deletions(-) diff --git a/cpp/src/glm/qn/mg/qn_mg.cuh b/cpp/src/glm/qn/mg/qn_mg.cuh index ef9c1db6c2..177eb17b1b 100644 --- a/cpp/src/glm/qn/mg/qn_mg.cuh +++ b/cpp/src/glm/qn/mg/qn_mg.cuh @@ -101,7 +101,7 @@ inline void qn_fit_x_mg(const raft::handle_t& handle, switch (pams.loss) { case QN_LOSS_LOGISTIC: { - ASSERT(C == 2, "qn_mg.cuh: logistic loss invalid C"); + ASSERT(C > 0, "qn_mg.cuh: logistic loss invalid C"); ML::GLM::detail::LogisticLoss loss(handle, D, pams.fit_intercept); ML::GLM::opg::qn_fit_mg( handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks); diff --git a/python/cuml/dask/linear_model/logistic_regression.py b/python/cuml/dask/linear_model/logistic_regression.py index af53d509b1..faf194962f 100644 --- a/python/cuml/dask/linear_model/logistic_regression.py +++ b/python/cuml/dask/linear_model/logistic_regression.py @@ -195,6 +195,13 @@ def _func_fit(f, data, n_rows, n_cols, partsToSizes, rank): for p in partsToSizes: aggregated_partsToSizes[p[0]][1] += p[1] - return f.fit( + ret_status = f.fit( [(inp_X, inp_y)], n_rows, n_cols, aggregated_partsToSizes, rank ) + + if len(f.classes_) == 1: + raise ValueError( + f"This solver needs samples of at least 2 classes in the data, but the data contains only one class: {f.classes_[0]}" + ) + + return ret_status diff --git a/python/cuml/linear_model/base_mg.pyx b/python/cuml/linear_model/base_mg.pyx index 3dddb74f6c..eb92218513 100644 --- a/python/cuml/linear_model/base_mg.pyx +++ b/python/cuml/linear_model/base_mg.pyx @@ -63,7 +63,6 @@ class MGFitMixin(object): check_dtype = self.dtype if sparse_input: - X_m = SparseCumlArray(input_data[i][0], convert_index=np.int32) _, self.n_cols = X_m.shape else: diff --git a/python/cuml/linear_model/logistic_regression_mg.pyx b/python/cuml/linear_model/logistic_regression_mg.pyx index 2e96851dfa..ae9a2db58b 100644 --- a/python/cuml/linear_model/logistic_regression_mg.pyx +++ 
b/python/cuml/linear_model/logistic_regression_mg.pyx @@ -170,7 +170,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): "with softmax (multinomial).") if solves_classification and not solves_multiclass: - self._num_classes_dim = self._num_classes - 1 + self._num_classes_dim = 1 else: self._num_classes_dim = self._num_classes @@ -185,7 +185,6 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): def fit(self, input_data, n_rows, n_cols, parts_rank_size, rank, convert_dtype=False): - self.rank = rank assert len(input_data) == 1, f"Currently support only one (X, y) pair in the list. Received {len(input_data)} pairs." self.is_col_major = False order = 'F' if self.is_col_major else 'C' @@ -207,11 +206,12 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression): self._num_classes = len(self.classes_) self.loss = "sigmoid" if self._num_classes <= 2 else "softmax" self.prepare_for_fit(self._num_classes) + cdef uintptr_t mat_coef_ptr = self.coef_.ptr cdef qn_params qnpams = self.solver_model.qnparams.params - sparse_input = True if isinstance(X, list) else False + sparse_input = isinstance(X, list) if self.dtype == np.float32: if sparse_input is False: diff --git a/python/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/tests/dask/test_dask_logistic_regression.py index d9e27f63af..c50a9f3978 100644 --- a/python/cuml/tests/dask/test_dask_logistic_regression.py +++ b/python/cuml/tests/dask/test_dask_logistic_regression.py @@ -16,6 +16,7 @@ from cuml.internals.safe_imports import gpu_only_import import pytest from cuml.dask.common import utils as dask_utils +from functools import partial from sklearn.metrics import accuracy_score, mean_squared_error from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression as skLR @@ -339,12 +340,12 @@ def imp(): datatype, nrows, ncols, n_info, n_classes=n_classes ) - if convert_to_sparse is False: - # X_dask and y_dask are dask cudf - X_dask, y_dask = _prep_training_data(client, X, y, n_parts) - else: + if convert_to_sparse: # X_dask and y_dask are dask array X_dask, y_dask = _prep_training_data_sparse(client, X, y, n_parts) + else: + # X_dask and y_dask are dask cudf + X_dask, y_dask = _prep_training_data(client, X, y, n_parts) lr = cumlLBFGS_dask( solver="qn", @@ -557,23 +558,21 @@ def test_elasticnet( ("elasticnet", 2.0, 0.2), ], ) -@pytest.mark.parametrize("datatype", [np.float32]) -@pytest.mark.parametrize("delayed", [True]) +@pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("n_classes", [2, 8]) def test_sparse_from_dense( - fit_intercept, regularization, datatype, delayed, n_classes, client + fit_intercept, regularization, datatype, n_classes, client ): - penalty = regularization[0] - C = regularization[1] - l1_ratio = regularization[2] + penalty, C, l1_ratio = regularization - test_lbfgs( + run_test = partial( + test_lbfgs, nrows=1e5, ncols=20, n_parts=2, fit_intercept=fit_intercept, datatype=datatype, - delayed=delayed, + delayed=True, client=client, penalty=penalty, n_classes=n_classes, @@ -582,6 +581,15 @@ def test_sparse_from_dense( convert_to_sparse=True, ) + if datatype == np.float32: + run_test() + else: + with pytest.raises( + RuntimeError, + match="dtypes other than float32 are currently not supported", + ): + run_test() + @pytest.mark.parametrize("dtype", [np.float32]) def test_sparse_nlp20news(dtype, nlp_20news, client): @@ -621,3 +629,27 @@ def test_sparse_nlp20news(dtype, nlp_20news, client): cpu_preds = cpu.predict(X_test) 
cpu_score = accuracy_score(y_test, cpu_preds.tolist()) assert cuml_score >= cpu_score or np.abs(cuml_score - cpu_score) < 1e-3 + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_exception_one_label(fit_intercept, client): + n_parts = 2 + datatype = "float32" + + X = np.array([(1, 2), (1, 3), (2, 1), (3, 1)], datatype) + y = np.array([1.0, 1.0, 1.0, 1.0], datatype) + X_df, y_df = _prep_training_data(client, X, y, n_parts) + + err_msg = "This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0" + + from cuml.dask.linear_model import LogisticRegression as cumlLBFGS_dask + + mg = cumlLBFGS_dask(fit_intercept=fit_intercept, verbose=6) + with pytest.raises(RuntimeError, match=err_msg): + mg.fit(X_df, y_df) + + from sklearn.linear_model import LogisticRegression + + lr = LogisticRegression(fit_intercept=fit_intercept) + with pytest.raises(ValueError, match=err_msg): + lr.fit(X, y) From a1d1fb6d9eb7354882bd1f33c78dc3c77ca0f963 Mon Sep 17 00:00:00 2001 From: Taurean Dyer <46935140+taureandyernv@users.noreply.github.com> Date: Wed, 29 Nov 2023 12:02:15 -0800 Subject: [PATCH 15/18] updated docs around `make_column_transformer` change from `.preprocessing` to `.compose` (#5680) closes #5675 Authors: - Taurean Dyer (https://github.com/taureandyernv) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuml/pull/5680 --- .../_thirdparty/sklearn/preprocessing/_column_transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py b/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py index ded00ed619..d928f9b218 100644 --- a/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py +++ b/python/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py @@ -1147,8 +1147,8 @@ class make_column_selector: Examples -------- >>> from cuml.preprocessing import StandardScaler, OneHotEncoder - >>> from cuml.preprocessing import make_column_transformer - >>> from cuml.preprocessing import make_column_selector + >>> from cuml.compose import make_column_transformer + >>> from cuml.compose import make_column_selector >>> import cupy as cp >>> import cudf # doctest: +SKIP >>> X = cudf.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'], From 41f0d40adace3f8fcf793699ae70395a9303e219 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 30 Nov 2023 11:53:58 -0600 Subject: [PATCH 16/18] Skip dask pytest NN hang in CUDA 11.4 CI (#5665) Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Simon Adorf (https://github.com/csadorf) URL: https://github.com/rapidsai/cuml/pull/5665 --- .../cuml/tests/dask/test_dask_nearest_neighbors.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cuml/tests/dask/test_dask_nearest_neighbors.py b/python/cuml/tests/dask/test_dask_nearest_neighbors.py index 16931e0ab9..9dbd4dc010 100644 --- a/python/cuml/tests/dask/test_dask_nearest_neighbors.py +++ b/python/cuml/tests/dask/test_dask_nearest_neighbors.py @@ -13,6 +13,7 @@ # limitations under the License. 
# +import platform from cuml.testing.utils import array_equal from sklearn.neighbors import KNeighborsClassifier from cuml.testing.utils import unit_param, quality_param, stress_param @@ -29,6 +30,17 @@ pd = cpu_only_import("pandas") np = cpu_only_import("numpy") +cp = gpu_only_import("cupy") + + +IS_ARM = platform.processor() == "aarch64" + +if IS_ARM and cp.cuda.runtime.runtimeGetVersion() < 11080: + pytest.skip( + "Test hang in AARCH64 with CUDA < 11.8: " + "https://github.com/rapidsai/cuml/issues/5673", + allow_module_level=True, + ) def predict(neigh_ind, _y, n_neighbors): From 4ac95c39d45293b5d63cdc0fcbdffe85d6931c1a Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:10:53 -0500 Subject: [PATCH 17/18] Pin actions/labeler to v4 [skip ci] (#5686) RAPIDS repos are using the `main` branch of https://github.com/actions/labeler which recently introduced [breaking changes](https://github.com/actions/labeler/releases/tag/v5.0.0). This PR pins to the latest v4 release of the labeler action until we can evaluate the changes required for v5. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 23956a02fb..31e78f82a6 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 8f29a1657a0dac7ec6e029111d504608b2abc474 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 6 Dec 2023 09:58:05 -0500 Subject: [PATCH 18/18] Update Changelog [skip ci] --- CHANGELOG.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dada3b9440..037b314f90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,52 @@ +# cuML 23.12.00 (6 Dec 2023) + +## ๐Ÿšจ Breaking Changes + +- [LogisticRegressionMG] Support sparse vectors ([#5632](https://github.com/rapidsai/cuml/pull/5632)) [@lijinf2](https://github.com/lijinf2) + +## ๐Ÿ› Bug Fixes + +- Update actions/labeler to v4 ([#5686](https://github.com/rapidsai/cuml/pull/5686)) [@raydouglass](https://github.com/raydouglass) +- updated docs around `make_column_transformer` change from `.preprocessing` to `.compose` ([#5680](https://github.com/rapidsai/cuml/pull/5680)) [@taureandyernv](https://github.com/taureandyernv) +- Skip dask pytest NN hang in CUDA 11.4 CI ([#5665](https://github.com/rapidsai/cuml/pull/5665)) [@dantegd](https://github.com/dantegd) +- Avoid hard import of sklearn in base module. ([#5663](https://github.com/rapidsai/cuml/pull/5663)) [@csadorf](https://github.com/csadorf) +- CI: Pin clang-tidy to 15.0.7. ([#5661](https://github.com/rapidsai/cuml/pull/5661)) [@csadorf](https://github.com/csadorf) +- Adjust assumption regarding valid cudf.Series dimensional input. 
([#5654](https://github.com/rapidsai/cuml/pull/5654)) [@csadorf](https://github.com/csadorf) +- Flatten cupy array before feeding to cudf.Series ([#5651](https://github.com/rapidsai/cuml/pull/5651)) [@vyasr](https://github.com/vyasr) +- CI: Fix expected ValueError and dask-glm incompatibility ([#5644](https://github.com/rapidsai/cuml/pull/5644)) [@csadorf](https://github.com/csadorf) +- Use drop_duplicates instead of unique for cudf's pandas compatibility mode ([#5639](https://github.com/rapidsai/cuml/pull/5639)) [@vyasr](https://github.com/vyasr) +- Temporarily avoid pydata-sphinx-theme version 0.14.2. ([#5629](https://github.com/rapidsai/cuml/pull/5629)) [@csadorf](https://github.com/csadorf) +- Fix type hint in split function. ([#5625](https://github.com/rapidsai/cuml/pull/5625)) [@trivialfis](https://github.com/trivialfis) +- Fix trying to get pointer to None in svm/linear.pyx ([#5615](https://github.com/rapidsai/cuml/pull/5615)) [@yosider](https://github.com/yosider) +- Reduce parallelism to avoid OOMs in wheel tests ([#5611](https://github.com/rapidsai/cuml/pull/5611)) [@vyasr](https://github.com/vyasr) + +## ๐Ÿ“– Documentation + +- Update interoperability docs ([#5633](https://github.com/rapidsai/cuml/pull/5633)) [@beckernick](https://github.com/beckernick) +- Update instructions for creating a conda build environment ([#5628](https://github.com/rapidsai/cuml/pull/5628)) [@csadorf](https://github.com/csadorf) + +## ๐Ÿš€ New Features + +- Basic implementation of `OrdinalEncoder`. ([#5646](https://github.com/rapidsai/cuml/pull/5646)) [@trivialfis](https://github.com/trivialfis) + +## ๐Ÿ› ๏ธ Improvements + +- Build concurrency for nightly and merge triggers ([#5658](https://github.com/rapidsai/cuml/pull/5658)) [@bdice](https://github.com/bdice) +- [LogisticRegressionMG][FEA] Support training when dataset contains only one class ([#5655](https://github.com/rapidsai/cuml/pull/5655)) [@lijinf2](https://github.com/lijinf2) +- Use new `rapids-dask-dependency` metapackage for managing `dask` versions ([#5649](https://github.com/rapidsai/cuml/pull/5649)) [@galipremsagar](https://github.com/galipremsagar) +- Simplify some logic in LabelEncoder ([#5648](https://github.com/rapidsai/cuml/pull/5648)) [@vyasr](https://github.com/vyasr) +- Increase `Nanny` close timeout in `LocalCUDACluster` tests ([#5636](https://github.com/rapidsai/cuml/pull/5636)) [@pentschev](https://github.com/pentschev) +- [LogisticRegressionMG] Support sparse vectors ([#5632](https://github.com/rapidsai/cuml/pull/5632)) [@lijinf2](https://github.com/lijinf2) +- Add rich HTML representation to estimators ([#5630](https://github.com/rapidsai/cuml/pull/5630)) [@betatim](https://github.com/betatim) +- Unpin `dask` and `distributed` for `23.12` development ([#5627](https://github.com/rapidsai/cuml/pull/5627)) [@galipremsagar](https://github.com/galipremsagar) +- Update `shared-action-workflows` references ([#5621](https://github.com/rapidsai/cuml/pull/5621)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use branch-23.12 workflows. 
([#5618](https://github.com/rapidsai/cuml/pull/5618)) [@bdice](https://github.com/bdice) +- Update rapids-cmake functions to non-deprecated signatures ([#5616](https://github.com/rapidsai/cuml/pull/5616)) [@robertmaynard](https://github.com/robertmaynard) +- Allow nightly dependencies and set up consistent nightly versions for conda and pip packages ([#5607](https://github.com/rapidsai/cuml/pull/5607)) [@vyasr](https://github.com/vyasr) +- Forward-merge branch-23.10 to branch-23.12 ([#5596](https://github.com/rapidsai/cuml/pull/5596)) [@bdice](https://github.com/bdice) +- Build CUDA 12.0 ARM conda packages. ([#5595](https://github.com/rapidsai/cuml/pull/5595)) [@bdice](https://github.com/bdice) +- Enable multiclass svm for sparse input ([#5588](https://github.com/rapidsai/cuml/pull/5588)) [@mfoerste4](https://github.com/mfoerste4) + # cuML 23.10.00 (11 Oct 2023) ## ๐Ÿšจ Breaking Changes