diff --git a/python/cuml/feature_extraction/_vectorizers.py b/python/cuml/feature_extraction/_vectorizers.py index 78172ec690..0133195b20 100644 --- a/python/cuml/feature_extraction/_vectorizers.py +++ b/python/cuml/feature_extraction/_vectorizers.py @@ -598,7 +598,9 @@ def fit_transform(self, raw_documents, y=None): if self._fixed_vocabulary: self.vocabulary_ = self.vocabulary else: - self.vocabulary_ = tokenized_df["token"].unique().sort_values() + self.vocabulary_ = ( + tokenized_df["token"].drop_duplicates().sort_values() + ) count_df = self._count_vocab(tokenized_df) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index 882e552511..c8221ff951 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -180,7 +180,7 @@ def fit(self, y, _classes=None): if _classes is not None: self.classes_ = _classes else: - self.classes_ = y.unique().sort_values( + self.classes_ = y.drop_duplicates().sort_values( ignore_index=True ) # dedupe and sort