diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3623db5a283..69f6634b5c2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,15 @@ repos: language: system files: \.(cu|cuh|h|hpp|cpp|inl)$ args: ['-fallback-style=none'] + - repo: local + hooks: + - id: mypy + name: mypy + description: mypy + pass_filenames: false + entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf + language: system + types: [python] default_language_version: python: python3 diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 2534f857ee4..17599c6d74d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -29,6 +29,10 @@ FLAKE_RETVAL=$? FLAKE_CYTHON=`flake8 --config=python/.flake8.cython` FLAKE_CYTHON_RETVAL=$? +# Run mypy and get results/return code +MYPY_CUDF=`mypy --config-file=python/cudf/setup.cfg python/cudf/cudf` +MYPY_CUDF_RETVAL=$? + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -66,6 +70,14 @@ else echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n" fi +if [ "$MYPY_CUDF_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n" + echo -e "$MYPY_CUDF" + echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: mypy style check\n\n" +fi + if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" echo -e "$CLANG_FORMAT" @@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh` HEADER_META_RETVAL=$? echo -e "$HEADER_META" -RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL) +RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 24882d9b3e2..b810b87111a 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 49675fe2154..b4e95bc6730 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 2917c2c3ce0..3b21f00ab16 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ea93c5eb279..c5f7bd34c25 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -34,6 +34,7 @@ requirements: run: - protobuf - python + - typing_extensions - pandas >=1.0,<1.2.0dev0 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index be2d4ef5f51..0293518a5d9 100644 --- 
a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,13 +10,16 @@ datetime, filling, gpuarrow, + groupby, hash, interop, join, + json, merge, null_mask, nvtext, orc, + parquet, partitioning, quantiles, reduce, @@ -27,6 +30,7 @@ search, sort, stream_compaction, + string_casting, strings, table, transpose, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi new file mode 100644 index 00000000000..0f8c044410d --- /dev/null +++ b/python/cudf/cudf/_lib/column.pyi @@ -0,0 +1,124 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations +from typing import Tuple, Union, TypeVar, Optional + +from cudf._typing import DtypeObj, Dtype, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase + + +T = TypeVar("T") + +class Column: + _data: Optional[Buffer] + _mask: Optional[Buffer] + _base_data: Optional[Buffer] + _base_mask: Optional[Buffer] + _dtype: DtypeObj + _offset: int + _null_count: int + _children: Tuple[ColumnBase, ...] + _base_children: Tuple[ColumnBase, ...] + + def __init__( + self, + data: Optional[Buffer], + size: int, + dtype: Dtype, + mask: Optional[Buffer] = None, + offset: int = None, + null_count: int = None, + children: Tuple[ColumnBase, ...] = (), + ) -> None: + ... + + @property + def base_size(self) -> int: + ... + + @property + def dtype(self) -> DtypeObj: + ... + + @property + def size(self) -> int: + ... + + @property + def base_data(self) -> Optional[Buffer]: + ... + + @property + def base_data_ptr(self) -> int: + ... + + @property + def data(self) -> Optional[Buffer]: + ... + + @property + def data_ptr(self) -> int: + ... + + def set_base_data(self, value: Buffer) -> None: + ... + + @property + def nullable(self) -> bool: + ... + + @property + def has_nulls(self) -> bool: + ... + + @property + def base_mask(self) -> Optional[Buffer]: + ... + + @property + def base_mask_ptr(self) -> int: + ... + + @property + def mask(self) -> Optional[Buffer]: + ... + + @property + def mask_ptr(self) -> int: + ... + + def set_base_mask(self, value: Optional[Buffer]) -> None: + ... + + def set_mask(self: T, value: Optional[Buffer]) -> T: + ... + + @property + def null_count(self) -> int: + ... + + @property + def offset(self) -> int: + ... + + @property + def base_children(self) -> Tuple[ColumnBase, ...]: + ... + + @property + def children(self) -> Tuple[ColumnBase, ...]: + ... + + def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: + ... + + def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]: + ... + + @staticmethod + def from_scalar( + val: ScalarLike, + size: int + ) -> ColumnBase: # TODO: This should be Scalar, not ScalarLike + ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7989b62d8c7..28dacb5e944 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -60,14 +60,14 @@ cdef class Column: The *dtype* indicates the Column's element type. 
""" def __init__( - self, - object data, - int size, - object dtype, - object mask=None, - int offset=0, - object null_count=None, - object children=() + self, + object data, + int size, + object dtype, + object mask=None, + int offset=0, + object null_count=None, + object children=() ): self._size = size @@ -247,10 +247,10 @@ cdef class Column: ) return cudf.core.column.build_column( - self.data, - self.dtype, - mask, - self.size, + data=self.data, + dtype=self.dtype, + mask=mask, + size=self.size, offset=0, children=self.children ) @@ -561,25 +561,22 @@ cdef class Column: children = tuple(children) result = cudf.core.column.build_column( - data, - dtype, - mask, - size, - offset, - null_count, - tuple(children) + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=tuple(children) ) return result - -def make_column_from_scalar(object py_val, size_type size): - - cdef DeviceScalar val = py_val.device_value - - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - - return Column.from_unique_ptr(move(c_result)) + @staticmethod + def from_scalar(py_val, size_type size): + cdef DeviceScalar val = py_val.device_value + cdef const scalar* c_val = val.get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_make_column_from_scalar(c_val[0], size)) + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi new file mode 100644 index 00000000000..772e940f812 --- /dev/null +++ b/python/cudf/cudf/_lib/table.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import List, Any, Optional, TYPE_CHECKING + +import cudf + +class Table(object): + _data: cudf.core.column_accessor.ColumnAccessor + _index: Optional[cudf.core.index.Index] + + def __init__(self, data: object = None, index: object = None) -> None: ... + + @property + def _num_columns(self) -> int: ... + + @property + def _num_indices(self) -> int: ... + + @property + def _num_rows(self) -> int: ... + + @property + def _column_names(self) -> List[Any]: ... + + @property + def _index_names(self) -> List[Any]: ... + + @property + def _columns(self) -> List[Any]: ... # TODO: actually, a list of columns diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py new file mode 100644 index 00000000000..0087daa1676 --- /dev/null +++ b/python/cudf/cudf/_typing.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import TYPE_CHECKING, Any, TypeVar, Union + +import numpy as np +from pandas import Period, Timedelta, Timestamp +from pandas.api.extensions import ExtensionDtype + +if TYPE_CHECKING: + import cudf + +# Many of these are from +# https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py + +Dtype = Union["ExtensionDtype", str, np.dtype] +DtypeObj = Union["ExtensionDtype", np.dtype] + +# scalars +DatetimeLikeScalar = TypeVar( + "DatetimeLikeScalar", Period, Timestamp, Timedelta +) +ScalarLike = Any + +# columns +ColumnLike = Any + +# binary operation +BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index d6c232373c7..91a369c31f8 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
-from cudf.core import buffer, column, common +from cudf.core import buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 02150a79d57..0550b1d4de0 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -12,9 +12,9 @@ try: import pickle5 as pickle except ImportError: - import pickle + import pickle # type: ignore else: - import pickle + import pickle # type: ignore class Serializable(abc.ABC): diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 08bc068c28c..350346a87f9 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -1,7 +1,10 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + import functools import operator import pickle +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -12,7 +15,13 @@ class Buffer(Serializable): - def __init__(self, data=None, size=None, owner=None): + ptr: int + size: int + _owner: Any + + def __init__( + self, data: Any = None, size: Optional[int] = None, owner: Any = None + ): """ A Buffer represents a device memory allocation. @@ -36,7 +45,6 @@ def __init__(self, data=None, size=None, owner=None): elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): - self._init_from_array_like(data, owner) elif isinstance(data, memoryview): self._init_from_array_like(np.asarray(data), owner) @@ -57,15 +65,15 @@ def __init__(self, data=None, size=None, owner=None): raise TypeError("data must be Buffer, array-like or integer") self._init_from_array_like(np.asarray(data), owner) - def __len__(self): + def __len__(self) -> int: return self.size @property - def nbytes(self): + def nbytes(self) -> int: return self.size @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> dict: intf = { "data": (self.ptr, False), "shape": (self.size,), @@ -102,8 +110,8 @@ def _init_from_array_like(self, data, owner): f"Cannot construct Buffer from {data.__class__.__name__}" ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} header["desc"] = self.__cuda_array_interface__.copy() @@ -112,7 +120,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> Buffer: buf = cls(frames[0], **header["constructor-kwargs"]) if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: @@ -125,7 +133,7 @@ def deserialize(cls, header, frames): return buf @classmethod - def empty(cls, size): + def empty(cls, size: int) -> Buffer: dbuf = DeviceBuffer(size=size) return Buffer(dbuf) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index ff514e6c6f0..498851c47ee 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,12 +1,27 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+from __future__ import annotations + import pickle +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Mapping, + Optional, + Tuple, + Union, + cast, +) import numpy as np import pandas as pd +from numba import cuda import cudf from cudf import _lib as libcudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethodsMixin @@ -18,9 +33,23 @@ min_unsigned_type, ) +if TYPE_CHECKING: + from cudf.core.column import ( + ColumnBase, + DatetimeColumn, + NumericalColumn, + StringColumn, + TimeDeltaColumn, + ) + + +ParentType = Union["cudf.Series", "cudf.Index"] + class CategoricalAccessor(ColumnMethodsMixin): - def __init__(self, column, parent=None): + _column: CategoricalColumn + + def __init__(self, column: Any, parent: ParentType = None): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is an inplace operation, @@ -28,7 +57,8 @@ def __init__(self, column, parent=None): Parameters ---------- - data : Series or CategoricalIndex + column : Column + parent : Series or CategoricalIndex Examples -------- @@ -77,34 +107,35 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property - def categories(self): + def categories(self) -> "cudf.Index": """ The categories of this categorical. """ return cudf.core.index.as_index(self._column.categories) @property - def codes(self): + def codes(self) -> "cudf.Series": """ Return Series of codes as well as the index. """ - return cudf.Series( - self._column.codes, - index=self._parent.index if self._parent is not None else None, + index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else None ) + return cudf.Series(self._column.codes, index=index) @property - def ordered(self): + def ordered(self) -> bool: """ Whether the categories have an ordered relationship. """ return self._column.ordered - def as_ordered(self, inplace=False): + def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be ordered. @@ -165,7 +196,7 @@ def as_ordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def as_unordered(self, inplace=False): + def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be unordered. @@ -237,7 +268,9 @@ def as_unordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def add_categories(self, new_categories, inplace=False): + def add_categories( + self, new_categories: Any, inplace: bool = False + ) -> Optional[ParentType]: """ Add new categories. @@ -320,7 +353,9 @@ def add_categories(self, new_categories, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def remove_categories(self, removals, inplace=False): + def remove_categories( + self, removals: Any, inplace: bool = False, + ) -> Optional[ParentType]: """ Remove the specified categories. 
@@ -411,8 +446,12 @@ def remove_categories(self, removals, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) def set_categories( - self, new_categories, ordered=None, rename=False, inplace=False, - ): + self, + new_categories: Any, + ordered: bool = False, + rename: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Set the categories to the specified new_categories. @@ -539,7 +578,12 @@ def set_categories( ) return self._return_or_inplace(out_col, inplace=inplace) - def reorder_categories(self, new_categories, ordered=False, inplace=False): + def reorder_categories( + self, + new_categories: Any, + ordered: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Reorder categories as specified in new_categories. @@ -621,9 +665,9 @@ def reorder_categories(self, new_categories, ordered=False, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def _categories_equal(self, new_categories, ordered=None): - ordered = ordered if ordered is not None else self.ordered - + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: cur_categories = self._column.categories if len(new_categories) != len(cur_categories): return False @@ -640,8 +684,12 @@ def _categories_equal(self, new_categories, ordered=None): return cur_categories.equals(new_categories) def _set_categories( - self, current_categories, new_categories, is_unique=False, ordered=None - ): + self, + current_categories: Any, + new_categories: Any, + is_unique: bool = False, + ordered: bool = False, + ) -> CategoricalColumn: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. @@ -705,14 +753,17 @@ class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type """ + _codes: Optional[NumericalColumn] + _children: Tuple[NumericalColumn] + def __init__( self, - dtype, - mask=None, - size=None, - offset=0, - null_count=None, - children=(), + dtype: CategoricalDtype, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] 
= (), ): """ Parameters @@ -722,7 +773,7 @@ def __init__( The validity mask offset : int Data offset - children : Tuple[Column] + children : Tuple[ColumnBase] Two non-null columns containing the categories and codes respectively """ @@ -745,24 +796,23 @@ def __init__( null_count=null_count, children=children, ) - self._codes = None @property - def base_size(self): + def base_size(self) -> int: return int( (self.base_children[0].size) / self.base_children[0].dtype.itemsize ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: self._encode(item) except ValueError: return False return self._encode(item) in self.as_numerical - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"], dtype_frames = self.dtype.serialize() @@ -771,7 +821,7 @@ def serialize(self): header["data"], data_frames = self.codes.serialize() header["data_frames_count"] = len(data_frames) frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -779,7 +829,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> CategoricalColumn: n_dtype_frames = header["dtype_frames_count"] dtype = CategoricalDtype.deserialize( header["dtype"], frames[:n_dtype_frames] @@ -796,11 +846,14 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize( header["mask"], [frames[n_dtype_frames + n_data_frames]] ) - return column.build_column( - data=None, - dtype=dtype, - mask=mask, - children=(column.as_column(data.base_data, dtype=data.dtype),), + return cast( + CategoricalColumn, + column.build_column( + data=None, + dtype=dtype, + mask=mask, + children=(column.as_column(data.base_data, dtype=data.dtype),), + ), ) def set_base_data(self, value): @@ -812,16 +865,16 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None - def set_base_children(self, value): + def set_base_children(self, value: Tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self): + def children(self) -> Tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] @@ -829,20 +882,26 @@ def children(self): buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize) buf.size = self.size * codes_column.dtype.itemsize - codes_column = column.build_column( - data=buf, dtype=codes_column.dtype, size=self.size, + codes_column = cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=buf, dtype=codes_column.dtype, size=self.size, + ), ) self._children = (codes_column,) return self._children @property - def as_numerical(self): - return column.build_column( - data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + def as_numerical(self) -> NumericalColumn: + return cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + ), ) @property - def categories(self): + def categories(self) -> ColumnBase: return self.dtype.categories._values @categories.setter @@ -852,30 +911,82 @@ def categories(self, value): ) @property - def codes(self): + def codes(self) -> 
NumericalColumn: if self._codes is None: self._codes = self.children[0].set_mask(self.mask) - return self._codes + return cast(cudf.core.column.NumericalColumn, self._codes) @property - def ordered(self): + def ordered(self) -> bool: return self.dtype.ordered @ordered.setter - def ordered(self, value): + def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent=None): + def cat(self, parent: ParentType = None): return CategoricalAccessor(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " f"{unaryop}" ) - def binary_operator(self, op, rhs, reflect=False): + def __setitem__(self, key, value): + if cudf.utils.dtypes.is_scalar(value): + value = self._encode(value) if value is not None else value + else: + value = cudf.core.column.as_column(value).astype(self.dtype) + value = value.codes + codes = self.codes + codes[key] = value + out = cudf.core.column.build_categorical_column( + categories=self.categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + offset=self.offset, + ordered=self.ordered, + ) + self._mimic_inplace(out, inplace=True) + + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> "column.ColumnBase": + if end <= begin or begin >= self.size: + return self if inplace else self.copy() + + fill_code = self._encode(fill_value) + fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + + result = self if inplace else self.copy() + + libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) + return result + + def slice( + self, start: int, stop: int, stride: int = None + ) -> "column.ColumnBase": + codes = self.codes.slice(start, stop, stride) + return cudf.core.column.build_categorical_column( + categories=self.categories, + codes=cudf.core.column.as_column( + codes.base_data, dtype=codes.dtype + ), + mask=codes.base_mask, + ordered=self.ordered, + size=codes.size, + offset=codes.offset, + ) + def binary_operator( + self, op: str, rhs, reflect: bool = False + ) -> ColumnBase: if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"): if op in ("lt", "gt", "le", "ge"): raise TypeError( @@ -889,7 +1000,7 @@ def binary_operator(self, op, rhs, reflect=False): raise TypeError("Categoricals can only compare with the same type") return self.as_numerical.binary_operator(op, rhs.as_numerical) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if isinstance(other, np.ndarray) and other.ndim == 0: other = other.item() @@ -905,7 +1016,9 @@ def normalize_binop_value(self, other): ) return col - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self, ascending: bool = True, na_position="last" + ) -> Tuple[CategoricalColumn, NumericalColumn]: codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories, @@ -916,19 +1029,21 @@ def sort_by_values(self, ascending=True, na_position="last"): ) return col, inds - def element_indexing(self, index): + def element_indexing(self, index: int) -> ScalarLike: val = self.as_numerical.element_indexing(index) - return self._decode(val) if val is not None else val + return self._decode(int(val)) if val is not None else val @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[str, Any]: raise TypeError( 
"Categorical does not support `__cuda_array_interface__`." " Please consider using `.codes` or `.categories`" " if you need this functionality." ) - def to_pandas(self, index=None, nullable=False): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> pd.Series: signed_dtype = min_signed_type(len(self.categories)) codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array() categories = self.categories.to_pandas() @@ -938,7 +1053,7 @@ def to_pandas(self, index=None, nullable=False): return pd.Series(data, index=index) @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the CategoricalColumn. """ @@ -951,7 +1066,16 @@ def values(self): """ raise NotImplementedError("cudf.Categorical is not yet implemented") - def unique(self): + def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": + return ( + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + ) + + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + return self.codes.data_array_view + + def unique(self) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, @@ -962,18 +1086,23 @@ def unique(self): ordered=self.ordered, ) - def _encode(self, value): + def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) - def _decode(self, value): + def _decode(self, value: int) -> ScalarLike: if value == self.default_na_value(): return None return self.categories.element_indexing(value) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return -1 - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> CategoricalColumn: """ Return col with *to_replace* replaced with *replacement*. 
""" @@ -1038,7 +1167,9 @@ def find_and_replace(self, to_replace, replacement, all_nan): ordered=self.dtype.ordered, ) - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: Any = None, dtype: Dtype = None + ) -> CategoricalColumn: """ Fill null values with *fill_value* """ @@ -1084,20 +1215,22 @@ def fillna(self, fill_value=None, method=None): return result - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ return self.as_numerical.find_first_value(self._encode(value)) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ return self.as_numerical.find_last_value(self._encode(value)) @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): self._is_monotonic_increasing = ( self.ordered and self.as_numerical.is_monotonic_increasing @@ -1105,14 +1238,16 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): self._is_monotonic_decreasing = ( self.ordered and self.as_numerical.is_monotonic_decreasing ) return self._is_monotonic_decreasing - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column( + self, dtype: Dtype, **kwargs + ) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self if ( @@ -1129,6 +1264,9 @@ def as_categorical_column(self, dtype, **kwargs): categories=dtype.categories, ordered=dtype.ordered ) + if not isinstance(dtype, CategoricalDtype): + raise ValueError("dtype must be CategoricalDtype") + if not isinstance(self.categories, type(dtype.categories._values)): # If both categories are of different Column types, # return a column full of Nulls. 
@@ -1138,25 +1276,25 @@ def as_categorical_column(self, dtype, **kwargs): new_categories=dtype.categories, ordered=dtype.ordered ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, **kwargs): + def as_string_column(self, dtype, format=None) -> StringColumn: return self._get_decategorized_column().as_string_column( - dtype, **kwargs + dtype, format=format ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype, **kwargs) -> DatetimeColumn: return self._get_decategorized_column().as_datetime_column( dtype, **kwargs ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: return self._get_decategorized_column().as_timedelta_column( dtype, **kwargs ) - def _get_decategorized_column(self): + def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.cat().codes._column @@ -1165,7 +1303,7 @@ def _get_decategorized_column(self): out = out.set_mask(self.mask) return out - def copy(self, deep=True): + def copy(self, deep: bool = True) -> CategoricalColumn: if deep: copied_col = libcudf.copying.copy_column(self) copied_cat = libcudf.copying.copy_column(self.dtype._categories) @@ -1192,12 +1330,13 @@ def copy(self, deep=True): size=self.size, ) - def __sizeof__(self): + def __sizeof__(self) -> int: return ( self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() ) - def _memory_usage(self, deep=False): + def _memory_usage(self, **kwargs) -> int: + deep = kwargs.get("deep", False) if deep: return self.__sizeof__() else: @@ -1206,22 +1345,25 @@ def _memory_usage(self, deep=False): + self.cat().codes.memory_usage() ) - def _mimic_inplace(self, other_col, inplace=False): + def _mimic_inplace( + self, other_col: ColumnBase, inplace: bool = False + ) -> Optional[ColumnBase]: out = super()._mimic_inplace(other_col, inplace=inplace) - if inplace: + if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes return out - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) -def _create_empty_categorical_column(categorical_column, dtype): - +def _create_empty_categorical_column( + categorical_column: CategoricalColumn, dtype: "CategoricalDtype" +) -> CategoricalColumn: return column.build_categorical_column( - categories=dtype.categories, + categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), @@ -1236,7 +1378,9 @@ def _create_empty_categorical_column(categorical_column, dtype): ) -def pandas_categorical_as_column(categorical, codes=None): +def pandas_categorical_as_column( + categorical: ColumnLike, codes: ColumnLike = None +) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7008604a1c3..670dd456de9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,9 +1,24 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+from __future__ import annotations +import builtins import pickle import warnings -from numbers import Number +from collections.abc import MutableSequence from types import SimpleNamespace +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) import cupy import numpy as np @@ -22,6 +37,7 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count from cudf._lib.transform import bools_to_mask +from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.core.dtypes import CategoricalDtype @@ -44,68 +60,34 @@ ) from cudf.utils.utils import mask_dtype +T = TypeVar("T", bound="ColumnBase") -class ColumnBase(Column, Serializable): - def __init__( - self, - data, - size, - dtype, - mask=None, - offset=0, - null_count=None, - children=(), - ): - """ - Parameters - ---------- - data : Buffer - dtype - The type associated with the data Buffer - mask : Buffer, optional - children : tuple, optional - """ - super().__init__( - data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - children=children, - ) - def as_frame(self): +class ColumnBase(Column, Serializable): + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ return cudf.core.frame.Frame({None: self.copy(deep=False)}) @property - def data_array_view(self): + def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - if self.dtype == "object": - raise ValueError("Cannot get an array view of a StringColumn") - - if is_categorical_dtype(self.dtype): - return self.codes.data_array_view - else: - dtype = self.dtype - result = cuda.as_cuda_array(self.data) # Workaround until `.view(...)` can change itemsize # xref: https://github.com/numba/numba/issues/4829 result = cuda.devicearray.DeviceNDArray( - shape=(result.nbytes // dtype.itemsize,), - strides=(dtype.itemsize,), - dtype=dtype, + shape=(result.nbytes // self.dtype.itemsize,), + strides=(self.dtype.itemsize,), + dtype=self.dtype, gpu_data=result.gpu_data, ) return result @property - def mask_array_view(self): + def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array """ @@ -122,10 +104,12 @@ def mask_array_view(self): ) return result - def __len__(self): + def __len__(self) -> int: return self.size - def to_pandas(self, index=None, nullable=False, **kwargs): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() @@ -144,14 +128,14 @@ def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) @property - def values_host(self): + def values_host(self) -> "np.ndarray": """ Return a numpy representation of the Column. """ return self.data_array_view.copy_to_host() @property - def values(self): + def values(self) -> "cupy.ndarray": """ Return a CuPy representation of the Column. 
""" @@ -163,14 +147,18 @@ def values(self): return cupy.asarray(self.data_array_view) - def clip(self, lo, hi): - if is_categorical_dtype(self): - input_col = self.astype(self.categories.dtype) - return libcudf.replace.clip(input_col, lo, hi).astype(self.dtype) - else: - return libcudf.replace.clip(self, lo, hi) + def find_and_replace( + self: T, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> T: + raise NotImplementedError - def equals(self, other, check_dtypes=False): + def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: + return libcudf.replace.clip(self, lo, hi) + + def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if self is other: return True if other is None or len(self) != len(other): @@ -180,21 +168,32 @@ def equals(self, other, check_dtypes=False): return False return (self == other).min() - def all(self): + def all(self) -> bool: return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) - def any(self): + def any(self) -> bool: return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) - def __sizeof__(self): - n = self.data.size + def __sizeof__(self) -> int: + n = 0 + if self.data is not None: + n += self.data.size if self.nullable: n += bitmask_allocation_size_bytes(self.size) return n - @classmethod - def _concat(cls, objs, dtype=None): + def cat( + self, parent=None + ) -> "cudf.core.column.categorical.CategoricalAccessor": + raise NotImplementedError() + def str(self, parent=None) -> "cudf.core.column.string.StringMethods": + raise NotImplementedError() + + @classmethod + def _concat( + cls, objs: "MutableSequence[ColumnBase]", dtype: Dtype = None + ) -> ColumnBase: if len(objs) == 0: dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): @@ -282,7 +281,7 @@ def _concat(cls, objs, dtype=None): if is_categorical: col = build_categorical_column( - categories=cats, + categories=as_column(cats), codes=as_column(col.base_data, dtype=col.dtype), mask=col.base_mask, size=col.size, @@ -291,7 +290,7 @@ def _concat(cls, objs, dtype=None): return col - def dropna(self, drop_nan=False): + def dropna(self, drop_nan: bool = False) -> ColumnBase: if drop_nan: col = self.nans_to_nulls() else: @@ -301,7 +300,7 @@ def dropna(self, drop_nan=False): ) return dropped_col - def to_arrow(self): + def to_arrow(self) -> pa.Array: """Convert to PyArrow Array Examples @@ -350,7 +349,7 @@ def to_arrow(self): )["None"].chunk(0) @classmethod - def from_arrow(cls, array): + def from_arrow(cls, array: pa.Array) -> ColumnBase: """ Convert PyArrow Array/ChunkedArray to column @@ -412,15 +411,18 @@ def from_arrow(cls, array): "None" ] - def _get_mask_as_column(self): + def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( self.base_mask, self.offset, self.offset + len(self) ) - def _memory_usage(self, **kwargs): + def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() - def to_gpu_array(self, fillna=None): + def default_na_value(self) -> Any: + raise NotImplementedError() + + def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. Parameters @@ -439,7 +441,7 @@ def to_gpu_array(self, fillna=None): else: return self.dropna(drop_nan=False).data_array_view - def to_array(self, fillna=None): + def to_array(self, fillna=None) -> "np.array": """Get a dense numpy array for the data. 
Parameters @@ -458,13 +460,16 @@ def to_array(self, fillna=None): return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill(self, fill_value, begin=0, end=-1, inplace=False): + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> Optional[ColumnBase]: if end <= begin or begin >= self.size: return self if inplace else self.copy() - if is_categorical_dtype(self.dtype): - return self._fill_categorical(fill_value, begin, end, inplace) - fill_scalar = as_device_scalar(fill_value, self.dtype) if not inplace: @@ -484,7 +489,6 @@ def _fill(self, fill_value, begin=0, end=-1, inplace=False): return self - def _fill_categorical(self, fill_value, begin, end, inplace): fill_code = self._encode(fill_value) fill_scalar = as_device_scalar(fill_code, self.codes.dtype) @@ -493,16 +497,16 @@ def _fill_categorical(self, fill_value, begin, end, inplace): libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def shift(self, offset, fill_value): + def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) @property - def valid_count(self): + def valid_count(self) -> int: """Number of non-null values""" return len(self) - self.null_count @property - def nullmask(self): + def nullmask(self) -> Buffer: """The gpu buffer for the null-mask """ if self.nullable: @@ -510,7 +514,7 @@ def nullmask(self): else: raise ValueError("Column has no null mask") - def copy(self, deep=True): + def copy(self, deep: bool = True) -> ColumnBase: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. @@ -527,7 +531,7 @@ def copy(self, deep=True): children=self.base_children, ) - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: """ View the data underlying a column as different dtype. 
The source column must divide evenly into the size of @@ -569,6 +573,7 @@ def view(self, dtype): + f" total bytes into {dtype} with size {dtype.itemsize}" ) + assert self.base_data is not None new_buf_ptr = ( self.base_data.ptr + self.offset * self.dtype.itemsize ) @@ -580,7 +585,7 @@ def view(self, dtype): ) return build_column(view_buf, dtype=dtype) - def element_indexing(self, index): + def element_indexing(self, index: int): """Default implementation for indexing to an element Raises @@ -595,46 +600,29 @@ def element_indexing(self, index): return libcudf.copying.get_element(self, index).value - def __getitem__(self, arg): + def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: + if start < 0: + start = start + len(self) + if stop < 0: + stop = stop + len(self) + if start >= stop: + return column_empty(0, self.dtype, masked=True) + # compute mask slice + if stride == 1 or stride is None: + return libcudf.copying.column_slice(self, [start, stop])[0] + else: + # Need to create a gather map for given slice with stride + gather_map = arange( + start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + ) + return self.take(gather_map) - if isinstance(arg, Number): - arg = int(arg) - return self.element_indexing(arg) + def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]: + if is_scalar(arg): + return self.element_indexing(int(arg)) elif isinstance(arg, slice): - - if is_categorical_dtype(self): - codes = self.codes[arg] - return build_categorical_column( - categories=self.categories, - codes=as_column(codes.base_data, dtype=codes.dtype), - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, - ) - start, stop, stride = arg.indices(len(self)) - - if start < 0: - start = start + len(self) - if stop < 0: - stop = stop + len(self) - - if start >= stop: - return column_empty(0, self.dtype, masked=True) - # compute mask slice - if stride == 1 or stride is None: - - return libcudf.copying.column_slice(self, [start, stop])[0] - else: - # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, - dtype=np.dtype(np.int32), - ) - return self.take(gather_map) + return self.slice(start, stop, stride) else: arg = as_column(arg) if len(arg) == 0: @@ -645,7 +633,7 @@ def __getitem__(self, arg): return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): """ Set the value of self[key] to value. 
@@ -686,10 +674,7 @@ def __setitem__(self, key, value): nelem = len(key) if is_scalar(value): - if is_categorical_dtype(self.dtype): - value = self._encode(value) - else: - value = self.dtype.type(value) if value is not None else value + value = self.dtype.type(value) if value is not None else value else: if len(value) != nelem: msg = ( @@ -699,9 +684,6 @@ def __setitem__(self, key, value): ) raise ValueError(msg) value = as_column(value).astype(self.dtype) - if is_categorical_dtype(value.dtype): - value = value.cat().set_categories(self.categories) - assert self.dtype == value.dtype if ( isinstance(key, slice) @@ -712,34 +694,11 @@ def __setitem__(self, key, value): out = libcudf.copying.copy_range( value, self, 0, nelem, key_start, key_stop, False ) - if is_categorical_dtype(value.dtype): - out = build_categorical_column( - categories=value.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=value.ordered, - ) else: try: if is_scalar(value): input = self - if is_categorical_dtype(self.dtype): - input = self.codes - out = input.as_frame()._scatter(key, [value])._as_column() - - if is_categorical_dtype(self.dtype): - out = build_categorical_column( - categories=self.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=self.ordered, - ) - else: if not isinstance(value, Column): value = as_column(value) @@ -757,7 +716,12 @@ def __setitem__(self, key, value): self._mimic_inplace(out, inplace=True) - def fillna(self, value=None, method=None, dtype=None): + def fillna( + self: T, + value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> T: """Fill null values with ``value``. Returns a copy with null filled. @@ -766,7 +730,7 @@ def fillna(self, value=None, method=None, dtype=None): input_col=self, replacement=value, method=method, dtype=dtype ) - def isnull(self): + def isnull(self) -> ColumnBase: """Identify missing values in a Column. """ result = libcudf.unary.is_null(self) @@ -778,12 +742,12 @@ def isnull(self): return result - def isna(self): + def isna(self) -> ColumnBase: """Identify missing values in a Column. Alias for isnull. """ return self.isnull() - def notnull(self): + def notnull(self) -> ColumnBase: """Identify non-missing values in a Column. """ result = libcudf.unary.is_valid(self) @@ -795,12 +759,14 @@ def notnull(self): return result - def notna(self): + def notna(self) -> ColumnBase: """Identify non-missing values in a Column. Alias for notnull. 
""" return self.notnull() - def find_first_value(self, value): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -811,7 +777,7 @@ def find_first_value(self, value): raise ValueError("value not found") return indices[0] - def find_last_value(self, value): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -822,21 +788,26 @@ def find_last_value(self, value): raise ValueError("value not found") return indices[-1] - def append(self, other): + def append(self, other: ColumnBase) -> ColumnBase: return ColumnBase._concat([self, as_column(other)]) - def quantile(self, q, interpolation, exact): + def quantile( + self, + q: Union[float, Sequence[float]], + interpolation: builtins.str, + exact: bool, + ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna=None): + def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") - def take(self, indices, keep_index=True): + def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: """Return Column by taking values from the corresponding *indices*. """ # Handle zero size if indices.size == 0: - return column_empty_like(self, newsize=0) + return cast(T, column_empty_like(self, newsize=0)) try: return ( self.as_frame() @@ -850,7 +821,7 @@ def take(self, indices, keep_index=True): ) from e raise - def isin(self, values): + def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. Parameters @@ -905,17 +876,17 @@ def isin(self, values): rhs = as_column(pd.Categorical.from_codes([-1], categories=[])) rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype) - lhs = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) - rhs = cudf.DataFrame( + ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) + rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} ) - res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") + res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) res = res._data["bool"].fillna(False) return res - def as_mask(self): + def as_mask(self) -> Buffer: """Convert booleans to bitmask Returns @@ -935,15 +906,15 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) @property - def is_unique(self): + def is_unique(self) -> bool: return self.distinct_count() == len(self) @property - def is_monotonic(self): + def is_monotonic(self) -> bool: return self.is_monotonic_increasing @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): if self.has_nulls: self._is_monotonic_increasing = False @@ -954,7 +925,7 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): if self.has_nulls: self._is_monotonic_decreasing = False @@ -964,14 +935,16 @@ def is_monotonic_decreasing(self): ) return self._is_monotonic_decreasing - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: ScalarLike, side: builtins.str, kind: builtins.str + ) -> int: """ Calculate slice bound that corresponds to given label. 
Returns leftmost (one-past-the-rightmost if ``side=='right'``) position of given label. Parameters ---------- - label : object + label : Scalar side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} """ @@ -986,21 +959,29 @@ def get_slice_bound(self, label, side, kind): # Not currently using `kind` argument. if side == "left": return self.find_first_value(label, closest=True) - if side == "right": + elif side == "right": return self.find_last_value(label, closest=True) + 1 + else: + raise ValueError(f"Invalid value for side: {side}") - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self: ColumnBase, + ascending: bool = True, + na_position: builtins.str = "last", + ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: col_inds = self.as_frame()._get_sorted_inds(ascending, na_position) - col_keys = self[col_inds] + col_keys = self.take(col_inds) return col_keys, col_inds - def distinct_count(self, method="sort", dropna=True): + def distinct_count( + self, method: builtins.str = "sort", dropna: bool = True + ) -> int: if method != "sort": msg = "non sort based distinct_count() not implemented yet" raise NotImplementedError(msg) return cpp_distinct_count(self, ignore_nulls=dropna) - def astype(self, dtype, **kwargs): + def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { @@ -1022,7 +1003,7 @@ def astype(self, dtype, **kwargs): else: return self.as_numerical_column(dtype) - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: if "ordered" in kwargs: ordered = kwargs["ordered"] else: @@ -1065,26 +1046,36 @@ def as_categorical_column(self, dtype, **kwargs): ordered=ordered, ) - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise NotImplementedError - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": raise NotImplementedError - def apply_boolean_mask(self, mask): + def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") result = ( self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() ) return result - def argsort(self, ascending=True, na_position="last"): + def argsort( + self, ascending: bool = True, na_position: builtins.str = "last" + ) -> ColumnBase: sorted_indices = self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position @@ -1092,7 +1083,7 @@ def argsort(self, ascending=True, na_position="last"): return sorted_indices @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), @@ -1164,14 +1155,18 @@ def __ge__(self, other): return self.binary_operator("ge", other) def searchsorted( - self, value, side="left", ascending=True, na_position="last" + self, + value, + side: builtins.str = "left", + ascending: bool = True, + na_position: builtins.str = 
"last", ): values = as_column(value).as_frame() return self.as_frame().searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self): + def unique(self) -> ColumnBase: """ Get unique values in the data """ @@ -1181,17 +1176,18 @@ def unique(self): ._as_column() ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"] = self.dtype.str - data_header, data_frames = self.data.serialize() - header["data"] = data_header - frames.extend(data_frames) + if self.data is not None: + data_header, data_frames = self.data.serialize() + header["data"] = data_header + frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -1200,7 +1196,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> ColumnBase: dtype = header["dtype"] data = Buffer.deserialize(header["data"], [frames[0]]) mask = None @@ -1208,63 +1204,71 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize(header["mask"], [frames[1]]) return build_column(data=data, dtype=dtype, mask=mask) - def binary_operator(self, op, other, reflect=False): + def binary_operator( + self, op: builtins.str, other: BinaryOperand, reflect: bool = False + ) -> ColumnBase: raise NotImplementedError - def min(self, skipna=None, dtype=None): + def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("min", result_col, dtype=dtype) else: return result_col - def max(self, skipna=None, dtype=None): + def max(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("max", result_col, dtype=dtype) else: return result_col - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform sum with type {self.dtype}") - def product(self, skipna=None, dtype=None, min_count=0): + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform prod with type {self.dtype}") - def mean(self, skipna=None, dtype=None): + def mean(self, skipna: bool = None, dtype: Dtype = None): raise TypeError(f"cannot perform mean with type {self.dtype}") - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform std with type {self.dtype}") - def var(self, skipna=None, ddof=1, dtype=np.float64): + def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform var with type {self.dtype}") - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None): raise TypeError(f"cannot perform kurt with type {self.dtype}") - def skew(self, skipna=None): + def skew(self, skipna: bool = None): raise TypeError(f"cannot perform skew with type {self.dtype}") - def cov(self, other): + def cov(self, other: ColumnBase): raise TypeError( f"cannot perform covarience with types {self.dtype}, " f"{other.dtype}" ) - def corr(self, other): + def corr(self, other: 
ColumnBase): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self): + def nans_to_nulls(self: T) -> T: if self.dtype.kind == "f": newmask = libcudf.transform.nans_to_nulls(self) return self.set_mask(newmask) else: return self - def _process_for_reduction(self, skipna=None, min_count=0): + def _process_for_reduction( + self, skipna: bool = None, min_count: int = 0 + ) -> Union[ColumnBase, ScalarLike]: skipna = True if skipna is None else skipna if skipna: @@ -1289,8 +1293,13 @@ def _process_for_reduction(self, skipna=None, min_count=0): return result_col def scatter_to_table( - self, row_indices, column_indices, names, nrows=None, ncols=None - ): + self, + row_indices: ColumnBase, + column_indices: ColumnBase, + names: List[Any], + nrows: int = None, + ncols: int = None, + ) -> "cudf.core.frame.Frame": """ Scatters values from the column into a table. @@ -1335,7 +1344,12 @@ def scatter_to_table( ) -def column_empty_like(column, dtype=None, masked=False, newsize=None): +def column_empty_like( + column: ColumnBase, + dtype: Dtype = None, + masked: bool = False, + newsize: int = None, +) -> ColumnBase: """Allocate a new column like the given *column* """ if dtype is None: @@ -1347,6 +1361,7 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): and is_categorical_dtype(column.dtype) and dtype == column.dtype ): + column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) return build_column( data=None, @@ -1359,7 +1374,9 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): return column_empty(row_count, dtype, masked) -def column_empty_like_same_mask(column, dtype): +def column_empty_like_same_mask( + column: ColumnBase, dtype: Dtype +) -> ColumnBase: """Create a new empty Column with the same length and the same mask. Parameters @@ -1373,11 +1390,13 @@ def column_empty_like_same_mask(column, dtype): return result -def column_empty(row_count, dtype="object", masked=False): +def column_empty( + row_count: int, dtype: Dtype = "object", masked: bool = False +) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ dtype = pd.api.types.pandas_dtype(dtype) - children = () + children = () # type: Tuple[ColumnBase, ...] if is_categorical_dtype(dtype): data = None @@ -1410,8 +1429,15 @@ def column_empty(row_count, dtype="object", masked=False): def build_column( - data, dtype, mask=None, size=None, offset=0, null_count=None, children=() -): + data: Union[Buffer, None], + dtype: Dtype, + *, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, + children: Tuple[ColumnBase, ...] 
= (), +) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1446,6 +1472,7 @@ def build_column( children=children, ) elif dtype.type is np.datetime64: + assert data is not None return cudf.core.column.DatetimeColumn( data=data, dtype=dtype, @@ -1455,6 +1482,7 @@ def build_column( null_count=null_count, ) elif dtype.type is np.timedelta64: + assert data is not None return cudf.core.column.TimeDeltaColumn( data=data, dtype=dtype, @@ -1483,8 +1511,8 @@ def build_column( elif is_struct_dtype(dtype): return cudf.core.column.StructColumn( data=data, - size=size, dtype=dtype, + size=size, mask=mask, null_count=null_count, children=children, @@ -1499,6 +1527,7 @@ def build_column( children=children, ) else: + assert data is not None return cudf.core.column.NumericalColumn( data=data, dtype=dtype, @@ -1510,14 +1539,14 @@ def build_column( def build_categorical_column( - categories, - codes, - mask=None, - size=None, - offset=0, - null_count=None, - ordered=None, -): + categories: ColumnBase, + codes: ColumnBase, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + ordered: bool = None, +) -> "cudf.core.column.CategoricalColumn": """ Build a CategoricalColumn @@ -1541,9 +1570,9 @@ def build_categorical_column( if codes.dtype != codes_dtype: codes = codes.astype(codes_dtype) - dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered) + dtype = CategoricalDtype(categories=categories, ordered=ordered) - return build_column( + result = build_column( data=None, dtype=dtype, mask=mask, @@ -1552,9 +1581,15 @@ def build_categorical_column( null_count=null_count, children=(codes,), ) + return cast("cudf.core.column.CategoricalColumn", result) -def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): +def as_column( + arbitrary: Any, + nan_as_null: bool = None, + dtype: Dtype = None, + length: int = None, +): """Create a Column from an arbitrary object Parameters @@ -1791,7 +1826,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = data.mask data = cudf.core.column.timedelta.TimeDeltaColumn( - data=buffer, mask=mask, dtype=arbitrary.dtype + data=buffer, + size=len(arbitrary), + mask=mask, + dtype=arbitrary.dtype, ) elif arb_dtype.kind in ("O", "U"): data = as_column( @@ -1840,9 +1878,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) elif isinstance(arbitrary, cudf.Scalar): - data = libcudf.column.make_column_from_scalar( - arbitrary, length if length else 1 - ) + data = ColumnBase.from_scalar(arbitrary, length if length else 1) elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): cudf_dtype = arbitrary._data.dtype @@ -1924,7 +1960,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): return data -def column_applymap(udf, column, out_dtype): +def column_applymap( + udf: Callable[[ScalarLike], ScalarLike], + column: ColumnBase, + out_dtype: Dtype, +) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. 
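The annotated `Callable[[ScalarLike], ScalarLike]` makes the UDF contract explicit. For context, this helper backs the public `Series.applymap`; a minimal usage sketch with illustrative values (not from the diff), assuming the usual compiled-kernel path:

import cudf

s = cudf.Series([1, 2, 3])
# The UDF is applied element-wise on the GPU; when out_dtype is not
# given, the result keeps the input column's dtype.
doubled = s.applymap(lambda x: x * 2)
# expected: [2, 4, 6]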
Parameters @@ -1972,7 +2012,7 @@ def kernel_non_masked(values, results): return as_column(results) -def _data_from_cuda_array_interface_desc(obj): +def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 @@ -1982,7 +2022,7 @@ def _data_from_cuda_array_interface_desc(obj): return data -def _mask_from_cuda_array_interface_desc(obj): +def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: desc = obj.__cuda_array_interface__ mask = desc.get("mask", None) @@ -2005,7 +2045,7 @@ def _mask_from_cuda_array_interface_desc(obj): return mask -def serialize_columns(columns): +def serialize_columns(columns) -> Tuple[List[dict], List]: """ Return the headers and frames resulting from serializing a list of Column @@ -2020,7 +2060,7 @@ def serialize_columns(columns): frames : list list of frames """ - headers = [] + headers = [] # type: List[Dict[Any, Any]] frames = [] if len(columns) > 0: @@ -2032,7 +2072,7 @@ def serialize_columns(columns): return headers, frames -def deserialize_columns(headers, frames): +def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. @@ -2050,7 +2090,12 @@ def deserialize_columns(headers, frames): return columns -def arange(start, stop=None, step=1, dtype=None): +def arange( + start: Union[int, float], + stop: Union[int, float] = None, + step: Union[int, float] = 1, + dtype=None, +) -> ColumnBase: """ Returns a column with evenly spaced values within a given interval. @@ -2103,7 +2148,7 @@ def arange(start, stop=None, step=1, dtype=None): ) -def full(size, fill_value, dtype=None): +def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: """ Returns a column of given size and dtype, filled with a given value. @@ -2134,7 +2179,4 @@ def full(size, fill_value, dtype=None): 4 7 dtype: int8 """ - - return libcudf.column.make_column_from_scalar( - cudf.Scalar(fill_value, dtype), size - ) + return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 4561b1f68f2..8ae16288050 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,7 +1,10 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION.
+from __future__ import annotations + import datetime as dt import re from numbers import Number +from typing import Any, Sequence, Union, cast import numpy as np import pandas as pd @@ -9,7 +12,9 @@ import cudf from cudf import _lib as libcudf -from cudf.core.column import column, string +from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise @@ -34,7 +39,13 @@ class DatetimeColumn(column.ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -66,49 +77,51 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: - item = np.datetime64(item, self._time_unit) + item_as_dt64 = np.datetime64(item, self._time_unit) except ValueError: # If item cannot be converted to datetime type # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.astype("int64") in self.as_numerical + return item_as_dt64.astype("int64") in self.as_numerical @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit @property - def year(self): + def year(self) -> ColumnBase: return self.get_dt_field("year") @property - def month(self): + def month(self) -> ColumnBase: return self.get_dt_field("month") @property - def day(self): + def day(self) -> ColumnBase: return self.get_dt_field("day") @property - def hour(self): + def hour(self) -> ColumnBase: return self.get_dt_field("hour") @property - def minute(self): + def minute(self) -> ColumnBase: return self.get_dt_field("minute") @property - def second(self): + def second(self) -> ColumnBase: return self.get_dt_field("second") @property - def weekday(self): + def weekday(self) -> ColumnBase: return self.get_dt_field("weekday") - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index: "cudf.Index" = None, nullable: bool = False, **kwargs + ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -122,10 +135,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def get_dt_field(self, field): + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): return other @@ -162,30 +175,41 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def 
as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise TypeError( f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]" ) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_string_column(self, dtype, format=None): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( self.dtype.name, "%Y-%m-%d %H:%M:%S" @@ -195,20 +219,25 @@ def as_string_column(self, dtype, format=None): np.dtype(self.dtype) ](self, format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def default_na_value(self): + def default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column """ return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -216,18 +245,23 @@ def quantile(self, q, interpolation, exact): return pd.Timestamp(result, unit=self.time_unit) return result.astype(self.dtype) - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, + op: str, + rhs: Union[ColumnBase, "cudf.Scalar"], + reflect: bool = False, + ) -> ColumnBase: if isinstance(rhs, cudf.DateOffset): return binop_offset(self, rhs, op) lhs, rhs = self, rhs if op in ("eq", "ne", "lt", "gt", "le", "ge"): out_dtype = np.bool elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add( + out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs ) elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub( + out_dtype = cudf.core.column.timedelta._timedelta_sub_result_dtype( rhs if reflect else lhs, lhs if reflect else rhs ) elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -244,13 +278,11 @@ def binary_operator(self, op, rhs, reflect=False): f"Series of dtype {self.dtype} cannot perform " f" the operation {op}" ) + return binop(lhs, rhs, op=op, out_dtype=out_dtype, reflect=reflect) - if reflect: - lhs, rhs = rhs, lhs - - return binop(lhs, rhs, op=op, out_dtype=out_dtype) - - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> DatetimeColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) @@ -262,7 +294,9 @@ def fillna(self, fill_value=None, method=None): return super().fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -270,7 +304,7 @@ 
def find_first_value(self, value, closest=False): value = column.as_column(value, dtype=self.dtype).as_numerical[0] return self.as_numerical.find_first_value(value, closest=closest) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -279,10 +313,10 @@ def find_last_value(self, value, closest=False): return self.as_numerical.find_last_value(value, closest=closest) @property - def is_unique(self): + def is_unique(self) -> bool: return self.as_numerical.is_unique - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): to_res, _ = np.datetime_data(to_dtype) @@ -315,7 +349,15 @@ def can_cast_safely(self, to_dtype): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool, +) -> ColumnBase: + if reflect: + lhs, rhs = rhs, lhs out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out @@ -329,11 +371,10 @@ def binop_offset(lhs, rhs, op): return out -def infer_format(element, **kwargs): +def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes care of `ms` and `ns` """ - fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) if fmt is not None: @@ -345,8 +386,8 @@ def infer_format(element, **kwargs): # There is a possibility that the element is of the following format # '00:00:03.333333 2016-01-01' - second_part = re.split(r"(\D+)", element_parts[1], maxsplit=1) - subsecond_fmt = ".%" + str(len(second_part[0])) + "f" + second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) + subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" first_part = pd.core.tools.datetimes._guess_datetime_format( element_parts[0], **kwargs @@ -360,16 +401,16 @@ def infer_format(element, **kwargs): if first_part is None: raise ValueError("Unable to infer the timestamp format from the data") - if len(second_part) > 1: + if len(second_parts) > 1: # "Z" indicates Zulu time (widely used in aviation), which is the # UTC timezone; it is the only timezone cudf currently supports. Any # other unsupported timezone will cause the code to fail below # with a ValueError. - second_part.remove("Z") - second_part = "".join(second_part[1:]) + second_parts.remove("Z") + second_part = "".join(second_parts[1:]) if len(second_part) > 1: - # Only infer if second_part is not an empty string. + # Only infer if second_parts is not an empty string. second_part = pd.core.tools.datetimes._guess_datetime_format( second_part, **kwargs ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c2aa41a5de1..8641bc88806 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -173,8 +173,7 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property def leaves(self): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 8395c9c3da6..eec9c2a7860 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,9 +1,57 @@ # Copyright (c) 2020, NVIDIA CORPORATION.
+from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, overload + +from typing_extensions import Literal + import cudf +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnMethodsMixin: + _column: ColumnBase + _parent: Optional[Union["cudf.Series", "cudf.Index"]] + + def __init__( + self, + column: ColumnBase, + parent: Union["cudf.Series", "cudf.Index"] = None, + ): + self._column = column + self._parent = parent + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[False], expand=False, retain_index=True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, expand: bool = False, retain_index: bool = True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[True], expand=False, retain_index=True + ) -> None: + ... + + @overload + def _return_or_inplace( + self, + new_col, + inplace: bool = False, + expand: bool = False, + retain_index: bool = True, + ) -> Optional[Union["cudf.Series", "cudf.Index"]]: + ... + def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True ): @@ -19,31 +67,29 @@ def _return_or_inplace( ), inplace=True, ) + return None else: self._column._mimic_inplace(new_col, inplace=True) + return None else: + if self._parent is None: + return new_col if expand or isinstance( self._parent, (cudf.DataFrame, cudf.MultiIndex) ): # This branch indicates that the object passed as new_col - # is actually a table-like data + # is a Table table = new_col - if isinstance(table, cudf._lib.table.Table): - if isinstance(self._parent, cudf.Index): - idx = self._parent._constructor_expanddim._from_table( - table=table - ) - idx.names = None - return idx - else: - return self._parent._constructor_expanddim( - data=table._data, index=self._parent.index - ) + if isinstance(self._parent, cudf.Index): + idx = self._parent._constructor_expanddim._from_table( + table=table + ) + idx.names = None + return idx else: return self._parent._constructor_expanddim( - {index: value for index, value in enumerate(table)}, - index=self._parent.index, + data=table._data, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): if retain_index: @@ -59,7 +105,4 @@ def _return_or_inplace( new_col, name=self._parent.name ) else: - if self._parent is None: - return new_col - else: - return self._parent._mimic_inplace(new_col, inplace=False) + return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f302a4519ed..f77c408f205 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+from __future__ import annotations from numbers import Number +from typing import Any, Callable, Sequence, Union, cast import numpy as np import pandas as pd @@ -10,8 +12,15 @@ import cudf from cudf import _lib as libcudf from cudf._lib.quantiles import quantile as cpp_quantile +from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer -from cudf.core.column import as_column, build_column, column, string +from cudf.core.column import ( + ColumnBase, + as_column, + build_column, + column, + string, +) from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, @@ -21,9 +30,15 @@ ) -class NumericalColumn(column.ColumnBase): +class NumericalColumn(ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -39,6 +54,7 @@ def __init__( if size is None: size = data.size // dtype.itemsize size = size - offset + super().__init__( data, size=size, @@ -48,7 +64,7 @@ def __init__( null_count=null_count, ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: """ Returns True if column contains item, else False. """ @@ -66,10 +82,12 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str) -> ColumnBase: return _numeric_column_unaryop(self, op=unaryop) - def binary_operator(self, binop, rhs, reflect=False): + def binary_operator( + self, binop: str, rhs: BinaryOperand, reflect: bool = False, + ) -> ColumnBase: int_dtypes = [ np.dtype("int8"), np.dtype("int16"), @@ -80,32 +98,33 @@ def binary_operator(self, binop, rhs, reflect=False): np.dtype("uint32"), np.dtype("uint64"), ] - tmp = rhs - if reflect: - tmp = self - if isinstance(rhs, (NumericalColumn, cudf.Scalar)) or np.isscalar(rhs): + if rhs is None: + out_dtype = self.dtype + else: + if not ( + isinstance(rhs, (NumericalColumn, cudf.Scalar,),) + or np.isscalar(rhs) + ): + msg = "{!r} operator not supported between {} and {}" + raise TypeError(msg.format(binop, type(self), type(rhs))) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: + tmp = self if reflect else rhs if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): out_dtype = np.dtype("float64") - elif rhs is None: - out_dtype = self.dtype - else: - raise TypeError( - f"'{binop}' operator not supported between " - f"{type(self).__name__} and {type(rhs).__name__}" - ) return _numeric_column_binop( lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect ) - def _apply_scan_op(self, op): + def _apply_scan_op(self, op: str) -> ColumnBase: return libcudf.reduce.scan(op, self, True) - def normalize_binop_value(self, other): + def normalize_binop_value( + self, other: ScalarLike + ) -> Union[ColumnBase, ScalarLike]: if other is None: return other if isinstance(other, cudf.Scalar): @@ -122,8 +141,8 @@ def normalize_binop_value(self, other): return other other_dtype = np.promote_types(self.dtype, other_dtype) if other_dtype == np.dtype("float16"): - other = np.dtype("float32").type(other) - other_dtype = other.dtype + other_dtype = np.dtype("float32") + other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): @@ 
-134,104 +153,110 @@ def normalize_binop_value(self, other): other, size=len(self), dtype=other_dtype ) return column.build_column( - data=Buffer.from_array_like(ary), - dtype=ary.dtype, - mask=self.mask, + data=Buffer(ary), dtype=ary.dtype, mask=self.mask, ) else: raise TypeError(f"cannot broadcast {type(other)}") - def int2ip(self): + def int2ip(self) -> "cudf.core.column.StringColumn": if self.dtype != np.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ np.dtype(self.dtype) ](self) else: - return as_column([], dtype="object") - - def as_datetime_column(self, dtype, **kwargs): + return cast( + "cudf.core.column.StringColumn", as_column([], dtype="object") + ) - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": + return cast( + "cudf.core.column.DatetimeColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_timedelta_column(self, dtype, **kwargs): - - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": + return cast( + "cudf.core.column.TimeDeltaColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) - def sum(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( + def reduce(self, op: str, skipna: bool = None, **kwargs) -> float: + min_count = kwargs.pop("min_count", 0) + preprocessed = self._process_for_reduction( skipna=skipna, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("sum", result_col, dtype=dtype) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) else: - return result_col + return cast(float, preprocessed) - def product(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( - skipna=skipna, min_count=min_count + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "sum", skipna=skipna, dtype=dtype, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("product", result_col, dtype=dtype) - else: - return result_col - def mean(self, skipna=None, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("mean", result_col, dtype=dtype) - else: - return result_col + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "product", skipna=skipna, dtype=dtype, min_count=min_count 
+ ) - def var(self, skipna=None, ddof=1, dtype=np.float64): - result = self._process_for_reduction(skipna=skipna) - if isinstance(result, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("var", result, dtype=dtype, ddof=ddof) - else: - return result + def mean(self, skipna: bool = None, dtype: Dtype = np.float64) -> float: + return self.reduce("mean", skipna=skipna, dtype=dtype) - def std(self, skipna=None, ddof=1, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce( - "std", result_col, dtype=dtype, ddof=ddof - ) - else: - return result_col + def var( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - def sum_of_squares(self, dtype=None): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) + + def sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -250,13 +275,13 @@ def kurtosis(self, skipna=None): kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna=None): + def skew(self, skipna: bool = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -273,7 +298,9 @@ def skew(self, skipna=None): skew = unbiased_coef * m3 / (m2 ** (3 / 2)) return skew - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: if isinstance(q, Number) or cudf.utils.dtypes.is_list_like(q): np_array_q = np.asarray(q) if np.logical_or(np_array_q < 0, np_array_q > 1).any(): @@ -284,15 +311,14 @@ def quantile(self, q, interpolation, exact): # will only have values in range [0, 1] result = self._numeric_quantile(q, interpolation, exact) if isinstance(q, Number): - result = result[0] return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if result is cudf.NA - else result + if result[0] is cudf.NA + else result[0] ) return result - def median(self, skipna=None): + def median(self, skipna: bool = None) -> NumericalColumn: skipna = True if skipna is None else skipna if not skipna and self.has_nulls: @@ -301,24 +327,17 @@ def median(self, skipna=None): # enforce linear in case the default ever changes return self.quantile(0.5, interpolation="linear", exact=True) - def _numeric_quantile(self, q, interpolation, exact): - is_number = isinstance(q, Number) - - if is_number: - quant = [float(q)] - elif isinstance(q, list) or isinstance(q, np.ndarray): - quant = q - else: - msg = "`q` must be either a single element, list or numpy array" - raise TypeError(msg) - + def _numeric_quantile( + 
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: + quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls sorted_indices = self.as_frame()._get_sorted_inds(True, "first") sorted_indices = sorted_indices[self.null_count :] return cpp_quantile(self, quant, interpolation, sorted_indices, exact) - def cov(self, other): + def cov(self, other: ColumnBase) -> float: if ( len(self) == 0 or len(other) == 0 @@ -330,7 +349,7 @@ def cov(self, other): cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other): + def corr(self, other: ColumnBase) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -341,12 +360,14 @@ def corr(self, other): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) return cov / lhs_std / rhs_std - def round(self, decimals=0): + def round(self, decimals: int = 0) -> NumericalColumn: """Round the values in the Column to the given number of decimals. """ return libcudf.round.round(self, decimal_places=decimals) - def applymap(self, udf, out_dtype=None): + def applymap( + self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None + ) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. Parameters @@ -367,7 +388,7 @@ def applymap(self, udf, out_dtype=None): out = column.column_applymap(udf=udf, column=self, out_dtype=out_dtype) return out - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ dkind = self.dtype.kind @@ -382,7 +403,12 @@ def default_na_value(self): else: raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> NumericalColumn: """ Return col with *to_replace* replaced with *value*. """ @@ -409,11 +435,16 @@ def find_and_replace(self, to_replace, replacement, all_nan): replaced, to_replace_col, replacement_col ) - def fillna(self, fill_value=None, method=None, fill_nan=True): + def fillna( + self, + fill_value: Any = None, + method: str = None, + dtype: Dtype = None, + fill_nan: bool = True, + ) -> NumericalColumn: """ Fill null values with *fill_value* """ - if fill_nan: col = self.nans_to_nulls() else: @@ -447,7 +478,9 @@ def fillna(self, fill_value=None, method=None, fill_nan=True): return super(NumericalColumn, col).fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches. For monotonic columns, returns the offset of the first larger value @@ -476,7 +509,7 @@ def find_first_value(self, value, closest=False): raise ValueError("value not found") return found - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches. 
For monotonic columns, returns the offset of the last smaller value @@ -505,7 +538,7 @@ def find_last_value(self, value, closest=False): raise ValueError("value not found") return found - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ Returns true if all the values in self can be safely cast to dtype @@ -603,9 +636,17 @@ def can_cast_safely(self, to_dtype): else: return False + return False + @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): +def _numeric_column_binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool = False, +) -> ColumnBase: if reflect: lhs, rhs = rhs, lhs @@ -622,7 +663,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): return out -def _numeric_column_unaryop(operand, op): +def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase: if callable(op): return libcudf.transform.transform(operand, op) @@ -630,7 +671,7 @@ def _numeric_column_unaryop(operand, op): return libcudf.unary.unary_operation(operand, op) -def _safe_cast_to_int(col, dtype): +def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: """ Cast given NumericalColumn to given integer dtype safely. """ @@ -649,7 +690,9 @@ def _safe_cast_to_int(col, dtype): ) -def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): +def _normalize_find_and_replace_input( + input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] +) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, dtype=input_column_dtype if len(col_to_normalize) <= 0 else None, @@ -691,7 +734,9 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): return normalized_column.astype(input_column_dtype) -def digitize(column, bins, right=False): +def digitize( + column: ColumnBase, bins: np.ndarray, right: bool = False +) -> ColumnBase: """Return the indices of the bins to which each value in column belongs. Parameters @@ -706,7 +751,7 @@ def digitize(column, bins, right=False): Returns ------- - A device array containing the indices + A column containing the indices """ if not column.dtype == bins.dtype: raise ValueError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f5df440b865..0124b421266 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,10 +1,15 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+from __future__ import annotations + +import builtins import pickle import warnings +from typing import Any, Dict, Optional, Sequence, Tuple, Union, cast, overload import cupy import numpy as np import pandas as pd +from numba import cuda from nvtx import annotate import cudf @@ -140,6 +145,7 @@ translate as cpp_translate, ) from cudf._lib.strings.wrap import wrap as cpp_wrap +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.methods import ColumnMethodsMixin @@ -197,6 +203,9 @@ } + +ParentType = Union["cudf.Series", "cudf.Index"] + + class StringMethods(ColumnMethodsMixin): def __init__(self, column, parent=None): """ @@ -214,10 +223,9 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .str accessor with string values" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) - def htoi(self): + def htoi(self) -> ParentType: """ Returns integer value represented by each hex string. String is interpreted to have hex (base-16) characters. @@ -242,7 +250,7 @@ def htoi(self): return self._return_or_inplace(out, inplace=False) - def ip2int(self): + def ip2int(self) -> ParentType: """ This converts IP strings to integers @@ -279,7 +287,7 @@ def __getitem__(self, key): else: return self.get(key) - def len(self): + def len(self) -> ParentType: """ Computes the length of each element in the Series/Index. @@ -301,7 +309,7 @@ def len(self): return self._return_or_inplace(cpp_count_characters(self._column)) - def byte_count(self): + def byte_count(self) -> ParentType: """ Computes the number of bytes of each string in the Series/Index. @@ -328,6 +336,16 @@ def byte_count(self): """ return self._return_or_inplace(cpp_count_bytes(self._column),) + @overload + def cat(self, sep: str = None, na_rep: str = None) -> str: + ... + + @overload + def cat( + self, others, sep: str = None, na_rep: str = None + ) -> Union[ParentType, "cudf.core.column.StringColumn"]: + ... + def cat(self, others=None, sep=None, na_rep=None): """ Concatenate strings in the Series/Index with given separator. @@ -339,28 +357,28 @@ def cat(self, others=None, sep=None, na_rep=None): Parameters ---------- - others : Series or List of str - Strings to be appended. - The number of strings must match ``size()`` of this instance. - This must be either a Series of string dtype or a Python - list of strings. + others : Series or List of str + Strings to be appended. + The number of strings must match ``size()`` of this instance. + This must be either a Series of string dtype or a Python + list of strings. - sep : str - If specified, this separator will be appended to each string - before appending the others. + sep : str + If specified, this separator will be appended to each string + before appending the others. - na_rep : str - This character will take the place of any null strings - (not empty strings) in either list. + na_rep : str + This character will take the place of any null strings + (not empty strings) in either list. - - If ``na_rep`` is ``None``, and ``others`` is ``None``, - missing values in the Series/Index are - omitted from the result. + - If ``na_rep`` is ``None``, and ``others`` is ``None``, + missing values in the Series/Index are + omitted from the result. - - If ``na_rep`` is ``None``, and ``others`` is - not ``None``, a row containing a missing value - in any of the columns (before concatenation) - will have a missing value in the result.
+ - If ``na_rep`` is ``None``, and ``others`` is + not ``None``, a row containing a missing value + in any of the columns (before concatenation) + will have a missing value in the result. Returns ------- @@ -441,7 +459,7 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join(self, sep): + def join(self, sep) -> ParentType: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -453,7 +471,9 @@ def join(self, sep): "Columns of arrays / lists are not yet " "supported" ) - def extract(self, pat, flags=0, expand=True): + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -517,7 +537,14 @@ def extract(self, pat, flags=0, expand=True): else: return self._return_or_inplace(out, expand=expand) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def contains( + self, + pat: Union[str, Sequence], + case: bool = True, + flags: int = 0, + na=np.nan, + regex: bool = True, + ) -> ParentType: """ Test if pattern or regex is contained within a string of a Series or Index. @@ -646,7 +673,15 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): ) return self._return_or_inplace(result_col) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def replace( + self, + pat: Union[str, Sequence], + repl: Union[str, Sequence], + n: int = -1, + case=None, + flags: int = 0, + regex: bool = True, + ) -> ParentType: """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to `str.replace() @@ -748,7 +783,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): ), ) - def replace_with_backrefs(self, pat, repl): + def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: """ Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. @@ -778,7 +813,9 @@ def replace_with_backrefs(self, pat, repl): cpp_replace_with_backrefs(self._column, pat, repl) ) - def slice(self, start=None, stop=None, step=None): + def slice( + self, start: int = None, stop: int = None, step: int = None + ) -> ParentType: """ Slice substrings from each element in the Series or Index. @@ -847,7 +884,7 @@ def slice(self, start=None, stop=None, step=None): cpp_slice_strings(self._column, start, stop, step), ) - def isinteger(self): + def isinteger(self) -> ParentType: """ Check whether all characters in each string form integer. @@ -907,7 +944,7 @@ def isinteger(self): """ return self._return_or_inplace(cpp_is_integer(self._column)) - def ishex(self): + def ishex(self) -> ParentType: """ Check whether all characters in each string form a hex integer. @@ -946,7 +983,7 @@ def ishex(self): """ return self._return_or_inplace(str_cast.is_hex(self._column)) - def istimestamp(self, format): + def istimestamp(self, format: str) -> ParentType: """ Check whether all characters in each string can be converted to a timestamp using the given format. @@ -970,7 +1007,7 @@ def istimestamp(self, format): str_cast.istimestamp(self._column, format) ) - def isfloat(self): + def isfloat(self) -> ParentType: """ Check whether all characters in each string form floating value. @@ -1033,7 +1070,7 @@ def isfloat(self): """ return self._return_or_inplace(cpp_is_float(self._column)) - def isdecimal(self): + def isdecimal(self) -> ParentType: """ Check whether all characters in each string are decimal. 
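Since `isdecimal` now declares a `ParentType` return, the distinction it draws against the sibling predicates is worth a quick sketch (values are illustrative; expected results follow the usual Unicode definitions, with `isdecimal` the strictest of the family):

import cudf

s = cudf.Series(["123", "3.14", "²", ""])
s.str.isdecimal()  # expected: [True, False, False, False]; plain decimal digits only
s.str.isdigit()    # expected: [True, False, True, False]; also accepts digit-like chars such as superscripts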
@@ -1094,7 +1131,7 @@ def isdecimal(self): """ return self._return_or_inplace(cpp_is_decimal(self._column)) - def isalnum(self): + def isalnum(self) -> ParentType: """ Check whether all characters in each string are alphanumeric. @@ -1163,7 +1200,7 @@ def isalnum(self): """ return self._return_or_inplace(cpp_is_alnum(self._column)) - def isalpha(self): + def isalpha(self) -> ParentType: """ Check whether all characters in each string are alphabetic. @@ -1219,7 +1256,7 @@ def isalpha(self): """ return self._return_or_inplace(cpp_is_alpha(self._column)) - def isdigit(self): + def isdigit(self) -> ParentType: """ Check whether all characters in each string are digits. @@ -1281,7 +1318,7 @@ def isdigit(self): """ return self._return_or_inplace(cpp_is_digit(self._column)) - def isnumeric(self): + def isnumeric(self) -> ParentType: """ Check whether all characters in each string are numeric. @@ -1349,7 +1386,7 @@ def isnumeric(self): """ return self._return_or_inplace(cpp_is_numeric(self._column)) - def isupper(self): + def isupper(self) -> ParentType: """ Check whether all characters in each string are uppercase. @@ -1406,7 +1443,7 @@ def isupper(self): """ return self._return_or_inplace(cpp_is_upper(self._column)) - def islower(self): + def islower(self) -> ParentType: """ Check whether all characters in each string are lowercase. @@ -1463,7 +1500,7 @@ def islower(self): """ return self._return_or_inplace(cpp_is_lower(self._column)) - def isipv4(self): + def isipv4(self) -> ParentType: """ Check whether all characters in each string form an IPv4 address. @@ -1487,7 +1524,7 @@ def isipv4(self): """ return self._return_or_inplace(str_cast.is_ipv4(self._column)) - def lower(self): + def lower(self) -> ParentType: """ Converts all characters to lowercase. @@ -1526,7 +1563,7 @@ def lower(self): """ return self._return_or_inplace(cpp_to_lower(self._column)) - def upper(self): + def upper(self) -> ParentType: """ Convert each string to uppercase. This only applies to ASCII characters at this time. @@ -1575,7 +1612,7 @@ def upper(self): """ return self._return_or_inplace(cpp_to_upper(self._column)) - def capitalize(self): + def capitalize(self) -> ParentType: """ Convert strings in the Series/Index to be capitalized. This only applies to ASCII characters at this time. @@ -1603,7 +1640,7 @@ def capitalize(self): """ return self._return_or_inplace(cpp_capitalize(self._column)) - def swapcase(self): + def swapcase(self) -> ParentType: """ Change each lowercase character to uppercase and vice versa. This only applies to ASCII characters at this time. @@ -1648,7 +1685,7 @@ def swapcase(self): """ return self._return_or_inplace(cpp_swapcase(self._column)) - def title(self): + def title(self) -> ParentType: """ Uppercase the first letter of each letter after a space and lowercase the rest. @@ -1693,7 +1730,9 @@ def title(self): """ return self._return_or_inplace(cpp_title(self._column)) - def filter_alphanum(self, repl=None, keep=True): + def filter_alphanum( + self, repl: str = None, keep: bool = True + ) -> ParentType: """ Remove non-alphanumeric characters from strings in this column. @@ -1728,7 +1767,9 @@ def filter_alphanum(self, repl=None, keep=True): cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), ) - def slice_from(self, starts, stops): + def slice_from( + self, starts: "cudf.Series", stops: "cudf.Series" + ) -> ParentType: """ Return substring of each string using positions for each string. 
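`slice_from` is one of the few accessors that takes columnar arguments, which is why `starts` and `stops` are now annotated as `cudf.Series`; a small sketch with assumed values:

import cudf

s = cudf.Series(["hello", "there"])
starts = cudf.Series([1, 2])
stops = cudf.Series([4, 5])
# Each row is sliced with its own [start, stop) pair:
# "hello"[1:4] -> "ell", "there"[2:5] -> "ere"
out = s.str.slice_from(starts=starts, stops=stops)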
@@ -1771,7 +1812,9 @@ def slice_from(self, starts, stops): ), ) - def slice_replace(self, start=None, stop=None, repl=None): + def slice_replace( + self, start: int = None, stop: int = None, repl: str = None + ) -> ParentType: """ Replace the specified section of each string with a new string. @@ -1856,7 +1899,7 @@ def slice_replace(self, start=None, stop=None, repl=None): cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), ) - def insert(self, start=0, repl=None): + def insert(self, start: int = 0, repl: str = None) -> ParentType: """ Insert the specified string into each string in the specified position. @@ -1906,7 +1949,7 @@ def insert(self, start=0, repl=None): cpp_string_insert(self._column, start, cudf.Scalar(repl)), ) - def get(self, i=0): + def get(self, i: int = 0) -> ParentType: """ Extract element from each component at specified position. @@ -1950,7 +1993,9 @@ def get(self, i=0): return self._return_or_inplace(cpp_string_get(self._column, i)) - def split(self, pat=None, n=-1, expand=None): + def split( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2079,14 +2124,14 @@ def split(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_split( self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2094,7 +2139,9 @@ def split(self, pat=None, n=-1, expand=None): return self._return_or_inplace(result_table, expand=expand) - def rsplit(self, pat=None, n=-1, expand=None): + def rsplit( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2232,18 +2279,18 @@ def rsplit(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) return self._return_or_inplace(result_table, expand=expand) - def partition(self, sep=" ", expand=True): + def partition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the first occurrence of sep. @@ -2323,7 +2370,7 @@ def partition(self, sep=" ", expand=True): cpp_partition(self._column, cudf.Scalar(sep)), expand=expand ) - def rpartition(self, sep=" ", expand=True): + def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the last occurrence of sep. @@ -2387,7 +2434,9 @@ def rpartition(self, sep=" ", expand=True): cpp_rpartition(self._column, cudf.Scalar(sep)), expand=expand ) - def pad(self, width, side="left", fillchar=" "): + def pad( + self, width: int, side: str = "left", fillchar: str = " " + ) -> ParentType: """ Pad strings in the Series/Index up to width. 
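For reference, the now-annotated `width`/`side`/`fillchar` parameters of `pad` behave like their pandas counterparts; a short sketch with illustrative values:

import cudf

s = cudf.Series(["a", "bb", "ccc"])
# side="left" prepends fillchar until each string reaches the target width
padded = s.str.pad(width=4, side="left", fillchar="0")
# expected: ["000a", "00bb", "0ccc"]; side="both" would center instead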
@@ -2472,7 +2521,7 @@ def pad(self, width, side="left", fillchar=" "): cpp_pad(self._column, width, fillchar, side) ) - def zfill(self, width): + def zfill(self, width: int) -> ParentType: """ Pad strings in the Series/Index by prepending ‘0’ characters. @@ -2545,7 +2594,7 @@ def zfill(self, width): return self._return_or_inplace(cpp_zfill(self._column, width)) - def center(self, width, fillchar=" "): + def center(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left and right side of strings in the Series/Index with an additional character. @@ -2617,7 +2666,7 @@ def center(self, width, fillchar=" "): cpp_center(self._column, width, fillchar) ) - def ljust(self, width, fillchar=" "): + def ljust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling right side of strings in the Series/Index with an additional character. Equivalent to `str.ljust() @@ -2671,7 +2720,7 @@ def ljust(self, width, fillchar=" "): cpp_ljust(self._column, width, fillchar) ) - def rjust(self, width, fillchar=" "): + def rjust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left side of strings in the Series/Index with an additional character. Equivalent to `str.rjust() @@ -2725,7 +2774,7 @@ def rjust(self, width, fillchar=" "): cpp_rjust(self._column, width, fillchar) ) - def strip(self, to_strip=None): + def strip(self, to_strip: str = None) -> ParentType: """ Remove leading and trailing characters. @@ -2784,7 +2833,7 @@ def strip(self, to_strip=None): cpp_strip(self._column, cudf.Scalar(to_strip)) ) - def lstrip(self, to_strip=None): + def lstrip(self, to_strip: str = None) -> ParentType: """ Remove leading and trailing characters. @@ -2831,7 +2880,7 @@ def lstrip(self, to_strip=None): cpp_lstrip(self._column, cudf.Scalar(to_strip)) ) - def rstrip(self, to_strip=None): + def rstrip(self, to_strip: str = None) -> ParentType: """ Remove leading and trailing characters. @@ -2886,7 +2935,7 @@ def rstrip(self, to_strip=None): cpp_rstrip(self._column, cudf.Scalar(to_strip)) ) - def wrap(self, width, **kwargs): + def wrap(self, width: int, **kwargs) -> ParentType: """ Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. @@ -2980,7 +3029,7 @@ def wrap(self, width, **kwargs): return self._return_or_inplace(cpp_wrap(self._column, width)) - def count(self, pat, flags=0): + def count(self, pat: str, flags: int = 0) -> ParentType: """ Count occurrences of pattern in each string of the Series/Index. @@ -3040,7 +3089,9 @@ def count(self, pat, flags=0): return self._return_or_inplace(cpp_count_re(self._column, pat)) - def findall(self, pat, flags=0, expand=True): + def findall( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Find all occurrences of pattern or regular expression in the Series/Index. @@ -3108,7 +3159,7 @@ def findall(self, pat, flags=0, expand=True): cpp_findall(self._column, pat), expand=expand ) - def isempty(self): + def isempty(self) -> ParentType: """ Check whether each string is an empty string. @@ -3128,9 +3179,9 @@ def isempty(self): 4 False dtype: bool """ - return self._return_or_inplace((self._parent == "").fillna(False)) + return self._return_or_inplace((self._column == "").fillna(False)) - def isspace(self): + def isspace(self) -> ParentType: """ Check whether all characters in each string are whitespace. 
@@ -3186,7 +3237,7 @@ def isspace(self): """ return self._return_or_inplace(cpp_isspace(self._column)) - def endswith(self, pat): + def endswith(self, pat: str) -> ParentType: """ Test if the end of each string element matches a pattern. @@ -3240,7 +3291,7 @@ def endswith(self, pat): return self._return_or_inplace(result_col) - def startswith(self, pat): + def startswith(self, pat: Union[str, Sequence]) -> ParentType: """ Test if the start of each string element matches a pattern. @@ -3300,7 +3351,7 @@ def startswith(self, pat): return self._return_or_inplace(result_col) - def find(self, sub, start=0, end=None): + def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3355,7 +3406,7 @@ def find(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def rfind(self, sub, start=0, end=None): + def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3414,7 +3465,7 @@ def rfind(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def index(self, sub, start=0, end=None): + def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each strings where the substring is fully contained between ``[start:end]``. This is the same @@ -3474,7 +3525,7 @@ def index(self, sub, start=0, end=None): else: return result - def rindex(self, sub, start=0, end=None): + def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each strings where the substring is fully contained between ``[start:end]``. This is the same @@ -3534,7 +3585,7 @@ def rindex(self, sub, start=0, end=None): else: return result - def match(self, pat, case=True, flags=0): + def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: """ Determine if each string matches a regular expression. @@ -3579,7 +3630,7 @@ def match(self, pat, case=True, flags=0): return self._return_or_inplace(cpp_match_re(self._column, pat)) - def url_decode(self): + def url_decode(self) -> ParentType: """ Returns a URL-decoded format of each string. No format checking is performed. All characters @@ -3609,7 +3660,7 @@ def url_decode(self): return self._return_or_inplace(cpp_url_decode(self._column)) - def url_encode(self): + def url_encode(self) -> ParentType: """ Returns a URL-encoded format of each string. No format checking is performed. @@ -3640,7 +3691,7 @@ def url_encode(self): """ return self._return_or_inplace(cpp_url_encode(self._column)) - def code_points(self): + def code_points(self) -> ParentType: """ Returns an array by filling it with the UTF-8 code point values for each character of each string. @@ -3673,14 +3724,14 @@ def code_points(self): """ new_col = cpp_code_points(self._column) - if self._parent is None: - return new_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(new_col, name=self._parent.name) + else: + return new_col - def translate(self, table): + def translate(self, table: dict) -> ParentType: """ Map all characters in the string through the given mapping table. 
@@ -3723,7 +3774,9 @@ def translate(self, table): table = str.maketrans(table) return self._return_or_inplace(cpp_translate(self._column, table)) - def filter_characters(self, table, keep=True, repl=None): + def filter_characters( + self, table: dict, keep: bool = True, repl: str = None + ) -> ParentType: """ Remove characters from each string using the character ranges in the given mapping table. @@ -3774,7 +3827,7 @@ def filter_characters(self, table, keep=True, repl=None): ), ) - def normalize_spaces(self): + def normalize_spaces(self) -> ParentType: """ Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -3794,7 +3847,7 @@ def normalize_spaces(self): """ return self._return_or_inplace(cpp_normalize_spaces(self._column)) - def normalize_characters(self, do_lower=True): + def normalize_characters(self, do_lower: bool = True) -> ParentType: """ Normalizes strings characters for tokenizing. @@ -3843,7 +3896,7 @@ def normalize_characters(self, do_lower=True): cpp_normalize_characters(self._column, do_lower) ) - def tokenize(self, delimiter=" "): + def tokenize(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter(s). The sequence returned contains the tokens in the order @@ -3890,7 +3943,9 @@ def tokenize(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def detokenize(self, indices, separator=" "): + def detokenize( + self, indices: "cudf.Series", separator: str = " " + ) -> ParentType: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -3898,7 +3953,7 @@ def detokenize(self, indices, separator=" "): Parameters ---------- - indices : list of ints + indices : Series Each value identifies the output row for the corresponding token. separator : str The string concatenated between each token in an output row. @@ -3925,7 +3980,7 @@ def detokenize(self, indices, separator=" "): retain_index=False, ) - def character_tokenize(self): + def character_tokenize(self) -> ParentType: """ Each string is split into individual characters. The sequence returned contains each character as an individual string. @@ -3973,14 +4028,14 @@ def character_tokenize(self): dtype: object """ result_col = cpp_character_tokenize(self._column) - if self._parent is None: - return result_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(result_col, name=self._parent.name) + else: + return result_col - def token_count(self, delimiter=" "): + def token_count(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter. The returned integer sequence is the number of tokens in each string. @@ -4022,7 +4077,7 @@ def token_count(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def ngrams(self, n=2, separator="_"): + def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ Generate the n-grams from a set of tokens, each record in series is treated a token. @@ -4059,7 +4114,7 @@ def ngrams(self, n=2, separator="_"): cpp_generate_ngrams(self._column, n, separator), retain_index=False ) - def character_ngrams(self, n=2): + def character_ngrams(self, n: int = 2) -> ParentType: """ Generate the n-grams from characters in a column of strings. 
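The `detokenize` docstring fix above matters in practice: `indices` is a cudf Series of output-row ids, not a host list of ints. A usage sketch of the documented semantics (output shown as expected, not verified here):

```python
import cudf

tokens = cudf.Series(["hello", "world", "good", "bye"])
indices = cudf.Series([0, 0, 1, 1], dtype="int32")  # output row per token
print(tokens.str.detokenize(indices))
# 0    hello world
# 1    good bye
# dtype: object
```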
@@ -4095,7 +4150,9 @@ def character_ngrams(self, n=2): cpp_generate_character_ngrams(self._column, n), retain_index=False ) - def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): + def ngrams_tokenize( + self, n: int = 2, delimiter: str = " ", separator: str = "_" + ) -> ParentType: """ Generate the n-grams using tokens from each string. This will tokenize each string and then generate ngrams for each @@ -4131,7 +4188,9 @@ def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): retain_index=False, ) - def replace_tokens(self, targets, replacements, delimiter=None): + def replace_tokens( + self, targets, replacements, delimiter: str = None + ) -> ParentType: """ The targets tokens are searched for within each string in the series and replaced with the corresponding replacements if found. @@ -4213,8 +4272,11 @@ def replace_tokens(self, targets, replacements, delimiter=None): ) def filter_tokens( - self, min_token_length, replacement=None, delimiter=None - ): + self, + min_token_length: int, + replacement: str = None, + delimiter: str = None, + ) -> ParentType: """ Remove tokens from within each string in the series that are smaller than min_token_length and optionally replace them @@ -4282,13 +4344,13 @@ def filter_tokens( def subword_tokenize( self, - hash_file, - max_length=64, - stride=48, - do_lower=True, - do_truncate=False, - max_rows_tensor=500, - ): + hash_file: str, + max_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + max_rows_tensor: int = 500, + ) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]: """ Run CUDA BERT subword tokenizer on cuDF strings column. Encodes words to token ids using vocabulary from a pretrained @@ -4337,12 +4399,12 @@ def subword_tokenize( Returns ------- - token-ids : Column + token-ids : cupy.ndarray The token-ids for each string padded with 0s to max_length. - attention-mask : Column + attention-mask : cupy.ndarray The mask for token-ids result where corresponding positions identify valid token-id values. - metadata : Column + metadata : cupy.ndarray Each row contains the index id of the original string and the first and last index of the token-ids that are non-padded and non-overlapping. @@ -4383,7 +4445,7 @@ def subword_tokenize( cupy.asarray(metadata), ) - def porter_stemmer_measure(self): + def porter_stemmer_measure(self) -> ParentType: """ Compute the Porter Stemmer measure for each string. The Porter Stemmer algorithm is described `here @@ -4406,7 +4468,7 @@ def porter_stemmer_measure(self): cpp_porter_stemmer_measure(self._column) ) - def is_consonant(self, position): + def is_consonant(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a consonant. The ``position`` parameter may also be a list of integers @@ -4450,7 +4512,7 @@ def is_consonant(self, position): cpp_is_letter(self._column, ltype, position) ) - def is_vowel(self, position): + def is_vowel(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a vowel -- not a consonant. The ``position`` parameter may also be @@ -4494,7 +4556,7 @@ def is_vowel(self, position): cpp_is_letter(self._column, ltype, position) ) - def edit_distance(self, targets): + def edit_distance(self, targets) -> ParentType: """ The ``targets`` strings are measured against the strings in this instance using the Levenshtein edit distance algorithm. 
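`subword_tokenize` gets the most meaningful annotation change in this stretch: the docstring previously promised `Column`s, but the function returns device arrays. A hypothetical call, with `"vocab-hash.txt"` standing in for a real pretrained-vocabulary hash file:

```python
import cudf

ser = cudf.Series(["this is the", "best book"])
tokens, masks, metadata = ser.str.subword_tokenize(
    "vocab-hash.txt",  # placeholder path, not shipped with this patch
    max_length=32,
    stride=32,
    max_rows_tensor=2,
)
tokens = tokens.reshape(-1, 32)  # a cupy.ndarray: one row per input string
```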
@@ -4576,8 +4638,17 @@ class StringColumn(column.ColumnBase): """Implements operations for Columns of String type """ + _start_offset: Optional[int] + _end_offset: Optional[int] + _cached_sizeof: Optional[int] + def __init__( - self, mask=None, size=None, offset=0, null_count=None, children=() + self, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] = (), ): """ Parameters @@ -4627,34 +4698,38 @@ def __init__( self._end_offset = None @property - def start_offset(self): + def start_offset(self) -> int: if self._start_offset is None: if ( len(self.base_children) == 2 and self.offset < self.base_children[0].size ): - self._start_offset = int(self.base_children[0][self.offset]) + self._start_offset = int( + self.base_children[0].element_indexing(self.offset) + ) else: self._start_offset = 0 return self._start_offset @property - def end_offset(self): + def end_offset(self) -> int: if self._end_offset is None: if ( len(self.base_children) == 2 and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( - self.base_children[0][self.offset + self.size] + self.base_children[0].element_indexing( + self.offset + self.size + ) ) else: self._end_offset = 0 return self._end_offset - def __sizeof__(self): + def __sizeof__(self) -> int: if self._cached_sizeof is None: n = 0 if len(self.base_children) == 2: @@ -4676,7 +4751,7 @@ def __sizeof__(self): return self._cached_sizeof @property - def base_size(self): + def base_size(self) -> int: if len(self.base_children) == 0: return 0 else: @@ -4685,7 +4760,13 @@ def base_size(self): / self.base_children[0].dtype.itemsize ) - def sum(self, skipna=None, dtype=None, min_count=0): + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + raise ValueError("Cannot get an array view of a StringColumn") + + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -4703,39 +4784,38 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) - def set_base_children(self, value): + def set_base_children(self, value: Tuple["column.ColumnBase", ...]): # TODO: Implement dtype validation of the children here somehow super().set_base_children(value) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: return True in self.str().contains(f"^{item}$") - def str(self, parent=None): + def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: builtins.str): raise TypeError( f"Series of dtype `str` cannot perform the operation: " f"{unaryop}" ) - def __len__(self): + def __len__(self) -> int: return self.size - def _set_mask(self, value): - super()._set_mask(value) - @property - def _nbytes(self): + def _nbytes(self) -> int: if self.size == 0: return 0 else: return self.children[1].size - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": out_dtype = np.dtype(dtype) if out_dtype.kind in {"i", "u"}: @@ -4775,42 +4855,49 @@ def _as_datetime_or_timedelta_column(self, dtype, format): return result_col - def as_datetime_column(self, dtype, format=None): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) 
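`element_indexing` is the internal single-row device read that `col[i]` used to spell; switching to it sidesteps the `__getitem__` overloads mypy would otherwise have to resolve. A sketch of the offsets arithmetic on a small column, using internal APIs purely for illustration:

```python
import cudf

col = cudf.Series(["ab", "cde", ""])._column  # internal API, illustration
offsets = col.base_children[0]                # int32 offsets: 0, 2, 5, 5
print(offsets.element_indexing(0))            # 0 -> start_offset
print(offsets.element_indexing(len(col)))     # 5 -> end_offset
```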
-> "cudf.core.column.DatetimeColumn": out_dtype = np.dtype(dtype) + # infer on host from the first not na element + # or return all null column if all values + # are null in current column + format = kwargs.get("format", None) if format is None: - # infer on host from the first not na element - # or return all null column if all values - # are null in current column if self.null_count == len(self): - return column.column_empty( - len(self), dtype=out_dtype, masked=True + return cast( + "cudf.core.column.DatetimeColumn", + column.column_empty( + len(self), dtype=out_dtype, masked=True + ), ) else: - format = datetime.infer_format(self[self.notna()][0]) + format = datetime.infer_format( + self.apply_boolean_mask(self.notna()).element_indexing(0) + ) return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_timedelta_column(self, dtype, format=None): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": out_dtype = np.dtype(dtype) - - if format is None: - format = "%D days %H:%M:%S" - + format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_string_column(self, dtype): + def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: return self @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the StringColumn. """ return self.to_pandas().values @property - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. """ @@ -4818,7 +4905,7 @@ def values(self): "String Arrays is not yet implemented in cudf" ) - def to_array(self, fillna=None): + def to_array(self, fillna: bool = None) -> np.ndarray: """Get a dense numpy array for the data. 
Notes @@ -4851,8 +4938,8 @@ def __arrow_array__(self, type=None): "consider using .to_arrow()" ) - def serialize(self): - header = {"null_count": self.null_count} + def serialize(self) -> Tuple[dict, list]: + header = {"null_count": self.null_count} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["size"] = self.size @@ -4872,7 +4959,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> StringColumn: size = header["size"] if not isinstance(size, int): size = pickle.loads(size) @@ -4880,26 +4967,28 @@ def deserialize(cls, header, frames): # Deserialize the mask, value, and offset frames buffers = [Buffer(each_frame) for each_frame in frames] + nbuf = None if header["null_count"] > 0: nbuf = buffers[2] - else: - nbuf = None children = [] for h, b in zip(header["subheaders"], buffers[:2]): column_type = pickle.loads(h["type-serialized"]) children.append(column_type.deserialize(h, [b])) - col = column.build_column( - data=None, - dtype="str", - mask=nbuf, - children=tuple(children), - size=size, + col = cast( + StringColumn, + column.build_column( + data=None, + dtype="str", + mask=nbuf, + children=tuple(children), + size=size, + ), ) return col - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = np.dtype(to_dtype) if self.dtype == to_dtype: @@ -4911,7 +5000,12 @@ def can_cast_safely(self, to_dtype): else: return True - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> StringColumn: """ Return col with *to_replace* replaced with *value* """ @@ -4919,7 +5013,12 @@ def find_and_replace(self, to_replace, replacement, all_nan): replacement = column.as_column(replacement, dtype=self.dtype) return libcudf.replace.replace(self, to_replace, replacement) - def fillna(self, fill_value=None, method=None): + def fillna( + self, + fill_value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) @@ -4927,24 +5026,26 @@ def fillna(self, fill_value=None, method=None): else: return super().fillna(method=method) - def _find_first_and_last(self, value): + def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: found_indices = self.str().contains(f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) return first, last - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: return self._find_first_and_last(value)[0] - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return self._find_first_and_last(value)[1] - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> "column.ColumnBase": # fastpath: gpu scalar if isinstance(other, cudf.Scalar) and other.dtype == "object": return column.as_column(other, length=len(self)) - if isinstance(other, column.Column): + if isinstance(other, column.ColumnBase): return other.astype(self.dtype) elif isinstance(other, str) or other is None: col = utils.scalar_broadcast_to( @@ 
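The type comment on `header` above is load-bearing: from the bare literal mypy infers `Dict[str, int]` and then rejects the byte-string values assigned on the next lines. The failure mode and the fix, reduced:

```python
from typing import Any, Dict

header = {"null_count": 3}  # type: Dict[Any, Any]
header["type-serialized"] = b"\x80\x04"  # ok only with the wide annotation
print(sorted(header))  # ['null_count', 'type-serialized']
```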
-4959,16 +5060,18 @@ def normalize_binop_value(self, other): else: raise TypeError(f"cannot broadcast {type(other)}") - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return None - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: builtins.str, rhs, reflect: bool = False + ) -> "column.ColumnBase": lhs = self if reflect: lhs, rhs = rhs, lhs if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": - return lhs.str().cat(others=rhs) + return cast("column.ColumnBase", lhs.str().cat(others=rhs)) elif op in ("eq", "ne", "gt", "lt", "ge", "le"): return _string_column_binop(self, rhs, op=op, out_dtype="bool") @@ -4977,7 +5080,7 @@ def binary_operator(self, op, rhs, reflect=False): ) @property - def is_unique(self): + def is_unique(self) -> bool: return len(self.unique()) == len(self) @property @@ -4986,19 +5089,17 @@ def __cuda_array_interface__(self): "Strings are not yet supported via `__cuda_array_interface__`" ) - def _mimic_inplace(self, other_col, inplace=False): - out = super()._mimic_inplace(other_col, inplace=inplace) - return out - @copy_docstring(column.ColumnBase.view) - def view(self, dtype): + def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) dtype = np.dtype(dtype) - str_byte_offset = self.base_children[0][self.offset] - str_end_byte_offset = self.base_children[0][self.offset + self.size] + str_byte_offset = self.base_children[0].element_indexing(self.offset) + str_end_byte_offset = self.base_children[0].element_indexing( + self.offset + self.size + ) char_dtype_size = self.base_children[1].dtype.itemsize n_bytes_to_view = ( @@ -5016,7 +5117,12 @@ def view(self, dtype): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _string_column_binop(lhs, rhs, op, out_dtype): +def _string_column_binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: Dtype, +) -> "column.ColumnBase": out = libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype) return out diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 9036f1e2962..f797bdf9635 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,6 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
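The `view` hunk swaps two child `__getitem__` reads for `element_indexing`, leaving the byte arithmetic intact. That arithmetic worked through by hand, assuming a column of two 4-byte strings viewed as `int32`:

```python
import numpy as np

offsets = [0, 4, 8]   # offsets child of a hypothetical ["abcd", "efgh"]
offset, size = 0, 2   # column slice: start row and row count
n_bytes = offsets[offset + size] - offsets[offset]  # 8 bytes of chars
print(n_bytes // np.dtype("int32").itemsize)        # 2 int32 elements
```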
+from __future__ import annotations + import datetime as dt from numbers import Number +from typing import Any, Sequence, Tuple, Union, cast import numpy as np import pandas as pd @@ -9,6 +12,14 @@ import cudf from cudf import _lib as libcudf +from cudf._typing import ( + BinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, +) +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype @@ -24,7 +35,13 @@ class TimeDeltaColumn(column.ColumnBase): def __init__( - self, data, dtype, size, mask=None, offset=0, null_count=None + self, + data: Buffer, + dtype: Dtype, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -46,7 +63,9 @@ def __init__( dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") - + if size is None: + size = data.size // dtype.itemsize + size = size - offset super().__init__( data, size=size, @@ -61,7 +80,7 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: DatetimeLikeScalar) -> bool: try: item = np.timedelta64(item, self._time_unit) except ValueError: @@ -71,7 +90,7 @@ def __contains__(self, item): return False return item.view("int64") in self.as_numerical - def to_arrow(self): + def to_arrow(self) -> pa.Array: mask = None if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) @@ -84,7 +103,9 @@ def to_arrow(self): null_count=self.null_count, ) - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index=None, nullable: bool = False, **kwargs + ) -> pd.Series: # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -98,8 +119,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def _binary_op_floordiv(self, rhs): - lhs, rhs = self, rhs + def _binary_op_floordiv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -122,7 +145,7 @@ def _binary_op_floordiv(self, rhs): return lhs, rhs, out_dtype - def _binary_op_mul(self, rhs): + def _binary_op_mul(self, rhs: BinaryOperand) -> DtypeObj: if rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -132,7 +155,7 @@ def _binary_op_mul(self, rhs): ) return out_dtype - def _binary_op_mod(self, rhs): + def _binary_op_mod(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(self.dtype, rhs.dtype) elif rhs.dtype.kind in ("f", "i", "u"): @@ -144,7 +167,7 @@ def _binary_op_mod(self, rhs): ) return out_dtype - def _binary_op_eq_ne(self, rhs): + def _binary_op_eq_ne(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = np.bool else: @@ -154,7 +177,7 @@ def _binary_op_eq_ne(self, rhs): ) return out_dtype - def _binary_op_lt_gt_le_ge(self, rhs): + def _binary_op_lt_gt_le_ge(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): return np.bool else: @@ -163,8 +186,10 @@ def _binary_op_lt_gt_le_ge(self, rhs): f" and {rhs.dtype}" ) - def _binary_op_truediv(self, rhs): - lhs, rhs = self, rhs + def 
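The new `size` computation in `TimeDeltaColumn.__init__` fills in the row count when the caller omits it: derive it from the buffer length, then discount the leading offset. Worked through with nanosecond timedeltas:

```python
import numpy as np

itemsize = np.dtype("timedelta64[ns]").itemsize  # 8 bytes per element
size = 32 // itemsize - 1  # 32-byte buffer, offset 1 -> 3 addressable rows
print(size)  # 3
```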
_binary_op_truediv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -187,7 +212,9 @@ def _binary_op_truediv(self, rhs): return lhs, rhs, out_dtype - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: str, rhs: BinaryOperand, reflect: bool = False + ) -> "column.ColumnBase": lhs, rhs = self, rhs if op in ("eq", "ne"): @@ -199,14 +226,14 @@ def binary_operator(self, op, rhs, reflect=False): elif op == "mod": out_dtype = self._binary_op_mod(rhs) elif op == "truediv": - lhs, rhs, out_dtype = self._binary_op_truediv(rhs) + lhs, rhs, out_dtype = self._binary_op_truediv(rhs) # type: ignore elif op == "floordiv": - lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) + lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) # type: ignore op = "truediv" elif op == "add": - out_dtype = _timedelta_binary_op_add(lhs, rhs) + out_dtype = _timedelta_add_result_dtype(lhs, rhs) elif op == "sub": - out_dtype = _timedelta_binary_op_sub(lhs, rhs) + out_dtype = _timedelta_sub_result_dtype(lhs, rhs) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " @@ -214,10 +241,11 @@ def binary_operator(self, op, rhs, reflect=False): ) if reflect: - lhs, rhs = rhs, lhs + lhs, rhs = rhs, lhs # type: ignore + return binop(lhs, rhs, op=op, out_dtype=out_dtype) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> BinaryOperand: if isinstance(other, cudf.Scalar): return other @@ -247,30 +275,34 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ return np.timedelta64("nat", self.time_unit) @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> TimeDeltaColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) - col = self + col = self # type: column.ColumnBase if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): dtype = determine_out_dtype(self.dtype, fill_value.dtype) @@ -280,51 +312,61 @@ def fillna(self, fill_value=None, method=None): fill_value = cudf.Scalar(fill_value, dtype=dtype) else: fill_value = column.as_column(fill_value, nan_as_null=False) - - return ColumnBase.fillna(col, fill_value) + return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) else: return super().fillna(method=method) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_datetime_column(self, dtype, 
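The `# type: ignore` on `lhs, rhs = rhs, lhs` exists because mypy pins each name to the type of its first binding, so a deliberate type-changing swap under `reflect=True` is reported even though it is correct. In miniature:

```python
reflect = True
lhs, rhs = 1, "x"
if reflect:
    lhs, rhs = rhs, lhs  # type: ignore  # intentional type-changing swap
print(lhs, rhs)  # x 1
```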
**kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise TypeError( f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]" ) - def as_string_column(self, dtype, **kwargs): - - if not kwargs.get("format"): - fmt = _dtype_to_format_conversion.get( + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": + if format is None: + format = _dtype_to_format_conversion.get( self.dtype.name, "%D days %H:%M:%S" ) - kwargs["format"] = fmt if len(self) > 0: return string._timedelta_to_str_typecast_functions[ np.dtype(self.dtype) - ](self, **kwargs) + ](self, format=format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def median(self, skipna=None): + def median(self, skipna: bool = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -332,7 +374,9 @@ def quantile(self, q, interpolation, exact): return pd.Timedelta(result, unit=self.time_unit) return result.astype(self.dtype) - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count=0 + ) -> pd.Timedelta: if len(self) == 0: return pd.Timedelta(None, unit=self.time_unit) else: @@ -343,13 +387,15 @@ def sum(self, skipna=None, dtype=None, min_count=0): unit=self.time_unit, ) - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), unit=self.time_unit, ) - def components(self, index=None): + def components(self, index=None) -> "cudf.DataFrame": """ Return a Dataframe of the components of the Timedeltas. @@ -443,7 +489,7 @@ def components(self, index=None): ) @property - def days(self): + def days(self) -> "cudf.core.column.NumericalColumn": """ Number of days for each element. @@ -456,7 +502,7 @@ def days(self): ) @property - def seconds(self): + def seconds(self) -> "cudf.core.column.NumericalColumn": """ Number of seconds (>= 0 and less than 1 day). @@ -479,7 +525,7 @@ def seconds(self): ) @property - def microseconds(self): + def microseconds(self) -> "cudf.core.column.NumericalColumn": """ Number of microseconds (>= 0 and less than 1 second). @@ -499,7 +545,7 @@ def microseconds(self): ) @property - def nanoseconds(self): + def nanoseconds(self) -> "cudf.core.column.NumericalColumn": """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. 
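The reduction annotations above surface in the public API: `mean` on a timedelta Series hands back a pandas `Timedelta` scalar. A quick sketch of the expected (not verified here) behaviour:

```python
import cudf

s = cudf.Series([1, 3], dtype="timedelta64[s]")
print(repr(s.mean()))  # expected: Timedelta('0 days 00:00:02')
```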
@@ -524,12 +570,17 @@ def nanoseconds(self): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: DtypeObj, +) -> "cudf.core.column.ColumnBase": out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out -def determine_out_dtype(lhs_dtype, rhs_dtype): +def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): return rhs_dtype elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): @@ -538,7 +589,9 @@ def determine_out_dtype(lhs_dtype, rhs_dtype): raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") -def _timedelta_binary_op_add(lhs, rhs): +def _timedelta_add_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) elif pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -557,7 +610,9 @@ def _timedelta_binary_op_add(lhs, rhs): return out_dtype -def _timedelta_binary_op_sub(lhs, rhs): +def _timedelta_sub_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype( lhs.dtype ) and pd.api.types.is_timedelta64_dtype(rhs.dtype): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c750cc92f30..f5823528d02 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,5 +1,11 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations + import itertools +from collections import OrderedDict from collections.abc import MutableMapping +from typing import TYPE_CHECKING, Any, Tuple, Union import pandas as pd @@ -11,9 +17,22 @@ to_nested_dict, ) +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnAccessor(MutableMapping): - def __init__(self, data=None, multiindex=False, level_names=None): + + _data: "OrderedDict[Any, ColumnBase]" + multiindex: bool + _level_names: Tuple[Any, ...] 
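The `if TYPE_CHECKING:` guard above imports `ColumnBase` for annotations only, keeping a runtime import cycle out of `column_accessor`; with `from __future__ import annotations`, every annotation is a lazily-evaluated string, so the guarded name never needs to exist at runtime. In miniature, with `decimal` standing in for the cyclic module:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:               # only the type checker takes this branch
    from decimal import Decimal

def scale(x: Decimal) -> Decimal:  # fine: annotations stay unevaluated
    return x

print(scale(2))  # 2 -- the runtime never touches the Decimal name
```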
+ + def __init__( + self, + data: Union[MutableMapping, ColumnAccessor] = None, + multiindex: bool = False, + level_names=None, + ): """ Parameters ---------- @@ -33,7 +52,7 @@ def __init__(self, data=None, multiindex=False, level_names=None): if isinstance(data, ColumnAccessor): multiindex = multiindex or data.multiindex level_names = level_names or data.level_names - self._data = data + self._data = data._data self.multiindex = multiindex self._level_names = level_names @@ -44,21 +63,21 @@ def __init__(self, data=None, multiindex=False, level_names=None): def __iter__(self): return self._data.__iter__() - def __getitem__(self, key): + def __getitem__(self, key: Any) -> ColumnBase: return self._data[key] - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) self._clear_cache() - def __delitem__(self, key): + def __delitem__(self, key: Any): self._data.__delitem__(key) self._clear_cache() - def __len__(self): + def __len__(self) -> int: return len(self._data) - def __repr__(self): + def __repr__(self) -> str: data_repr = self._data.__repr__() multiindex_repr = self.multiindex.__repr__() level_names_repr = self.level_names.__repr__() @@ -70,14 +89,14 @@ def __repr__(self): ) @property - def level_names(self): + def level_names(self) -> Tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: return self._level_names @property - def nlevels(self): + def nlevels(self) -> int: if len(self._data) == 0: return 0 if not self.multiindex: @@ -86,28 +105,28 @@ def nlevels(self): return len(next(iter(self.keys()))) @property - def name(self): + def name(self) -> Any: if len(self._data) == 0: return None return self.level_names[-1] @property - def nrows(self): + def nrows(self) -> int: if len(self._data) == 0: return 0 else: return len(next(iter(self.values()))) @cached_property - def names(self): + def names(self) -> Tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self): + def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self): + def _grouped_data(self) -> MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. @@ -125,7 +144,7 @@ def _clear_cache(self): except AttributeError: pass - def to_pandas_index(self): + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. """ @@ -142,7 +161,7 @@ def to_pandas_index(self): result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name, value, loc=-1): + def insert(self, name: Any, value: Any, loc: int = -1): """ Insert column into the ColumnAccessor at the specified location. @@ -176,10 +195,10 @@ def insert(self, name, value, loc=-1): else: new_keys = self.names[:loc] + (name,) + self.names[loc:] new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = self._data.__class__(zip(new_keys, new_values),) + self._data = self._data.__class__(zip(new_keys, new_values)) self._clear_cache() - def copy(self, deep=False): + def copy(self, deep=False) -> ColumnAccessor: """ Make a copy of this ColumnAccessor. """ @@ -195,7 +214,7 @@ def copy(self, deep=False): level_names=self.level_names, ) - def select_by_label(self, key): + def select_by_label(self, key: Any) -> ColumnAccessor: """ Return a subset of this column accessor, composed of the keys specified by `key`. 
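`self._data = data._data` is the one behavioural fix in this file: constructing a ColumnAccessor from another accessor used to store the accessor object itself where a mapping of columns belonged. The shape of the bug, reduced to plain dicts with a hypothetical `Acc` class:

```python
class Acc:
    def __init__(self, data):
        if isinstance(data, Acc):
            data = data._data  # the fix; before, the Acc was stored as-is
        self._data = dict(data)

inner = Acc({"x": [1, 2]})
outer = Acc(inner)
print(outer._data)  # {'x': [1, 2]}, a mapping rather than a wrapper
```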
@@ -218,7 +237,7 @@ def select_by_label(self, key): return self._select_by_label_with_wildcard(key) return self._select_by_label_grouped(key) - def select_by_index(self, index): + def select_by_index(self, index: Any) -> ColumnAccessor: """ Return a ColumnAccessor composed of the columns specified by index. @@ -243,7 +262,7 @@ def select_by_index(self, index): data, multiindex=self.multiindex, level_names=self.level_names, ) - def set_by_label(self, key, value): + def set_by_label(self, key: Any, value: Any): """ Add (or modify) column by name. @@ -256,14 +275,14 @@ def set_by_label(self, key, value): self._data[key] = value self._clear_cache() - def _select_by_label_list_like(self, key): + def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: return self.__class__( to_flat_dict({k: self._grouped_data[k] for k in key}), multiindex=self.multiindex, level_names=self.level_names, ) - def _select_by_label_grouped(self, key): + def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result = self._grouped_data[key] if isinstance(result, cudf.core.column.ColumnBase): return self.__class__({key: result}) @@ -277,7 +296,7 @@ def _select_by_label_grouped(self, key): level_names=self.level_names[len(key) :], ) - def _select_by_label_slice(self, key): + def _select_by_label_slice(self, key: slice) -> ColumnAccessor: start, stop = key.start, key.stop if key.step is not None: raise TypeError("Label slicing with step is not supported") @@ -303,7 +322,7 @@ def _select_by_label_slice(self, key): level_names=self.level_names, ) - def _select_by_label_with_wildcard(self, key): + def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor: key = self._pad_key(key, slice(None)) return self.__class__( {k: self._data[k] for k in self._data if _compare_keys(k, key)}, @@ -311,7 +330,7 @@ def _select_by_label_with_wildcard(self, key): level_names=self.level_names, ) - def _pad_key(self, key, pad_value=""): + def _pad_key(self, key: Any, pad_value="") -> Any: """ Pad the provided key to a length equal to the number of levels. @@ -323,7 +342,7 @@ def _pad_key(self, key, pad_value=""): return key + (pad_value,) * (self.nlevels - len(key)) -def _compare_keys(target, key): +def _compare_keys(target: Any, key: Any) -> bool: """ Compare `key` to `target`. 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6523b08cb27..e5626190098 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -9,6 +9,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Set import cupy import numpy as np @@ -2364,7 +2365,7 @@ def iteritems(self): for k in self: yield (k, self[k]) - @property + @property # type: ignore @annotate("DATAFRAME_LOC", color="blue", domain="cudf_python") def loc(self): """ @@ -2535,14 +2536,14 @@ def at(self): """ return self.loc - @property + @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): """Returns a tuple of columns """ return self._data.to_pandas_index() - @columns.setter + @columns.setter # type: ignore @annotate("DATAFRAME_COLUMNS_SETTER", color="yellow", domain="cudf_python") def columns(self, columns): if isinstance(columns, (cudf.MultiIndex, cudf.Index)): @@ -4229,7 +4230,6 @@ def join( ) return df - @copy_docstring(DataFrameGroupBy.__init__) def groupby( self, by=None, @@ -4274,7 +4274,6 @@ def groupby( sort=sort, ) - @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -7272,7 +7271,7 @@ def equals(self, other): return False return super().equals(other) - _accessors = set() + _accessors = set() # type: Set[Any] def from_pandas(obj, nan_as_null=None): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 1f23fcd50f8..b89b3ddb2be 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -2,6 +2,7 @@ import decimal import pickle +from typing import Any import numpy as np import pandas as pd @@ -9,10 +10,11 @@ from pandas.api.extensions import ExtensionDtype import cudf +from cudf._typing import Dtype class CategoricalDtype(ExtensionDtype): - def __init__(self, categories=None, ordered=None): + def __init__(self, categories=None, ordered: bool = None) -> None: """ dtype similar to pd.CategoricalDtype with the categories stored on the GPU. 
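`@property  # type: ignore` recurs in the dataframe hunks because mypy cannot type a property whose getter is wrapped by an untyped decorator (`annotate` here); the ignore silences that one false positive without touching runtime behaviour. A self-contained reduction with a hypothetical pass-through standing in for `nvtx.annotate`:

```python
import functools

def annotate(**_kwargs):          # untyped stand-in for nvtx.annotate
    def deco(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kw):
            return fn(*args, **kw)
        return wrapper
    return deco

class DF:
    @property  # type: ignore     # mypy flags the untyped decorator below
    @annotate(color="blue")
    def loc(self):
        return "loc-indexer"

print(DF().loc)  # loc-indexer
```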
@@ -21,7 +23,7 @@ def __init__(self, categories=None, ordered=None): self.ordered = ordered @property - def categories(self): + def categories(self) -> "cudf.core.index.Index": if self._categories is None: return cudf.core.index.as_index( cudf.core.column.column_empty(0, dtype="object", masked=False) @@ -41,23 +43,23 @@ def str(self): return "|O08" @classmethod - def from_pandas(cls, dtype): + def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) - def to_pandas(self): + def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: categories = self.categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - def _init_categories(self, categories): + def _init_categories(self, categories: Any): if categories is None: return categories if len(categories) == 0: - dtype = "object" + dtype = "object" # type: Any else: dtype = None @@ -68,7 +70,7 @@ def _init_categories(self, categories): else: return column - def __eq__(self, other): + def __eq__(self, other: Dtype) -> bool: if isinstance(other, str): return other == self.name elif other is self: @@ -111,10 +113,10 @@ def deserialize(cls, header, frames): class ListDtype(ExtensionDtype): + _typ: pa.ListType + name: str = "list" - name = "list" - - def __init__(self, element_type): + def __init__(self, element_type: Any) -> None: if isinstance(element_type, ListDtype): self._typ = pa.list_(element_type._typ) else: @@ -124,7 +126,7 @@ def __init__(self, element_type): self._typ = pa.list_(element_type) @property - def element_type(self): + def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e60c8c52944..3d12ac2e6cc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -5,6 +5,7 @@ import operator import warnings from collections import OrderedDict, abc as abc +from typing import overload import cupy import numpy as np @@ -12,6 +13,7 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal +from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -39,9 +41,23 @@ class Frame(libcudf.table.Table): """ @classmethod - def _from_table(cls, table): + def _from_table(cls, table: "Frame"): return cls(table._data, index=table._index) + @overload + def _mimic_inplace(self, result: "Frame") -> "Frame": + ... + + @overload + def _mimic_inplace(self, result: "Frame", inplace: Literal[True]): + ... + + @overload + def _mimic_inplace( + self, result: "Frame", inplace: Literal[False] + ) -> "Frame": + ... + def _mimic_inplace(self, result, inplace=False): if inplace: for col in self._data: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 219d355d3cc..e3899a403f1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,8 +1,9 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
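The `Literal` overloads above give `_mimic_inplace` a precise contract: `inplace=True` yields `None`, anything else yields a `Frame`. The pattern in isolation, using the same `typing_extensions` dependency this patch adds:

```python
from typing import overload

from typing_extensions import Literal

@overload
def mimic(result: int, inplace: Literal[True]) -> None: ...
@overload
def mimic(result: int, inplace: Literal[False] = ...) -> int: ...

def mimic(result, inplace=False):
    # single runtime implementation behind the two checker-visible shapes
    return None if inplace else result

print(mimic(7))        # 7
print(mimic(7, True))  # None
```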
-from __future__ import division, print_function +from __future__ import annotations, division, print_function import pickle from numbers import Number +from typing import Any, Dict, Set, Type import cupy import numpy as np @@ -132,6 +133,13 @@ def __init__( """ pass + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError() + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed @@ -1485,7 +1493,11 @@ def _from_table(cls, table): else: return as_index(table) - _accessors = set() + _accessors = set() # type: Set[Any] + + @property + def _constructor_expanddim(self): + return cudf.MultiIndex class RangeIndex(Index): @@ -1773,7 +1785,7 @@ def find_label_range(self, first=None, last=None): return begin, end - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2028,7 +2040,7 @@ def __getitem__(self, index): else: return res - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2705,15 +2717,11 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._values, parent=self) - @property - def _constructor_expanddim(self): - return cudf.MultiIndex - def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object @@ -2725,7 +2733,7 @@ def _clean_nulls_from_index(self): return self -def as_index(arbitrary, **kwargs): +def as_index(arbitrary, **kwargs) -> Index: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2794,7 +2802,7 @@ def as_index(arbitrary, **kwargs): np.uint64: UInt64Index, np.float32: Float32Index, np.float64: Float64Index, -} +} # type: Dict[Any, Type[Index]] _index_to_dtype = { Int8Index: np.int8, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 3872e296ed5..4ea32c77724 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -329,6 +329,9 @@ def _dispatch_scalar_unaop(self, op): return np.ceil(self.value) return getattr(self.value, op)() + def astype(self, dtype): + return Scalar(self.device_value, dtype) + class _NAType(object): def __init__(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 76d342eab2c..dfc687eb76d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5,6 +5,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size +from typing import Any, Set from uuid import uuid4 import cupy @@ -1707,17 +1708,17 @@ def __neg__(self): """ return self.__mul__(-1) - @copy_docstring(CategoricalAccessor.__init__) + @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): return CategoricalAccessor(column=self._column, parent=self) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._column, parent=self) - @copy_docstring(ListMethods.__init__) + @copy_docstring(ListMethods.__init__) # type: ignore @property def list(self): return ListMethods(column=self._column, parent=self) @@ -4444,7 +4445,7 @@ def keys(self): """ return self.index - _accessors = set() + _accessors = set() # 
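`Scalar.astype` is the lone functional addition in this stretch: it re-wraps the existing device value under a new dtype. A quick usage sketch:

```python
import cudf

s = cudf.Scalar(1)                # defaults to int64
print(s.astype("float64").dtype)  # float64
```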
type: Set[Any] truediv_int_dtype_corrections = { diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index d6e0fedf8e0..7c8455b6575 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -7,9 +7,11 @@ from pyarrow import orc as orc import cudf -from cudf import _lib as libcudf +from cudf._lib import orc as liborc from cudf.utils import ioutils -from cudf.utils.metadata import orc_column_statistics_pb2 as cs_pb2 +from cudf.utils.metadata import ( # type: ignore + orc_column_statistics_pb2 as cs_pb2, +) def _make_empty_df(filepath_or_buffer, columns): @@ -127,7 +129,7 @@ def read_orc_statistics( column_names, raw_file_statistics, raw_stripes_statistics, - ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer) + ) = liborc.read_raw_orc_statistics(filepath_or_buffer) # Parse column names column_names = [ @@ -257,7 +259,7 @@ def read_orc( if engine == "cudf": df = DataFrame._from_table( - libcudf.orc.read_orc( + liborc.read_orc( filepath_or_buffer, columns, stripes, @@ -324,9 +326,9 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.orc.write_orc(df, file_obj, compression, enable_statistics) + liborc.write_orc(df, file_obj, compression, enable_statistics) else: - libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics) + liborc.write_orc(df, path_or_buf, compression, enable_statistics) -ORCWriter = libcudf.orc.ORCWriter +ORCWriter = liborc.ORCWriter diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index bf8898825c0..2048e574acc 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,5 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + from typing import Union import numpy as np diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 62427cc593e..964e79a57b0 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. + import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index d590a3ddb52..85e61acd8e6 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,8 +1,7 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. 
-import os import datetime -import math +import os from io import BytesIO import numpy as np @@ -12,9 +11,8 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, supported_numpy_dtypes, gen_rand_series - from cudf.io.orc import ORCWriter +from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes @pytest.fixture(scope="module") @@ -565,7 +563,7 @@ def normalized_equals(value1, value2): # Compare integers with floats now if isinstance(value1, float) or isinstance(value2, float): - return math.isclose(value1, value2) + return np.isclose(value1, value2) return value1 == value2 diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index c95f408f309..656b66bf793 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -150,7 +150,7 @@ def test_serialize_groupby_df(): outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_external(): @@ -160,7 +160,7 @@ def test_serialize_groupby_external(): outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_level(): @@ -171,7 +171,7 @@ def test_serialize_groupby_level(): expect = gb.mean() outgb = gb.deserialize(*gb.serialize()) got = outgb.mean() - assert_eq(expect, got) + assert_eq(expect.sort_index(), got.sort_index()) def test_serialize_groupby_sr(): @@ -180,7 +180,7 @@ def test_serialize_groupby_sr(): outgb = gb.deserialize(*gb.serialize()) got = gb.mean() expect = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_datetime(): diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index cc580bedc08..1e8beb18234 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,5 +1,7 @@ # Copyright (c) 2018, NVIDIA CORPORATION. + import functools +from typing import Any, Dict from numba import cuda @@ -332,7 +334,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache = dict() # WeakKeyDictionary() +_cache = dict() # type: Dict[Any, Any] @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 82a51b3f9b4..c71a6dbccb1 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -2,6 +2,7 @@ import ast import datetime as dt +from typing import Any, Dict import numpy as np import six @@ -101,7 +102,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache = {} +_cache = {} # type: Dict[Any, Any] def query_compile(expr): diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg index 0b2711155d7..3067d2daafd 100644 --- a/python/cudf/setup.cfg +++ b/python/cudf/setup.cfg @@ -46,6 +46,21 @@ skip= dist __init__.py +[mypy] +ignore_missing_imports = True + +[mypy-cudf._lib.*] +ignore_errors = True + +[mypy-cudf._version] +ignore_errors = True + +[mypy-cudf.utils.metadata.orc_column_statistics_pb2] +ignore_errors = True + +[mypy-cudf.tests.*] +ignore_errors = True + [tool:pytest] addopts = --benchmark-warmup=off @@ -60,4 +75,3 @@ python_files = python_functions = bench_* test_* -
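One test change above deserves a note: swapping `math.isclose` for `np.isclose` in `normalized_equals` is not cosmetic. `np.isclose` applies an absolute tolerance (`atol=1e-08`) on top of a looser relative one and accepts NumPy scalars natively, so statistics that round-tripped through float32 stop failing the comparison. The difference in two lines:

```python
import math

import numpy as np

a = float(np.float32(0.1)) * 3  # 0.30000000447..., a float32 round-trip
b = 0.3
print(math.isclose(a, b))  # False: rel_tol=1e-09 is too strict here
print(np.isclose(a, b))    # True: rtol=1e-05 plus atol=1e-08 absorbs it
```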