diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index fd9aab038d4..c38c560b982 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -5,16 +5,16 @@ from __future__ import annotations from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase T = TypeVar("T") class Column: - _data: Optional[DeviceBufferLike] - _mask: Optional[DeviceBufferLike] - _base_data: Optional[DeviceBufferLike] - _base_mask: Optional[DeviceBufferLike] + _data: Optional[Buffer] + _mask: Optional[Buffer] + _base_data: Optional[Buffer] + _base_mask: Optional[Buffer] _dtype: DtypeObj _size: int _offset: int @@ -25,10 +25,10 @@ class Column: def __init__( self, - data: Optional[DeviceBufferLike], + data: Optional[Buffer], size: int, dtype: Dtype, - mask: Optional[DeviceBufferLike] = None, + mask: Optional[Buffer] = None, offset: int = None, null_count: int = None, children: Tuple[ColumnBase, ...] = (), @@ -40,27 +40,27 @@ class Column: @property def size(self) -> int: ... @property - def base_data(self) -> Optional[DeviceBufferLike]: ... + def base_data(self) -> Optional[Buffer]: ... @property def base_data_ptr(self) -> int: ... @property - def data(self) -> Optional[DeviceBufferLike]: ... + def data(self) -> Optional[Buffer]: ... @property def data_ptr(self) -> int: ... - def set_base_data(self, value: DeviceBufferLike) -> None: ... + def set_base_data(self, value: Buffer) -> None: ... @property def nullable(self) -> bool: ... def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[DeviceBufferLike]: ... + def base_mask(self) -> Optional[Buffer]: ... @property def base_mask_ptr(self) -> int: ... @property - def mask(self) -> Optional[DeviceBufferLike]: ... + def mask(self) -> Optional[Buffer]: ... 
@property def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Optional[DeviceBufferLike]) -> None: ... - def set_mask(self: T, value: Optional[DeviceBufferLike]) -> T: ... + def set_base_mask(self, value: Optional[Buffer]) -> None: ... + def set_mask(self: T, value: Optional[Buffer]) -> T: ... @property def null_count(self) -> int: ... @property diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 1e7f0b175bc..918d786fb83 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -8,7 +8,7 @@ import rmm import cudf import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t @@ -39,9 +39,9 @@ cdef class Column: A Column stores columnar data in device memory. A Column may be composed of: - * A *data* DeviceBufferLike + * A *data* Buffer * One or more (optional) *children* Columns - * An (optional) *mask* DeviceBufferLike representing the nullmask + * An (optional) *mask* Buffer representing the nullmask The *dtype* indicates the Column's element type. """ @@ -106,9 +106,9 @@ cdef class Column: return self.data.ptr def set_base_data(self, value): - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, Buffer): raise TypeError( - "Expected a DeviceBufferLike or None for data, " + "Expected a Buffer or None for data, " f"got {type(value).__name__}" ) @@ -155,9 +155,9 @@ cdef class Column: modify size or offset in any way, so the passed mask is expected to be compatible with the current offset. 
""" - if value is not None and not isinstance(value, DeviceBufferLike): + if value is not None and not isinstance(value, Buffer): raise TypeError( - "Expected a DeviceBufferLike or None for mask, " + "Expected a Buffer or None for mask, " f"got {type(value).__name__}" ) @@ -165,7 +165,7 @@ cdef class Column: required_size = bitmask_allocation_size_bytes(self.base_size) if value.size < required_size: error_msg = ( - "The DeviceBufferLike for mask is smaller than expected, " + "The Buffer for mask is smaller than expected, " f"got {value.size} bytes, expected {required_size} bytes." ) if self.offset > 0 or self.size < self.base_size: @@ -210,30 +210,30 @@ cdef class Column: if isinstance(value, Column): value = value.data_array_view value = cp.asarray(value).view('|u1') - mask = as_device_buffer_like(value) + mask = as_buffer(value) if mask.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) if mask.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_device(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) elif hasattr(value, "__array_interface__"): value = np.asarray(value).view("u1")[:mask_size] if value.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) elif PyObject_CheckBuffer(value): value = np.asarray(value).view("u1")[:mask_size] if value.size < required_num_bytes: raise ValueError(error_msg.format(str(value.size))) dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(value) - mask = as_device_buffer_like(dbuf) + mask = as_buffer(dbuf) else: raise TypeError( - "Expected a DeviceBufferLike object or None for mask, " + "Expected a Buffer object or None for mask, " f"got {type(value).__name__}" ) @@ -432,11 +432,11 @@ cdef class Column: cdef column_contents contents = move(c_col.get()[0].release()) data = 
DeviceBuffer.c_from_unique_ptr(move(contents.data)) - data = as_device_buffer_like(data) + data = as_buffer(data) if null_count > 0: mask = DeviceBuffer.c_from_unique_ptr(move(contents.null_mask)) - mask = as_device_buffer_like(mask) + mask = as_buffer(mask) else: mask = None @@ -461,8 +461,8 @@ cdef class Column: Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it, along with referencing an ``owner`` Python object that owns the memory lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and - make the owner of each newly created ``DeviceBufferLike`` the - respective ``DeviceBufferLike`` from the ``owner`` ``cudf.Column``. + make the owner of each newly created ``Buffer`` the respective + ``Buffer`` from the ``owner`` ``cudf.Column``. If ``owner`` is ``None``, we allocate new memory for the resulting ``cudf.Column``. """ @@ -487,18 +487,18 @@ cdef class Column: if data_ptr: if data_owner is None: - data = as_device_buffer_like( + data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, size=(size+offset) * dtype.itemsize) ) else: - data = Buffer( + data = as_buffer( data=data_ptr, size=(base_size) * dtype.itemsize, owner=data_owner ) else: - data = as_device_buffer_like( + data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, size=0) ) @@ -528,14 +528,14 @@ cdef class Column: # result: mask = None else: - mask = as_device_buffer_like( + mask = as_buffer( rmm.DeviceBuffer( ptr=mask_ptr, size=bitmask_allocation_size_bytes(size+offset) ) ) else: - mask = Buffer( + mask = as_buffer( data=mask_ptr, size=bitmask_allocation_size_bytes(base_size), owner=mask_owner diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index ed858034032..75e2d3bfbdc 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -19,7 +19,7 @@ from cudf._lib.utils cimport ( table_view_from_table, ) -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from rmm._lib.device_buffer cimport 
DeviceBuffer, device_buffer @@ -31,7 +31,7 @@ cpdef concat_masks(object columns): with nogil: c_result = move(libcudf_concatenate_masks(c_views)) c_unique_result = make_unique[device_buffer](move(c_result)) - return as_device_buffer_like( + return as_buffer( DeviceBuffer.c_from_unique_ptr(move(c_unique_result)) ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 26ec2fbcdfc..d9a7a5b8754 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -12,7 +12,7 @@ from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer import cudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, as_buffer from cudf._lib.column cimport Column @@ -721,7 +721,7 @@ cdef class _CPackedColumns: header = {} frames = [] - gpu_data = Buffer( + gpu_data = as_buffer( data=self.gpu_data_ptr, size=self.gpu_data_size, owner=self diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 976fe0e78fc..61988019c70 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -22,7 +22,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport mask_state, size_type from cudf._lib.utils cimport table_view_from_columns -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer class MaskState(Enum): @@ -52,7 +52,7 @@ def copy_bitmask(Column col): up_db = make_unique[device_buffer](move(db)) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -98,7 +98,7 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED): up_db = make_unique[device_buffer](move(db)) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -110,7 +110,7 @@ def bitmask_and(columns: list): c_result = 
move(cpp_bitmask_and(c_view)) up_db = make_unique[device_buffer](move(c_result.first)) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(dbuf) + buf = as_buffer(dbuf) return buf, c_result.second @@ -122,5 +122,5 @@ def bitmask_or(columns: list): c_result = move(cpp_bitmask_or(c_view)) up_db = make_unique[device_buffer](move(c_result.first)) dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(dbuf) + buf = as_buffer(dbuf) return buf, c_result.second diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index e1612855dae..b95bce0db58 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -5,7 +5,7 @@ from numba.np import numpy_support import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.utils import cudautils from cython.operator cimport dereference @@ -37,7 +37,7 @@ from cudf._lib.utils cimport ( def bools_to_mask(Column col): """ Given an int8 (boolean) column, compress the data from booleans to bits and - return a DeviceBufferLike + return a Buffer """ cdef column_view col_view = col.view() cdef pair[unique_ptr[device_buffer], size_type] cpp_out @@ -48,7 +48,7 @@ def bools_to_mask(Column col): up_db = move(cpp_out.first) rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_device_buffer_like(rmm_db) + buf = as_buffer(rmm_db) return buf @@ -57,9 +57,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.buffer.DeviceBufferLike): + if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " - 
"cudf.core.buffer.DeviceBufferLike") + "cudf.core.buffer.Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result @@ -84,7 +84,7 @@ def nans_to_nulls(Column input): return None buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) - buffer = as_device_buffer_like(buffer) + buffer = as_buffer(buffer) return buffer diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 989d12caca0..5f4d3e17fbc 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -340,8 +340,8 @@ cdef data_from_table_view( along with referencing an ``owner`` Python object that owns the memory lifetime. If ``owner`` is a Frame we reach inside of it and reach inside of each ``cudf.Column`` to make the owner of each newly - created ``DeviceBufferLike`` underneath the ``cudf.Column`` objects of the - created Frame the respective ``DeviceBufferLike`` from the relevant + created ``Buffer`` underneath the ``cudf.Column`` objects of the + created Frame the respective ``Buffer`` from the relevant ``cudf.Column`` of the ``owner`` Frame """ cdef size_type column_idx = 0 diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index dcbf96313a7..1c8874a2abd 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,20 +1,10 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import sys - -import rmm +import pickle import cudf -if sys.version_info < (3, 8): - try: - import pickle5 as pickle - except ImportError: - import pickle # type: ignore -else: - import pickle # type: ignore - class Serializable: """A serializable object composed of device memory buffers. @@ -90,14 +80,14 @@ def device_serialize(self): header : dict The metadata required to reconstruct the object. frames : list - The DeviceBufferLike or memoryview objects that the object + The Buffer or memoryview objects that the object should contain. 
:meta private: """ header, frames = self.serialize() assert all( - isinstance(f, (cudf.core.buffer.DeviceBufferLike, memoryview)) + isinstance(f, (cudf.core.buffer.Buffer, memoryview)) for f in frames ) header["type-serialized"] = pickle.dumps(type(self)) @@ -132,18 +122,10 @@ def device_deserialize(cls, header, frames): """ typ = pickle.loads(header["type-serialized"]) frames = [ - cudf.core.buffer.as_device_buffer_like(f) if c else memoryview(f) + cudf.core.buffer.as_buffer(f) if c else memoryview(f) for c, f in zip(header["is-cuda"], frames) ] - assert all( - (type(f._owner) is rmm.DeviceBuffer) - if c - else (type(f) is memoryview) - for c, f in zip(header["is-cuda"], frames) - ) - obj = typ.deserialize(header, frames) - - return obj + return typ.deserialize(header, frames) def host_serialize(self): """Serialize data and metadata associated with host memory. @@ -186,7 +168,7 @@ def host_deserialize(cls, header, frames): :meta private: """ frames = [ - rmm.DeviceBuffer.to_device(f) if c else f + cudf.core.buffer.as_buffer(f) if c else f for c, f in zip(header["is-cuda"], map(memoryview, frames)) ] obj = cls.device_deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py deleted file mode 100644 index 647e747e127..00000000000 --- a/python/cudf/cudf/core/buffer.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import math -import pickle -from typing import ( - Any, - Dict, - List, - Mapping, - Protocol, - Sequence, - Tuple, - Union, - runtime_checkable, -) - -import numpy as np - -import rmm - -import cudf -from cudf.core.abc import Serializable -from cudf.utils.string import format_bytes - -# Frame type for serialization and deserialization of `DeviceBufferLike` -Frame = Union[memoryview, "DeviceBufferLike"] - - -@runtime_checkable -class DeviceBufferLike(Protocol): - def __getitem__(self, key: slice) -> DeviceBufferLike: - """Create a new view of the buffer.""" - - @property - def size(self) -> int: - """Size of the buffer in bytes.""" - - @property - def nbytes(self) -> int: - """Size of the buffer in bytes.""" - - @property - def ptr(self) -> int: - """Device pointer to the start of the buffer.""" - - @property - def owner(self) -> Any: - """Object owning the memory of the buffer.""" - - @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" - - def memoryview(self) -> memoryview: - """Read-only access to the buffer through host memory.""" - - def serialize(self) -> Tuple[dict, List[Frame]]: - """Serialize the buffer into header and frames. - - The frames can be a mixture of memoryview and device-buffer-like - objects. - - Returns - ------- - Tuple[Dict, List] - The first element of the returned tuple is a dict containing any - serializable metadata required to reconstruct the object. The - second element is a list containing the device buffers and - memoryviews of the object. - """ - - @classmethod - def deserialize( - cls, header: dict, frames: List[Frame] - ) -> DeviceBufferLike: - """Generate an buffer from a serialized representation. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The device-buffer-like and memoryview buffers that the object - should contain. 
- - Returns - ------- - DeviceBufferLike - A new object that implements DeviceBufferLike. - """ - - -def as_device_buffer_like(obj: Any) -> DeviceBufferLike: - """ - Factory function to wrap `obj` in a DeviceBufferLike object. - - If `obj` isn't device-buffer-like already, a new buffer that implements - DeviceBufferLike and points to the memory of `obj` is created. If `obj` - represents host memory, it is copied to a new `rmm.DeviceBuffer` device - allocation. Otherwise, the data of `obj` is **not** copied, instead the - new buffer keeps a reference to `obj` in order to retain the lifetime - of `obj`. - - Raises ValueError if the data of `obj` isn't C-contiguous. - - Parameters - ---------- - obj : buffer-like or array-like - An object that exposes either device or host memory through - `__array_interface__`, `__cuda_array_interface__`, or the - buffer protocol. If `obj` represents host memory, data will - be copied. - - Return - ------ - DeviceBufferLike - A device-buffer-like instance that represents the device memory - of `obj`. - """ - - if isinstance(obj, DeviceBufferLike): - return obj - return Buffer(obj) - - -class Buffer(Serializable): - """ - A Buffer represents device memory. - - Usually Buffers will be created using `as_device_buffer_like(obj)`, - which will make sure that `obj` is device-buffer-like and not a `Buffer` - necessarily. - - Parameters - ---------- - data : int or buffer-like or array-like - An integer representing a pointer to device memory or a buffer-like - or array-like object. When not an integer, `size` and `owner` must - be None. - size : int, optional - Size of device memory in bytes. Must be specified if `data` is an - integer. - owner : object, optional - Python object to which the lifetime of the memory allocation is tied. - A reference to this object is kept in the returned Buffer. 
- """ - - _ptr: int - _size: int - _owner: object - - def __init__( - self, data: Union[int, Any], *, size: int = None, owner: object = None - ): - if isinstance(data, int): - if size is None: - raise ValueError( - "size must be specified when `data` is an integer" - ) - if size < 0: - raise ValueError("size cannot be negative") - self._ptr = data - self._size = size - self._owner = owner - else: - if size is not None or owner is not None: - raise ValueError( - "`size` and `owner` must be None when " - "`data` is a buffer-like object" - ) - - # `data` is a buffer-like object - buf: Any = data - if isinstance(buf, rmm.DeviceBuffer): - self._ptr = buf.ptr - self._size = buf.size - self._owner = buf - return - iface = getattr(buf, "__cuda_array_interface__", None) - if iface: - ptr, size = get_ptr_and_size(iface) - self._ptr = ptr - self._size = size - self._owner = buf - return - ptr, size = get_ptr_and_size(np.asarray(buf).__array_interface__) - buf = rmm.DeviceBuffer(ptr=ptr, size=size) - self._ptr = buf.ptr - self._size = buf.size - self._owner = buf - - def __getitem__(self, key: slice) -> Buffer: - if not isinstance(key, slice): - raise ValueError("index must be an slice") - start, stop, step = key.indices(self.size) - if step != 1: - raise ValueError("slice must be contiguous") - return self.__class__( - data=self.ptr + start, size=stop - start, owner=self.owner - ) - - @property - def size(self) -> int: - return self._size - - @property - def nbytes(self) -> int: - return self._size - - @property - def ptr(self) -> int: - return self._ptr - - @property - def owner(self) -> Any: - return self._owner - - @property - def __cuda_array_interface__(self) -> dict: - return { - "data": (self.ptr, False), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } - - def memoryview(self) -> memoryview: - host_buf = bytearray(self.size) - rmm._lib.device_buffer.copy_ptr_to_host(self.ptr, host_buf) - return memoryview(host_buf).toreadonly() - - 
def serialize(self) -> Tuple[dict, list]: - header = {} # type: Dict[Any, Any] - header["type-serialized"] = pickle.dumps(type(self)) - header["constructor-kwargs"] = {} - header["desc"] = self.__cuda_array_interface__.copy() - header["desc"]["strides"] = (1,) - header["frame_count"] = 1 - frames = [self] - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> Buffer: - assert ( - header["frame_count"] == 1 - ), "Only expecting to deserialize Buffer with a single frame." - buf = cls(frames[0], **header["constructor-kwargs"]) - - if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: - raise ValueError( - f"Received a `Buffer` with the wrong size." - f" Expected {header['desc']['shape']}, " - f"but got {buf.__cuda_array_interface__['shape']}" - ) - - return buf - - def __repr__(self) -> str: - return ( - f" bool: - """ - Determine if shape and strides are C-contiguous - - Parameters - ---------- - shape : Sequence[int] - Number of elements in each dimension. - strides : Sequence[int] - The stride of each dimension in bytes. - itemsize : int - Size of an element in bytes. - - Return - ------ - bool - The boolean answer. - """ - - if any(dim == 0 for dim in shape): - return True - cumulative_stride = itemsize - for dim, stride in zip(reversed(shape), reversed(strides)): - if dim > 1 and stride != cumulative_stride: - return False - cumulative_stride *= dim - return True - - -def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: - """ - Retrieve the pointer and size from an array interface. - - Raises ValueError if array isn't C-contiguous. - - Parameters - ---------- - array_interface : Mapping - The array interface metadata. 
- - Return - ------ - pointer : int - The pointer to device or host memory - size : int - The size in bytes - """ - - shape = array_interface["shape"] or (1,) - strides = array_interface["strides"] - itemsize = cudf.dtype(array_interface["typestr"]).itemsize - if strides is None or is_c_contiguous(shape, strides, itemsize): - nelem = math.prod(shape) - ptr = array_interface["data"][0] or 0 - return ptr, nelem * itemsize - raise ValueError("Buffer data must be C-contiguous") diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py new file mode 100644 index 00000000000..a73bc69ffb5 --- /dev/null +++ b/python/cudf/cudf/core/buffer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper +from cudf.core.buffer.utils import as_buffer diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py new file mode 100644 index 00000000000..73e589ebb8e --- /dev/null +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -0,0 +1,319 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + +from __future__ import annotations + +import math +import pickle +from types import SimpleNamespace +from typing import Any, Dict, Mapping, Sequence, Tuple, Type, TypeVar + +import numpy + +import rmm + +import cudf +from cudf.core.abc import Serializable +from cudf.utils.string import format_bytes + +T = TypeVar("T", bound="Buffer") + + +def cuda_array_interface_wrapper( + ptr: int, + size: int, + owner: object = None, + readonly=False, + typestr="|u1", + version=0, +): + """Wrap device pointer in an object that exposes `__cuda_array_interface__` + + See + + Parameters + ---------- + ptr : int + An integer representing a pointer to device memory. + size : int, optional + Size of device memory in bytes. + owner : object, optional + Python object to which the lifetime of the memory allocation is tied. 
+ A reference to this object is kept in the returned wrapper object. + readonly: bool, optional + Mark the interface read-only. + typestr: str, optional + The type string of the interface. By default this is "|u1", which + means "an unsigned integer with a not relevant byteorder". See: + + version : int, optional + The version of the interface. + + Return + ------ + SimpleNamespace + An object that exposes `__cuda_array_interface__` and keeps a reference + to `owner`. + """ + + if size < 0: + raise ValueError("size cannot be negative") + + return SimpleNamespace( + __cuda_array_interface__={ + "data": (ptr, readonly), + "shape": (size,), + "strides": None, + "typestr": typestr, + "version": version, + }, + owner=owner, + ) + + +class Buffer(Serializable): + """A Buffer represents device memory. + + Use the factory function `as_buffer` to create a Buffer instance. + """ + + _ptr: int + _size: int + _owner: object + + def __init__(self): + raise ValueError( + f"do not create a {self.__class__} directly, please " + "use the factory function `cudf.core.buffer.as_buffer`" + ) + + @classmethod + def _from_device_memory(cls: Type[T], data: Any) -> T: + """Create a Buffer from an object exposing `__cuda_array_interface__`. + + No data is being copied. + + Parameters + ---------- + data : device-buffer-like + An object implementing the CUDA Array Interface. 
+ + Returns + ------- + Buffer + Buffer representing the same device memory as `data` + """ + + # Bypass `__init__` and initialize attributes manually + ret = cls.__new__(cls) + ret._owner = data + if isinstance(data, rmm.DeviceBuffer): # Common case shortcut + ret._ptr = data.ptr + ret._size = data.size + else: + ret._ptr, ret._size = get_ptr_and_size( + data.__cuda_array_interface__ + ) + if ret.size < 0: + raise ValueError("size cannot be negative") + return ret + + @classmethod + def _from_host_memory(cls: Type[T], data: Any) -> T: + """Create a Buffer from a buffer or array like object + + Data must implement `__array_interface__`, the buffer protocol, and/or + be convertible to a buffer object using `numpy.array()` + + The host memory is copied to a new device allocation. + + Raises ValueError if array isn't C-contiguous. + + Parameters + ---------- + data : Any + An object that represents host memory. + + Returns + ------- + Buffer + Buffer representing a copy of `data`. + """ + + # Convert to numpy array, this will not copy data in most cases. + ary = numpy.array(data, copy=False, subok=True) + # Extract pointer and size + ptr, size = get_ptr_and_size(ary.__array_interface__) + # Copy to device memory + buf = rmm.DeviceBuffer(ptr=ptr, size=size) + # Create from device memory + return cls._from_device_memory(buf) + + def _getitem(self, offset: int, size: int) -> Buffer: + """ + Sub-classes can override this to implement __getitem__ + without having to handle non-slice inputs. 
+ """ + return self._from_device_memory( + cuda_array_interface_wrapper( + ptr=self.ptr + offset, size=size, owner=self.owner + ) + ) + + def __getitem__(self, key: slice) -> Buffer: + """Create a new slice of the buffer.""" + if not isinstance(key, slice): + raise TypeError( + "Argument 'key' has incorrect type " + f"(expected slice, got {key.__class__.__name__})" + ) + start, stop, step = key.indices(self.size) + if step != 1: + raise ValueError("slice must be C-contiguous") + return self._getitem(offset=start, size=stop - start) + + @property + def size(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def nbytes(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def ptr(self) -> int: + """Device pointer to the start of the buffer.""" + return self._ptr + + @property + def owner(self) -> Any: + """Object owning the memory of the buffer.""" + return self._owner + + @property + def __cuda_array_interface__(self) -> Mapping: + """Implementation of the CUDA Array Interface.""" + return { + "data": (self.ptr, False), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + + def memoryview(self) -> memoryview: + """Read-only access to the buffer through host memory.""" + host_buf = bytearray(self.size) + rmm._lib.device_buffer.copy_ptr_to_host(self.ptr, host_buf) + return memoryview(host_buf).toreadonly() + + def serialize(self) -> Tuple[dict, list]: + """Serialize the buffer into header and frames. + + The frames can be a mixture of memoryview and Buffer objects. + + Returns + ------- + Tuple[dict, List] + The first element of the returned tuple is a dict containing any + serializable metadata required to reconstruct the object. The + second element is a list containing Buffers and memoryviews. 
+ """ + header: Dict[str, Any] = {} + header["type-serialized"] = pickle.dumps(type(self)) + header["frame_count"] = 1 + frames = [self] + return header, frames + + @classmethod + def deserialize(cls: Type[T], header: dict, frames: list) -> T: + """Create a Buffer from a serialized representation. + + Parameters + ---------- + header : dict + The metadata required to reconstruct the object. + frames : list + The Buffer and memoryview that makes up the Buffer. + + Returns + ------- + Buffer + The deserialized Buffer. + """ + if header["frame_count"] != 1: + raise ValueError("Deserializing a Buffer expect a single frame") + frame = frames[0] + if isinstance(frame, cls): + return frame # The frame is already deserialized + + if hasattr(frame, "__cuda_array_interface__"): + return cls._from_device_memory(frame) + return cls._from_host_memory(frame) + + def __repr__(self) -> str: + klass = self.__class__ + name = f"{klass.__module__}.{klass.__qualname__}" + return ( + f"<{name} size={format_bytes(self._size)} " + f"ptr={hex(self._ptr)} owner={repr(self._owner)}>" + ) + + +def is_c_contiguous( + shape: Sequence[int], strides: Sequence[int], itemsize: int +) -> bool: + """Determine if shape and strides are C-contiguous + + Parameters + ---------- + shape : Sequence[int] + Number of elements in each dimension. + strides : Sequence[int] + The stride of each dimension in bytes. + itemsize : int + Size of an element in bytes. + + Return + ------ + bool + The boolean answer. + """ + + if any(dim == 0 for dim in shape): + return True + cumulative_stride = itemsize + for dim, stride in zip(reversed(shape), reversed(strides)): + if dim > 1 and stride != cumulative_stride: + return False + cumulative_stride *= dim + return True + + +def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: + """Retrieve the pointer and size from an array interface. + + Raises ValueError if array isn't C-contiguous. 
+ + Parameters + ---------- + array_interface : Mapping + The array interface metadata. + + Return + ------ + pointer : int + The pointer to device or host memory + size : int + The size in bytes + """ + + shape = array_interface["shape"] or (1,) + strides = array_interface["strides"] + itemsize = cudf.dtype(array_interface["typestr"]).itemsize + if strides is None or is_c_contiguous(shape, strides, itemsize): + nelem = math.prod(shape) + ptr = array_interface["data"][0] or 0 + return ptr, nelem * itemsize + raise ValueError("Buffer data must be C-contiguous") diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py new file mode 100644 index 00000000000..5e017c4bc92 --- /dev/null +++ b/python/cudf/cudf/core/buffer/utils.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from __future__ import annotations + +from typing import Any, Union + +from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper + + +def as_buffer( + data: Union[int, Any], + *, + size: int = None, + owner: object = None, +) -> Buffer: + """Factory function to wrap `data` in a Buffer object. + + If `data` isn't a buffer already, a new buffer that points to the memory of + `data` is created. If `data` represents host memory, it is copied to a new + `rmm.DeviceBuffer` device allocation. Otherwise, the memory of `data` is + **not** copied, instead the new buffer keeps a reference to `data` in order + to retain its lifetime. + + If `data` is an integer, it is assumed to point to device memory. + + Raises ValueError if data isn't C-contiguous. + + Parameters + ---------- + data : int or buffer-like or array-like + An integer representing a pointer to device memory or a buffer-like + or array-like object. When not an integer, `size` and `owner` must + be None. + size : int, optional + Size of device memory in bytes. Must be specified if `data` is an + integer. 
+ owner : object, optional + Python object to which the lifetime of the memory allocation is tied. + A reference to this object is kept in the returned Buffer. + + Return + ------ + Buffer + A buffer instance that represents the device memory of `data`. + """ + + if isinstance(data, Buffer): + return data + + # We handle the integer argument in the factory function by wrapping + # the pointer in a `__cuda_array_interface__` exposing object so that + # the Buffer (and its sub-classes) do not have to. + if isinstance(data, int): + if size is None: + raise ValueError( + "size must be specified when `data` is an integer" + ) + data = cuda_array_interface_wrapper(ptr=data, size=size, owner=owner) + elif size is not None or owner is not None: + raise ValueError( + "`size` and `owner` must be None when " + "`data` is a buffer-like or array-like object" + ) + + if hasattr(data, "__cuda_array_interface__"): + return Buffer._from_device_memory(data) + return Buffer._from_host_memory(data) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index af5d140a20a..322092a149c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -16,7 +16,7 @@ from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import is_categorical_dtype, is_interval_dtype -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype @@ -595,7 +595,7 @@ class CategoricalColumn(column.ColumnBase): Parameters ---------- dtype : CategoricalDtype - mask : DeviceBufferLike + mask : Buffer The validity mask offset : int Data offset @@ -619,7 +619,7 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: DeviceBufferLike = None, + 
mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -678,7 +678,7 @@ def _process_values_for_isin( rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Optional[DeviceBufferLike]): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7291b695312..22f8d27f9e8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,7 @@ ) from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer from cudf.core.dtypes import ( CategoricalDtype, IntervalDtype, @@ -357,7 +357,7 @@ def valid_count(self) -> int: return len(self) - self.null_count @property - def nullmask(self) -> DeviceBufferLike: + def nullmask(self) -> Buffer: """The gpu buffer for the null-mask""" if not self.nullable: raise ValueError("Column has no null mask") @@ -761,12 +761,12 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: res = res.drop_duplicates(subset="orig_order", ignore_index=True) return res._data["bool"].fillna(False) - def as_mask(self) -> DeviceBufferLike: + def as_mask(self) -> Buffer: """Convert booleans to bitmask Returns ------- - DeviceBufferLike + Buffer """ if self.has_nulls(): @@ -1281,7 +1281,7 @@ def column_empty( data = None children = ( build_column( - data=as_device_buffer_like( + data=as_buffer( rmm.DeviceBuffer( size=row_count * cudf.dtype("int32").itemsize ) @@ -1294,7 +1294,7 @@ def column_empty( children = ( full(row_count + 1, 0, dtype="int32"), build_column( - data=as_device_buffer_like( + data=as_buffer( rmm.DeviceBuffer( size=row_count * cudf.dtype("int8").itemsize ) @@ -1303,9 +1303,7 @@ def column_empty( ), ) else: - data = 
as_device_buffer_like( - rmm.DeviceBuffer(size=row_count * dtype.itemsize) - ) + data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) if masked: mask = create_null_mask(row_count, state=MaskState.ALL_NULL) @@ -1318,11 +1316,11 @@ def column_empty( def build_column( - data: Union[DeviceBufferLike, None], + data: Union[Buffer, None], dtype: Dtype, *, size: int = None, - mask: DeviceBufferLike = None, + mask: Buffer = None, offset: int = 0, null_count: int = None, children: Tuple[ColumnBase, ...] = (), @@ -1332,12 +1330,12 @@ def build_column( Parameters ---------- - data : DeviceBufferLike + data : Buffer The data buffer (can be None if constructing certain Column types like StringColumn, ListColumn, or CategoricalColumn) dtype The dtype associated with the Column to construct - mask : DeviceBufferLike, optional + mask : Buffer, optional The mask buffer size : int, optional offset : int, optional @@ -1482,7 +1480,7 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1498,7 +1496,7 @@ def build_categorical_column( codes : Column Column of codes, the size of the resulting Column will be the size of `codes` - mask : DeviceBufferLike + mask : Buffer Null mask size : int, optional offset : int, optional @@ -1542,7 +1540,7 @@ def build_interval_column( Column of values representing the left of the interval right_col : Column Column of representing the right of the interval - mask : DeviceBufferLike + mask : Buffer Null mask size : int, optional offset : int, optional @@ -1573,7 +1571,7 @@ def build_interval_column( def build_list_column( indices: ColumnBase, elements: ColumnBase, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1587,7 +1585,7 @@ def build_list_column( Column of list indices elements : ColumnBase Column of list 
elements - mask: DeviceBufferLike + mask: Buffer Null mask size: int, optional offset: int, optional @@ -1619,7 +1617,7 @@ def build_struct_column( names: Sequence[str], children: Tuple[ColumnBase, ...], dtype: Optional[Dtype] = None, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, offset: int = 0, null_count: int = None, @@ -1633,7 +1631,7 @@ def build_struct_column( Field names to map to children dtypes, must be strings. children : tuple - mask: DeviceBufferLike + mask: Buffer Null mask size: int, optional offset: int, optional @@ -1669,9 +1667,7 @@ def _make_copy_replacing_NaT_with_null(column): out_col = cudf._lib.replace.replace( column, build_column( - as_device_buffer_like( - np.array([na_value], dtype=column.dtype).view("|u1") - ), + as_buffer(np.array([na_value], dtype=column.dtype).view("|u1")), dtype=column.dtype, ), null, @@ -1766,7 +1762,7 @@ def as_column( ): arbitrary = cupy.ascontiguousarray(arbitrary) - data = as_device_buffer_like(arbitrary) + data = as_buffer(arbitrary) col = build_column(data, dtype=current_dtype, mask=mask) if dtype is not None: @@ -1914,7 +1910,7 @@ def as_column( if cast_dtype: arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) - buffer = as_device_buffer_like(arbitrary.view("|u1")) + buffer = as_buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: data = build_column(buffer, dtype=arbitrary.dtype) @@ -1932,7 +1928,7 @@ def as_column( if cast_dtype: arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) - buffer = as_device_buffer_like(arbitrary.view("|u1")) + buffer = as_buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: data = build_column(buffer, dtype=arbitrary.dtype) @@ -2211,7 +2207,7 @@ def _construct_array( return arbitrary -def _mask_from_cuda_array_interface_desc(obj) -> Union[DeviceBufferLike, None]: +def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: desc = obj.__cuda_array_interface__ 
mask = desc.get("mask", None) @@ -2223,7 +2219,7 @@ def _mask_from_cuda_array_interface_desc(obj) -> Union[DeviceBufferLike, None]: typecode = typestr[1] if typecode == "t": mask_size = bitmask_allocation_size_bytes(nelem) - mask = Buffer(data=ptr, size=mask_size, owner=obj) + mask = as_buffer(data=ptr, size=mask_size, owner=obj) elif typecode == "b": col = as_column(mask) mask = bools_to_mask(col) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1419b14e8c6..56436ac141d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -6,7 +6,6 @@ import locale import re from locale import nl_langinfo -from types import SimpleNamespace from typing import Any, Mapping, Sequence, cast import numpy as np @@ -23,7 +22,7 @@ ) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_120 -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.utils import _fillna_natwise @@ -98,11 +97,11 @@ class DatetimeColumn(column.ColumnBase): Parameters ---------- - data : DeviceBufferLike + data : Buffer The datetime values dtype : np.dtype The data type - mask : DeviceBufferLike; optional + mask : Buffer; optional The validity mask """ @@ -121,9 +120,9 @@ class DatetimeColumn(column.ColumnBase): def __init__( self, - data: DeviceBufferLike, + data: Buffer, dtype: DtypeObj, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, # TODO: make non-optional offset: int = 0, null_count: int = None, @@ -131,9 +130,7 @@ def __init__( dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: - raise ValueError( - "DeviceBufferLike size must be divisible by element size" - ) + raise ValueError("Buffer size must 
be divisible by element size") if size is None: size = data.size // dtype.itemsize size = size - offset @@ -291,20 +288,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: } if self.nullable and self.has_nulls(): - # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " DatetimeColumn: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index e03802e6d8c..0beb07bb591 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -16,7 +16,7 @@ ) from cudf._typing import ColumnBinaryOperand, Dtype from cudf.api.types import is_integer_dtype, is_scalar -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase, as_column from cudf.core.dtypes import ( Decimal32Dtype, @@ -203,7 +203,7 @@ def from_arrow(cls, data: pa.Array): data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) data_32 = data_128[::4].copy() return cls( - data=as_device_buffer_like(data_32.view("uint8")), + data=as_buffer(data_32.view("uint8")), size=len(data), dtype=dtype, offset=data.offset, @@ -290,7 +290,7 @@ def from_arrow(cls, data: pa.Array): data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) data_64 = data_128[::2].copy() return cls( - data=as_device_buffer_like(data_64.view("uint8")), + data=as_buffer(data_64.view("uint8")), size=len(data), dtype=dtype, offset=data.offset, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a66c11c8bdc..f126f47c3c2 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -2,7 +2,6 @@ from __future__ import annotations -from types import SimpleNamespace from 
typing import ( Any, Callable, @@ -36,7 +35,7 @@ is_number, is_scalar, ) -from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer, cuda_array_interface_wrapper from cudf.core.column import ( ColumnBase, as_column, @@ -66,10 +65,10 @@ class NumericalColumn(NumericalBaseColumn): Parameters ---------- - data : DeviceBufferLike + data : Buffer dtype : np.dtype - The dtype associated with the data DeviceBufferLike - mask : DeviceBufferLike, optional + The dtype associated with the data Buffer + mask : Buffer, optional """ _nan_count: Optional[int] @@ -77,9 +76,9 @@ class NumericalColumn(NumericalBaseColumn): def __init__( self, - data: DeviceBufferLike, + data: Buffer, dtype: DtypeObj, - mask: DeviceBufferLike = None, + mask: Buffer = None, size: int = None, # TODO: make this non-optional offset: int = 0, null_count: int = None, @@ -87,9 +86,7 @@ def __init__( dtype = cudf.dtype(dtype) if data.size % dtype.itemsize: - raise ValueError( - "DeviceBufferLike size must be divisible by element size" - ) + raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset self._nan_count = None @@ -177,19 +174,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: } if self.nullable and self.has_nulls(): - # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify # some of the attributes from the numba device array - mask = SimpleNamespace( - __cuda_array_interface__={ - "shape": (len(self),), - "typestr": " None: """ - Use DeviceBufferLike object. + Use Buffer object. """ # Store the cudf buffer where the data resides as a private # attribute, so we can use it to retrieve the public attributes @@ -80,7 +80,7 @@ def __init__( @property def bufsize(self) -> int: """ - The DeviceBufferLike size in bytes. + The Buffer size in bytes. 
""" return self._buf.size @@ -627,7 +627,7 @@ def __dataframe__( Notes ----- -- Interpreting a raw pointer (as in ``DeviceBufferLike.ptr``) is annoying and +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to do in pure Python. It's more general but definitely less friendly than having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack ``__dlpack__`` (e.g., because the column dtype isn't supported by @@ -721,7 +721,7 @@ def _protocol_to_cudf_column_numeric( _dbuffer, _ddtype = buffers["data"] _check_buffer_is_on_gpu(_dbuffer) cudfcol_num = build_column( - Buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None), + as_buffer(data=_dbuffer.ptr, size=_dbuffer.bufsize, owner=None), protocol_dtype_to_cupy_dtype(_ddtype), ) return _set_missing_values(col, cudfcol_num), buffers @@ -751,9 +751,7 @@ def _set_missing_values( valid_mask = protocol_col.get_buffers()["validity"] if valid_mask is not None: bitmask = cp.asarray( - Buffer( - data=valid_mask[0].ptr, size=valid_mask[0].bufsize, owner=None - ), + as_buffer(data=valid_mask[0].ptr, size=valid_mask[0].bufsize), cp.bool8, ) cudf_col[~bitmask] = None @@ -792,7 +790,7 @@ def _protocol_to_cudf_column_categorical( _check_buffer_is_on_gpu(codes_buffer) cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) codes = build_column( - Buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize, owner=None), + as_buffer(data=codes_buffer.ptr, size=codes_buffer.bufsize), cdtype, ) @@ -824,7 +822,7 @@ def _protocol_to_cudf_column_string( data_buffer, data_dtype = buffers["data"] _check_buffer_is_on_gpu(data_buffer) encoded_string = build_column( - Buffer(data=data_buffer.ptr, size=data_buffer.bufsize, owner=None), + as_buffer(data=data_buffer.ptr, size=data_buffer.bufsize), protocol_dtype_to_cupy_dtype(data_dtype), ) @@ -834,7 +832,7 @@ def _protocol_to_cudf_column_string( offset_buffer, offset_dtype = buffers["offsets"] _check_buffer_is_on_gpu(offset_buffer) offsets = build_column( - 
Buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize, owner=None), + as_buffer(data=offset_buffer.ptr, size=offset_buffer.bufsize), protocol_dtype_to_cupy_dtype(offset_dtype), ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 84f528549e9..25b1b3895de 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -20,7 +20,7 @@ from cudf._typing import Dtype from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.abc import Serializable -from cudf.core.buffer import DeviceBufferLike +from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply if PANDAS_GE_150: @@ -592,7 +592,7 @@ def serialize(self) -> Tuple[dict, list]: header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - frames: List[DeviceBufferLike] = [] + frames: List[Buffer] = [] fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0628497fc29..f4f960f3274 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2974,7 +2974,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: Currently supported inputs are: * ``Column`` - * ``DeviceBufferLike`` + * ``Buffer`` * ``Series`` * ``Index`` * numba device array diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 07e1782d788..b632ddd714b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1831,8 +1831,6 @@ def data(self): 2 3 3 4 dtype: int64 - >>> series.data - >>> np.array(series.data.memoryview()) array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index eaa615a2839..5ed5750f29b 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ 
-1,10 +1,9 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from typing import Callable import cupy as cp import pytest -from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like +from cudf.core.buffer import Buffer, as_buffer arr_len = 10 @@ -23,10 +22,10 @@ def test_buffer_from_cuda_iface_contiguous(data): data, expect_success = data if expect_success: - as_device_buffer_like(data.view("|u1")) + as_buffer(data.view("|u1")) else: with pytest.raises(ValueError): - as_device_buffer_like(data.view("|u1")) + as_buffer(data.view("|u1")) @pytest.mark.parametrize( @@ -41,24 +40,23 @@ def test_buffer_from_cuda_iface_contiguous(data): @pytest.mark.parametrize("dtype", ["uint8", "int8", "float32", "int32"]) def test_buffer_from_cuda_iface_dtype(data, dtype): data = data.astype(dtype) - buf = as_device_buffer_like(data) + buf = as_buffer(data) got = cp.array(buf).reshape(-1).view("uint8") expect = data.reshape(-1).view("uint8") assert (expect == got).all() -@pytest.mark.parametrize("creator", [Buffer, as_device_buffer_like]) -def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): +def test_buffer_creation_from_any(): ary = cp.arange(arr_len) - b = creator(ary) - assert isinstance(b, DeviceBufferLike) + b = as_buffer(ary) + assert isinstance(b, Buffer) assert ary.__cuda_array_interface__["data"][0] == b.ptr assert ary.nbytes == b.size with pytest.raises( ValueError, match="size must be specified when `data` is an integer" ): - Buffer(42) + as_buffer(42) @pytest.mark.parametrize( @@ -66,7 +64,7 @@ def test_buffer_creation_from_any(creator: Callable[[object], Buffer]): ) def test_buffer_repr(size, expect): ary = cp.arange(size, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) assert f"size={expect}" in repr(buf) @@ -83,25 +81,25 @@ def test_buffer_repr(size, expect): ) def test_buffer_slice(idx): ary = cp.arange(arr_len, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) expect = ary[idx] got = 
cp.array(buf[idx]) assert (expect == got).all() @pytest.mark.parametrize( - "idx, err_msg", + "idx, err_type, err_msg", [ - (1, "index must be an slice"), - (slice(3, 2), "size cannot be negative"), - (slice(1, 2, 2), "slice must be contiguous"), - (slice(1, 2, -1), "slice must be contiguous"), - (slice(3, 2, -1), "slice must be contiguous"), + (1, TypeError, "Argument 'key' has incorrect type"), + (slice(3, 2), ValueError, "size cannot be negative"), + (slice(1, 2, 2), ValueError, "slice must be C-contiguous"), + (slice(1, 2, -1), ValueError, "slice must be C-contiguous"), + (slice(3, 2, -1), ValueError, "slice must be C-contiguous"), ], ) -def test_buffer_slice_fail(idx, err_msg): +def test_buffer_slice_fail(idx, err_type, err_msg): ary = cp.arange(arr_len, dtype="uint8") - buf = as_device_buffer_like(ary) + buf = as_buffer(ary) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(err_type, match=err_msg): buf[idx] diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 4e2a26d31bd..467c88b200f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -406,7 +406,7 @@ def test_column_view_string_slice(slc): ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( - cudf.core.buffer.as_device_buffer_like(data), dtype=data.dtype + cudf.core.buffer.as_buffer(data), dtype=data.dtype ) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 2a62a289747..9b9709b52c3 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -179,9 +179,7 @@ def test_cuda_array_interface_pytorch(): got = cudf.Series(tensor) assert_eq(got, series) - buffer = cudf.core.buffer.as_device_buffer_like( - cupy.ones(10, dtype=np.bool_) - ) + buffer = 
cudf.core.buffer.as_buffer(cupy.ones(10, dtype=np.bool_)) tensor = torch.tensor(buffer) got = cudf.Series(tensor, dtype=np.bool_) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 7b83eec9b63..6f8305e6751 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -7,7 +7,7 @@ import pytest import cudf -from cudf.core.buffer import Buffer +from cudf.core.buffer import as_buffer from cudf.core.column import build_column from cudf.core.df_protocol import ( DataFrameObject, @@ -25,7 +25,7 @@ def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) col_from_buf = build_column( - Buffer(data=buf.ptr, size=buf.bufsize, owner=None), + as_buffer(data=buf.ptr, size=buf.bufsize), protocol_dtype_to_cupy_dtype(dtype), ) # check that non null values are the equals as nulls are represented diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 1427a214a72..21343f19d79 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -7,7 +7,7 @@ import pytest from cudf import DataFrame, GenericIndex, RangeIndex, Series -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer from cudf.testing._utils import assert_eq if sys.version_info < (3, 8): @@ -97,7 +97,7 @@ def test_pickle_index(): def test_pickle_buffer(): arr = np.arange(10).view("|u1") - buf = as_device_buffer_like(arr) + buf = as_buffer(arr) assert buf.size == arr.nbytes pickled = pickle.dumps(buf) unpacked = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 60f01d567ef..c3dfeac9a3f 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -429,7 +429,7 @@ def 
test_assert_column_memory_slice(arrow_arrays): def test_assert_column_memory_basic_same(arrow_arrays): data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - buf = cudf.core.buffer.as_device_buffer_like(data.base_data) + buf = cudf.core.buffer.as_buffer(data.base_data) left = cudf.core.column.build_column(buf, dtype=np.int32) right = cudf.core.column.build_column(buf, dtype=np.int32) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 87596482d79..c5f4629483a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -16,7 +16,7 @@ import cudf import cudf.api.types from cudf.core import column -from cudf.core.buffer import as_device_buffer_like +from cudf.core.buffer import as_buffer # The size of the mask in bytes mask_dtype = cudf.api.types.dtype(np.int32) @@ -293,8 +293,8 @@ def pa_mask_buffer_to_mask(mask_buf, size): if mask_buf.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) - return as_device_buffer_like(dbuf) - return as_device_buffer_like(mask_buf) + return as_buffer(dbuf) + return as_buffer(mask_buf) def _isnat(val): diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index 7a0cdeb10b9..db6e206843c 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -3,7 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from cudf.core.buffer import Buffer +from cudf.core.buffer import as_buffer from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column_view @@ -21,4 +21,4 @@ def to_string_view_array(Column strings_col): c_buffer = move(cpp_to_string_view_array(input_view)) device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) - return Buffer(device_buffer) + return as_buffer(device_buffer)