Skip to content

Commit

Permalink
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
Browse files Browse the repository at this point in the history
…f-lists-contains
  • Loading branch information
Matt711 committed Jun 26, 2024
2 parents cd03063 + cdfb550 commit a754eca
Show file tree
Hide file tree
Showing 133 changed files with 560 additions and 625 deletions.
5 changes: 2 additions & 3 deletions cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,7 @@ static cudf::string_scalar create_random_input(int32_t num_chars,

// extract the chars from the returned strings column.
auto input_column_contents = input_column->release();
auto chars_column_contents = input_column_contents.children[1]->release();
auto chars_buffer = chars_column_contents.data.release();
auto chars_buffer = input_column_contents.data.release();

// turn the chars in to a string scalar.
return cudf::string_scalar(std::move(*chars_buffer));
Expand Down Expand Up @@ -218,7 +217,7 @@ NVBENCH_BENCH_TYPES(bench_multibyte_split,
NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split_source")
.set_min_samples(4)
.add_int64_axis("strip_delimiters", {1})
.add_int64_axis("strip_delimiters", {0, 1})
.add_int64_axis("delim_size", {1})
.add_int64_axis("delim_percent", {1})
.add_int64_power_of_two_axis("size_approx", {15, 30})
Expand Down
8 changes: 6 additions & 2 deletions cpp/src/io/text/data_chunk_source_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,11 @@ class istream_data_chunk_reader : public data_chunk_reader {
{
}

void skip_bytes(std::size_t size) override { _datastream->ignore(size); };
// Advances the stream's read position by `size` bytes without reading them.
void skip_bytes(std::size_t size) override
{
  // Seek instead of calling _datastream->ignore(size): seeking was measured ~20% faster
  // for large files. A relative seek from the current position avoids the extra tellg()
  // call and the arithmetic on pos_type that an absolute seek would require.
  _datastream->seekg(static_cast<std::streamoff>(size), std::ios_base::cur);
}

std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
Expand Down Expand Up @@ -265,7 +269,7 @@ class file_data_chunk_source : public data_chunk_source {
[[nodiscard]] std::unique_ptr<data_chunk_reader> create_reader() const override
{
return std::make_unique<istream_data_chunk_reader>(
std::make_unique<std::ifstream>(_filename, std::ifstream::in));
std::make_unique<std::ifstream>(_filename, std::ifstream::in | std::ifstream::binary));
}

private:
Expand Down
16 changes: 16 additions & 0 deletions cpp/tests/ast/transform_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
}

// Adds a double column to a 128-bit fixed-point column (scale 0) that is first cast to
// FLOAT64 inside the AST expression, and checks the element-wise sums.
TEST_F(TransformTest, BasicAdditionDoubleCast)
{
  // Left operand: plain double column.
  auto doubles = column_wrapper<double>{3, 20, 1, 50};

  // Right operand: __int128_t-backed fixed-point values with scale 0.
  std::vector<__int128_t> fixed_point_values{10, 7, 20, 0};
  auto fixed_points = cudf::test::fixed_point_column_wrapper<__int128_t>(
    fixed_point_values.begin(), fixed_point_values.end(), numeric::scale_type{0});

  auto input = cudf::table_view{{doubles, fixed_points}};

  // Expression tree: column 0 + CAST_TO_FLOAT64(column 1).
  auto lhs_ref    = cudf::ast::column_reference(0);
  auto rhs_ref    = cudf::ast::column_reference(1);
  auto rhs_as_f64 = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, rhs_ref);
  auto sum_expr   = cudf::ast::operation(cudf::ast::ast_operator::ADD, lhs_ref, rhs_as_f64);

  auto expected = column_wrapper<double>{13, 27, 21, 50};
  auto result   = cudf::compute_column(input, sum_expr);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
}

TEST_F(TransformTest, Literal)
{
auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
Expand Down
2 changes: 2 additions & 0 deletions docs/cudf/source/user_guide/api_docs/general_utilities.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Testing functions
:toctree: api/

cudf.testing.testing.assert_column_equal
cudf.testing.testing.assert_eq
cudf.testing.testing.assert_frame_equal
cudf.testing.testing.assert_index_equal
cudf.testing.testing.assert_neq
cudf.testing.testing.assert_series_equal
4 changes: 2 additions & 2 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import sys
from io import StringIO
Expand All @@ -13,7 +13,7 @@
compare_content,
run_test,
)
from cudf.testing._utils import assert_eq
from cudf.testing import assert_eq


@pythonfuzz(data_handle=CSVReader)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import io
import sys
Expand All @@ -9,7 +9,7 @@
from cudf._fuzz_testing.json import JSONReader, JSONWriter
from cudf._fuzz_testing.main import pythonfuzz
from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
from cudf.testing._utils import assert_eq
from cudf.testing import assert_eq


@pythonfuzz(data_handle=JSONReader)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pyarrow as pa

import cudf
from cudf.testing._utils import assert_eq
from cudf.testing import assert_eq
from cudf.utils.dtypes import (
pandas_dtypes_to_np_dtypes,
pyarrow_dtypes_to_pandas_dtypes,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.types cimport data_type
from cudf._lib.types cimport dtype_to_data_type

Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources datasource.pyx utils.pyx)
set(cython_sources utils.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/io/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport Datasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ except ImportError:

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_column_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.orc cimport (
chunked_orc_writer_options,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
from cudf._lib.column cimport Column
from cudf._lib.expressions cimport Expression
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sinks_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.expressions cimport expression
from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
chunked_parquet_reader as cpp_chunked_parquet_reader,
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx types.pyx)
set(cython_sources avro.pyx datasource.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,5 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, types
from . cimport avro, datasource, types
from .types cimport SourceInfo, TableWithMetadata
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, types
from . import avro, datasource, types
from .types import SourceInfo, TableWithMetadata
File renamed without changes.
File renamed without changes.
23 changes: 19 additions & 4 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
host_buffer,
source_info,
Expand Down Expand Up @@ -56,9 +58,8 @@ cdef class SourceInfo:
Parameters
----------
sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
A homogeneous list of sources (this can be a string filename,
an os.PathLike, bytes, or an io.BytesIO) to read from.
sources : List[Union[str, os.PathLike, bytes, io.BytesIO, Datasource]]
A homogeneous list of sources to read from.
Mixing different types of sources will raise a `ValueError`.
"""
Expand All @@ -68,6 +69,7 @@ cdef class SourceInfo:
raise ValueError("Need to pass at least one source")

cdef vector[string] c_files
cdef vector[datasource*] c_datasources

if isinstance(sources[0], (os.PathLike, str)):
c_files.reserve(len(sources))
Expand All @@ -84,6 +86,13 @@ cdef class SourceInfo:

self.c_obj = move(source_info(c_files))
return
elif isinstance(sources[0], Datasource):
for csrc in sources:
if not isinstance(csrc, Datasource):
raise ValueError("All sources must be of the same type!")
c_datasources.push_back((<Datasource>csrc).get_datasource())
self.c_obj = move(source_info(c_datasources))
return

# TODO: host_buffer is deprecated API, use host_span instead
cdef vector[host_buffer] c_host_buffers
Expand All @@ -106,5 +115,11 @@ cdef class SourceInfo:
c_buffer = bio.getbuffer() # check if empty?
c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
c_buffer.shape[0]))
else:
raise ValueError("Sources must be a list of str/paths, "
"bytes, io.BytesIO, or a Datasource")

if empty_buffer is True:
c_host_buffers.push_back(host_buffer(<char*>NULL, 0))

self.c_obj = source_info(c_host_buffers)
self.c_obj = move(source_info(c_host_buffers))
79 changes: 32 additions & 47 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase:

return result

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
) -> Self:
"""
Fill null values with *fill_value*
"""
if fill_value is not None:
fill_is_scalar = np.isscalar(fill_value)

if fill_is_scalar:
if fill_value == _DEFAULT_CATEGORICAL_VALUE:
fill_value = self.codes.dtype.type(fill_value)
else:
try:
fill_value = self._encode(fill_value)
fill_value = self.codes.dtype.type(fill_value)
except ValueError as err:
err_msg = "fill value must be in categories"
raise ValueError(err_msg) from err
def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if cudf.api.types.is_scalar(fill_value):
if fill_value != _DEFAULT_CATEGORICAL_VALUE:
try:
fill_value = self._encode(fill_value)
except ValueError as err:
raise ValueError(
f"{fill_value=} must be in categories"
) from err
return cudf.Scalar(fill_value, dtype=self.codes.dtype)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value.dtype, CategoricalDtype):
if self.dtype != fill_value.dtype:
raise TypeError(
"Cannot set a categorical with another without identical categories"
)
else:
fill_value = column.as_column(fill_value, nan_as_null=False)
if isinstance(fill_value, CategoricalColumn):
if self.dtype != fill_value.dtype:
raise TypeError(
"Cannot set a Categorical with another, "
"without identical categories"
)
# TODO: only required if fill_value has a subset of the
# categories:
fill_value = fill_value._set_categories(
self.categories,
is_unique=True,
)
fill_value = column.as_column(fill_value.codes).astype(
self.codes.dtype
raise TypeError(
"Cannot set a categorical with non-categorical data"
)

# Validation of `fill_value` will have to be performed
# before returning self.
if not self.nullable:
return self

return super().fillna(fill_value, method=method)
fill_value = fill_value._set_categories(
self.categories,
)
return fill_value.codes.astype(self.codes.dtype)

def indices_of(
self, value: ScalarLike
Expand Down Expand Up @@ -1372,11 +1355,13 @@ def _set_categories(
if not (is_unique or new_cats.is_unique):
new_cats = cudf.Series(new_cats)._column.unique()

if cur_cats.equals(new_cats, check_dtypes=True):
# TODO: Internal usages don't always need a copy; add a copy keyword
# as_ordered shallow copies
return self.copy().as_ordered(ordered=ordered)

cur_codes = self.codes
max_cat_size = (
len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
)
out_code_dtype = min_unsigned_type(max_cat_size)
out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats)))

cur_order = column.as_column(range(len(cur_codes)))
old_codes = column.as_column(
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,15 +666,32 @@ def _check_scatter_key_length(
f"{num_keys}"
)

def _validate_fillna_value(
    self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
    """Coerce ``fill_value`` into the replacement object used by ``fillna``.

    A scalar is wrapped in a ``cudf.Scalar`` carrying this column's dtype;
    any other input is converted to a column with ``as_column``.
    """
    if not is_scalar(fill_value):
        return as_column(fill_value)
    return cudf.Scalar(fill_value, dtype=self.dtype)

def fillna(
self,
fill_value: Any = None,
method: str | None = None,
fill_value: ScalarLike | ColumnLike,
method: Literal["ffill", "bfill", None] = None,
) -> Self:
"""Fill null values with ``fill_value``.
Returns a copy with nulls filled.
"""
if not self.has_nulls(include_nan=True):
return self.copy()
elif method is None:
if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
fill_value
):
return self.copy()
else:
fill_value = self._validate_fillna_value(fill_value)
return libcudf.replace.replace_nulls(
input_col=self.nans_to_nulls(),
replacement=fill_value,
Expand Down
Loading

0 comments on commit a754eca

Please sign in to comment.