Skip to content

Commit

Permalink
Merge branch 'branch-24.08' into pylibcudf-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 authored Jun 4, 2024
2 parents f2d7ead + 54d49fc commit 6dfb024
Show file tree
Hide file tree
Showing 282 changed files with 10,674 additions and 3,978 deletions.
20 changes: 12 additions & 8 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
# Documentation for config - https://github.com/actions/labeler#common-examples

cuDF (Python):
Python:
- 'python/**'
- 'notebooks/**'

cudf.pandas:
- 'python/cudf/cudf/pandas/**'
- 'python/cudf/cudf_pandas_tests/**'

cudf.polars:
- 'python/cudf_polars/**'

pylibcudf:
- 'python/cudf/cudf/_lib/pylibcudf/**'

libcudf:
- 'cpp/**'

Expand All @@ -12,11 +22,5 @@ CMake:
- '**/cmake/**'
- '**/*.cmake'

cuDF (Java):
Java:
- 'java/**'

ci:
- 'ci/**'

conda:
- 'conda/**'
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ repos:
- id: trailing-whitespace
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- id: end-of-file-fixer
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- repo: https://github.com/PyCQA/isort
Expand Down Expand Up @@ -127,7 +129,7 @@ repos:
^CHANGELOG.md$
)
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.13.4
rev: v1.13.11
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
Expand Down
2 changes: 2 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ To build all libraries and tests, with Python packages in development mode, simp
./build.sh --pydevelop libcudf libcudf_kafka cudf dask_cudf cudf_kafka custreamz
```

- **Note**: if Cython files (`*.pyx` or `*.pxd`) have changed, the Python build must be rerun.

To run the C++ tests, run

```bash
Expand Down
14 changes: 2 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,8 @@ You can import `cudf` directly and use it like `pandas`:

```python
import cudf
import requests
from io import StringIO

url = "https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode("utf-8")

tips_df = cudf.read_csv(StringIO(content))
tips_df = cudf.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100

# display average tip by dining party size
Expand All @@ -36,13 +31,8 @@ supported operations and falling back to pandas when needed:
%load_ext cudf.pandas # pandas operations now use the GPU!

import pandas as pd
import requests
from io import StringIO

url = "https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode("utf-8")

tips_df = pd.read_csv(StringIO(content))
tips_df = pd.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100

# display average tip by dining party size
Expand Down
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ BUILD_PER_THREAD_DEFAULT_STREAM=OFF
BUILD_REPORT_METRICS=OFF
BUILD_REPORT_INCL_CACHE_STATS=OFF
USE_PROPRIETARY_NVCOMP=ON
PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps"
PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true"

# Set defaults for vars that may not have been defined externally
# FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
Expand Down
16 changes: 6 additions & 10 deletions ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,23 +46,19 @@ pushd docs/cudf
make dirhtml
mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html"
mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
make text
mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
fi
make text
mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
popd

rapids-logger "Build dask-cuDF Sphinx docs"
pushd docs/dask_cudf
make dirhtml
mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
make text
mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
fi
make text
mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
popd

rapids-upload-docs
17 changes: 5 additions & 12 deletions ci/build_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,7 @@ export CMAKE_GENERATOR=Ninja

rapids-print-env

package_dir="python"
version=$(rapids-generate-version)
commit=$(git rev-parse HEAD)

echo "${version}" > VERSION
for package_name in cudf dask_cudf cudf_kafka custreamz; do
sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py
done
rapids-generate-version > ./VERSION

rapids-logger "Begin py build"

Expand All @@ -29,24 +22,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
# TODO: Remove `--no-test` flag once importing on a CPU
# node works correctly
# With boa installed conda build forwards to the boa builder
RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
--no-test \
--channel "${CPP_CHANNEL}" \
conda/recipes/cudf

RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
--no-test \
--channel "${CPP_CHANNEL}" \
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
conda/recipes/dask-cudf

RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
--no-test \
--channel "${CPP_CHANNEL}" \
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
conda/recipes/cudf_kafka

RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
--no-test \
--channel "${CPP_CHANNEL}" \
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
Expand Down
46 changes: 2 additions & 44 deletions ci/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,54 +3,12 @@

set -euo pipefail

package_name=$1
package_dir=$2
package_dir=$1

source rapids-configure-sccache
source rapids-date-string

version=$(rapids-generate-version)
commit=$(git rev-parse HEAD)

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

# This is the version of the suffix with a preceding hyphen. It's used
# everywhere except in the final wheel name.
PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"

# Patch project metadata files to include the CUDA version suffix and version override.
pyproject_file="${package_dir}/pyproject.toml"

sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
echo "${version}" > VERSION
sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py"

# For nightlies we want to ensure that we're pulling in alphas as well. The
# easiest way to do so is to augment the spec with a constraint containing a
# min alpha version that doesn't affect the version bounds but does allow usage
# of alpha versions for that dependency without --pre
alpha_spec=''
if ! rapids-is-release-build; then
alpha_spec=',>=0.0.0a0'
fi

if [[ ${package_name} == "dask-cudf" ]]; then
sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
else
sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
# ptxcompiler and cubinlinker aren't version constrained
sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
fi

if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file}
sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file}
sed -i "/cubinlinker/d" ${pyproject_file}
fi
rapids-generate-version > ./VERSION

cd "${package_dir}"

Expand Down
2 changes: 1 addition & 1 deletion ci/build_wheel_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ package_dir="python/cudf"

export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"

./ci/build_wheel.sh cudf ${package_dir}
./ci/build_wheel.sh ${package_dir}

python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*

Expand Down
2 changes: 1 addition & 1 deletion ci/build_wheel_dask_cudf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set -euo pipefail

package_dir="python/dask_cudf"

./ci/build_wheel.sh dask-cudf ${package_dir}
./ci/build_wheel.sh ${package_dir}

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
4 changes: 2 additions & 2 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ DEPENDENCIES=(
)
for DEP in "${DEPENDENCIES[@]}"; do
for FILE in dependencies.yaml conda/environments/*.yaml; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/*/pyproject.toml; do
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
done
done

Expand Down
9 changes: 9 additions & 0 deletions ci/run_cudf_kafka_pytests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

# Support invoking run_cudf_kafka_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_kafka/cudf_kafka

pytest --cache-clear "$@" tests
2 changes: 1 addition & 1 deletion ci/run_custreamz_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

set -euo pipefail

# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
# It is essential to cd into python/custreamz/custreamz/ as `pytest-xdist` + `coverage` seem to work only at this directory level.

# Support invoking run_custreamz_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/custreamz/custreamz/
Expand Down
2 changes: 1 addition & 1 deletion ci/run_dask_cudf_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

set -euo pipefail

# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
# It is essential to cd into python/dask_cudf/dask_cudf/ as `pytest-xdist` + `coverage` seem to work only at this directory level.

# Support invoking run_dask_cudf_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/dask_cudf/dask_cudf/
Expand Down
4 changes: 4 additions & 0 deletions ci/test_python_other.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
--dist=loadscope \
.

rapids-logger "pytest cudf_kafka"
./ci/run_cudf_kafka_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml"

rapids-logger "pytest custreamz"
./ci/run_custreamz_pytests.sh \
--junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
Expand Down
16 changes: 9 additions & 7 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies:
- cxx-compiler
- cython>=3.0.3
- dask-cuda==24.8.*
- dask-cuda==24.8.*,>=0.0.0a0
- dlpack>=0.8,<1.0
- doxygen=1.9.1
- fastavro>=0.22.9
Expand All @@ -36,15 +37,15 @@ dependencies:
- hypothesis
- identify>=2.5.20
- ipython
- libarrow-acero==16.0.0.*
- libarrow-dataset==16.0.0.*
- libarrow==16.0.0.*
- libarrow-acero==16.1.0.*
- libarrow-dataset==16.1.0.*
- libarrow==16.1.0.*
- libcufile-dev=1.4.0.31
- libcufile=1.4.0.31
- libcurand-dev=10.3.0.86
- libcurand=10.3.0.86
- libkvikio==24.8.*
- libparquet==16.0.0.*
- libparquet==16.1.0.*
- librdkafka>=1.9.0,<1.10.0a0
- librmm==24.8.*
- make
Expand All @@ -66,7 +67,7 @@ dependencies:
- pip
- pre-commit
- ptxcompiler
- pyarrow==16.0.0.*
- pyarrow==16.1.0.*
- pydata-sphinx-theme!=0.14.2
- pytest-benchmark
- pytest-cases>=3.8.2
Expand All @@ -76,9 +77,10 @@ dependencies:
- python-confluent-kafka>=1.9.0,<1.10.0a0
- python>=3.9,<3.12
- pytorch>=2.1.0
- rapids-dask-dependency==24.8.*
- rapids-build-backend>=0.3.0,<0.4.0.dev0
- rapids-dask-dependency==24.8.*,>=0.0.0a0
- rich
- rmm==24.8.*
- rmm==24.8.*,>=0.0.0a0
- s3fs>=2022.3.0
- scikit-build-core>=0.7.0
- scipy
Expand Down
Loading

0 comments on commit 6dfb024

Please sign in to comment.