diff --git a/.github/ci-scripts/format_env_vars.py b/.github/ci-scripts/format_env_vars.py index 7eae7e4101..870c007dc8 100644 --- a/.github/ci-scripts/format_env_vars.py +++ b/.github/ci-scripts/format_env_vars.py @@ -1,5 +1,4 @@ -""" -Given a comma-separated string of environment variables, parse them into a dictionary. +"""Given a comma-separated string of environment variables, parse them into a dictionary. Example: env_str = "a=1,b=2" diff --git a/.github/ci-scripts/get_wheel_name_from_s3.py b/.github/ci-scripts/get_wheel_name_from_s3.py index c50ce509be..b033fdedf0 100644 --- a/.github/ci-scripts/get_wheel_name_from_s3.py +++ b/.github/ci-scripts/get_wheel_name_from_s3.py @@ -1,5 +1,4 @@ -""" -Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout. +"""Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout. # Example diff --git a/.github/ci-scripts/read_inline_metadata.py b/.github/ci-scripts/read_inline_metadata.py index 0f601a2276..11189986a8 100644 --- a/.github/ci-scripts/read_inline_metadata.py +++ b/.github/ci-scripts/read_inline_metadata.py @@ -3,10 +3,7 @@ # dependencies = [] # /// -""" -The `read` function below is sourced from: -https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata -""" +"""The `read` function below is sourced from: https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata.""" import re diff --git a/.github/ci-scripts/wheellib.py b/.github/ci-scripts/wheellib.py index 33c1d1ab8c..177747d0f8 100644 --- a/.github/ci-scripts/wheellib.py +++ b/.github/ci-scripts/wheellib.py @@ -2,6 +2,6 @@ def get_platform_tag(wheelname: str) -> str: - distribution, version, build_tag, tags = parse_wheel_filename(wheelname) + _, _, _, tags = parse_wheel_filename(wheelname) assert len(tags) == 1, "Multiple tags found" return next(iter(tags)).platform diff --git a/.github/working-dir/shuffle_testing.py b/.github/working-dir/shuffle_testing.py index 18b7c76307..fdf45cbba4 100644 --- a/.github/working-dir/shuffle_testing.py +++ b/.github/working-dir/shuffle_testing.py @@ -6,7 +6,7 @@ import random import time from functools import partial -from typing import Any, Dict +from typing import Any, Dict, Optional import numpy as np import pyarrow as pa @@ -32,8 +32,8 @@ def parse_size(size_str: str) -> int: def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarray: - """ - Generate a skewed distribution using a power law. + """Generate a skewed distribution using a power law. + Higher skew_factor means more skewed distribution. """ if skew_factor <= 0: @@ -46,8 +46,7 @@ def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarr def get_partition_size(base_size: int, size_variation: float, partition_idx: int) -> int: - """ - Calculate size for a specific partition with variation. + """Calculate size for a specific partition with variation. 
Args: base_size: The base partition size in bytes @@ -80,7 +79,6 @@ def generate( partition_idx: int, ): """Generate data for a single partition with optional skew, timing and size variations.""" - # Calculate actual partition size with variation actual_partition_size = get_partition_size(base_partition_size, size_variation, partition_idx) num_rows = actual_partition_size // ROW_SIZE @@ -135,7 +133,7 @@ def generator( ) -def setup_daft(shuffle_algorithm: str = None): +def setup_daft(shuffle_algorithm: Optional[str] = None): """Configure Daft execution settings.""" daft.context.set_runner_ray() daft.context.set_execution_config(shuffle_algorithm=shuffle_algorithm, pre_shuffle_merge_threshold=8 * GB) @@ -152,7 +150,7 @@ def run_benchmark( skew_factor: float, timing_variation: float, size_variation: float, - shuffle_algorithm: str = None, + shuffle_algorithm: Optional[str] = None, ) -> Dict[str, Any]: """Run the memory benchmark and return statistics.""" setup_daft(shuffle_algorithm) @@ -249,7 +247,7 @@ def main(): print(f"Total time: {timing:.2f}s") except Exception as e: - print(f"Error running benchmark: {str(e)}") + print(f"Error running benchmark: {e!s}") raise diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b5c2ddcc45..4362e615db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,7 +57,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.6.2 + rev: v0.8.2 hooks: # Run the linter. - id: ruff diff --git a/.ruff.toml b/.ruff.toml index d0ea1cb625..87bcabe0cf 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -4,6 +4,7 @@ line-length = 120 target-version = "py38" [format] +docstring-code-format = true # Like Black, indent with spaces, rather than tabs. indent-style = "space" # Like Black, automatically detect the appropriate line ending. @@ -20,15 +21,37 @@ extend-select = [ "LOG", # flake8-logging "G", # flake8-logging-format "I", # isort + "RUF010", # Use explicit conversion flag + "RUF013", # PEP 484 prohibits implicit Optional + "RUF015", # Prefer next({iterable}) over single element slice + "RUF017", # Avoid quadratic list summation + "RUF022", # __all__ is not sorted + "RUF032", # Decimal() called with float literal argument + "RUF034", # Useless if-else condition + "RUF041", # Unnecessary nested Literal "RUF100", # unused-noqa" - "T10" # flake8-debugger + "T10", # flake8-debugger + "D" # pydocstyle rules ] ignore = [ - "E402" # Module level import not at top of file [TODO(sammy): We want to fix this] + "E402", # Module level import not at top of file [TODO(sammy): We want to fix this] + "D417", # requires documentation for every function parameter. 
+ "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D104", # Missing docstring in public package + "D105", # Missing docstring in magic method + "D106", # Missing docstring in public nested class + "D107" # Missing docstring in __init__ ] +preview = true [lint.per-file-ignores] # Do not enforce usage and import order rules in init files "__init__.py" = ["E402", "F401", "I"] # Allow wild imports in conftest "tests/conftest.py" = ["F405", "E402", "F403"] + +[lint.pydocstyle] +convention = "google" diff --git a/benchmarking/parquet/conftest.py b/benchmarking/parquet/conftest.py index 6d5dd88dcf..19774f359b 100644 --- a/benchmarking/parquet/conftest.py +++ b/benchmarking/parquet/conftest.py @@ -73,7 +73,7 @@ def daft_dataframe_read(path: str, columns: list[str] | None = None) -> pa.Table ], ) def read_fn(request): - """Fixture which returns the function to read a PyArrow table from a path""" + """Fixture which returns the function to read a PyArrow table from a path.""" return request.param @@ -116,5 +116,5 @@ def boto_bulk_read(paths: list[str], columns: list[str] | None = None) -> list[p ], ) def bulk_read_fn(request): - """Fixture which returns the function to read a PyArrow table from a path""" + """Fixture which returns the function to read a PyArrow table from a path.""" return request.param diff --git a/benchmarking/tpch/__main__.py b/benchmarking/tpch/__main__.py index 8ad131e08f..5c0550bdf1 100644 --- a/benchmarking/tpch/__main__.py +++ b/benchmarking/tpch/__main__.py @@ -175,7 +175,7 @@ def run_all_benchmarks( def generate_parquet_data(tpch_gen_folder: str, scale_factor: float, num_parts: int) -> str: - """Generates Parquet data and returns the path to the folder + """Generates Parquet data and returns the path to the folder. Args: tpch_gen_folder (str): Path to the folder containing the TPCH dbgen tool and generated data @@ -193,7 +193,7 @@ def get_daft_version() -> str: def get_daft_benchmark_runner_name() -> Literal["ray"] | Literal["py"] | Literal["native"]: - """Test utility that checks the environment variable for the runner that is being used for the benchmarking""" + """Test utility that checks the environment variable for the runner that is being used for the benchmarking.""" name = os.getenv("DAFT_RUNNER") assert name is not None, "Tests must be run with $DAFT_RUNNER env var" name = name.lower() @@ -217,7 +217,7 @@ def get_ray_runtime_env(requirements: str | None) -> dict: def warmup_environment(requirements: str | None, parquet_folder: str): - """Performs necessary setup of Daft on the current benchmarking environment""" + """Performs necessary setup of Daft on the current benchmarking environment.""" if get_daft_benchmark_runner_name() == "ray": runtime_env = get_ray_runtime_env(requirements) diff --git a/benchmarking/tpch/data_generation.py b/benchmarking/tpch/data_generation.py index e70170b9d2..47e280bf26 100644 --- a/benchmarking/tpch/data_generation.py +++ b/benchmarking/tpch/data_generation.py @@ -202,7 +202,7 @@ def gen_sqlite_db(csv_filepath: str, num_parts: int) -> str: - """Generates a SQLite DB from a folder filled with generated CSVs + """Generates a SQLite DB from a folder filled with generated CSVs. 
Args: csv_filepath (str): path to folder with generated CSVs @@ -243,7 +243,7 @@ def import_table(table, table_path): def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str: - """Generates CSV files + """Generates CSV files. Args: basedir (str): path to generate files into @@ -302,7 +302,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str: def gen_parquet(csv_files_location: str) -> str: - """Generates Parquet from generated CSV files + """Generates Parquet from generated CSV files. Args: csv_files_location (str): path to folder with generated CSV files diff --git a/benchmarking/tpch/pipelined_data_generation.py b/benchmarking/tpch/pipelined_data_generation.py index f28063a990..58a86b6afc 100644 --- a/benchmarking/tpch/pipelined_data_generation.py +++ b/benchmarking/tpch/pipelined_data_generation.py @@ -1,4 +1,4 @@ -"""This script provides a pipelined data generation implementation of data_generation.py +"""This script provides a pipelined data generation implementation of data_generation.py. Note that after running this script, data will no longer be coherent/exist locally. This is used for generating large amounts of benchmarking data that lands directly in AWS S3, but for local benchmarking/testing use-cases use data_generation.py instead. @@ -52,7 +52,7 @@ def pipelined_data_generation( if not cachedir.exists(): logger.info("Cloning tpch dbgen repo") - subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {str(cachedir)}")) + subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {cachedir!s}")) subprocess.check_output("make", cwd=str(cachedir)) for i, part_indices in enumerate(batch(range(1, num_parts + 1), n=parallelism)): diff --git a/benchmarking/tpch/ray_job_runner.py b/benchmarking/tpch/ray_job_runner.py index 42fcfc96cf..89301cf647 100644 --- a/benchmarking/tpch/ray_job_runner.py +++ b/benchmarking/tpch/ray_job_runner.py @@ -23,8 +23,7 @@ async def wait_on_job(logs, timeout_s): def run_on_ray(ray_address: str, job_params: dict, timeout_s: int = 1500): - """Submits a job to run in the Ray cluster""" - + """Submits a job to run in the Ray cluster.""" print("Submitting benchmarking job to Ray cluster...") print("Parameters:") print(job_params) @@ -57,7 +56,7 @@ def ray_job_params( ) -> dict: return dict( submission_id=f"tpch-q{tpch_qnum}-{str(uuid.uuid4())[:4]}", - entrypoint=f"python3 {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}", + entrypoint=f"python3 {entrypoint.relative_to(working_dir)!s} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}", runtime_env={ "working_dir": str(working_dir), **runtime_env, diff --git a/benchmarking/tpch/subprefix_s3_files.py b/benchmarking/tpch/subprefix_s3_files.py index 1e82a0c579..4b9c4419bc 100644 --- a/benchmarking/tpch/subprefix_s3_files.py +++ b/benchmarking/tpch/subprefix_s3_files.py @@ -1,7 +1,7 @@ -""" -Introduces more prefixes into TPCH data files hosted on S3. +"""Introduces more prefixes into TPCH data files hosted on S3. + This improves S3 read performance. For more details, see: -https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html +https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html. Does this by copying existing files into subfolders, based on file prefix. e.g. 
copies diff --git a/daft/__init__.py b/daft/__init__.py index be04c80f8f..4aeb5144ac 100644 --- a/daft/__init__.py +++ b/daft/__init__.py @@ -37,7 +37,7 @@ def get_build_type() -> str: def refresh_logger() -> None: - """Refreshes Daft's internal rust logging to the current python log level""" + """Refreshes Daft's internal rust logging to the current python log level.""" _refresh_logger() @@ -94,46 +94,46 @@ def refresh_logger() -> None: to_struct = Expression.to_struct __all__ = [ - "from_pylist", - "from_pydict", - "from_arrow", - "from_pandas", - "from_ray_dataset", - "from_dask_dataframe", - "from_glob_path", - "read_csv", - "read_json", - "read_parquet", - "read_hudi", - "read_iceberg", - "read_deltalake", - "read_sql", - "read_lance", - "DataCatalogType", "DataCatalogTable", + "DataCatalogType", "DataFrame", - "Expression", - "col", - "interval", "DataType", - "ImageMode", + "Expression", "ImageFormat", - "lit", - "Series", - "TimeUnit", - "register_viz_hook", - "refresh_logger", - "udf", + "ImageMode", "ResourceRequest", "Schema", - "set_planning_config", - "set_execution_config", - "planning_config_ctx", + "Series", + "TimeUnit", + "coalesce", + "col", "execution_config_ctx", + "from_arrow", + "from_dask_dataframe", + "from_glob_path", + "from_pandas", + "from_pydict", + "from_pylist", + "from_ray_dataset", + "interval", + "lit", + "planning_config_ctx", + "read_csv", + "read_deltalake", + "read_hudi", + "read_iceberg", + "read_json", + "read_lance", + "read_parquet", + "read_sql", "read_table", + "refresh_logger", "register_table", + "register_viz_hook", + "set_execution_config", + "set_planning_config", "sql", "sql_expr", "to_struct", - "coalesce", + "udf", ] diff --git a/daft/analytics.py b/daft/analytics.py index 1c59f2b491..1decc8e11a 100644 --- a/daft/analytics.py +++ b/daft/analytics.py @@ -61,7 +61,7 @@ def _build_segment_batch_payload( def _post_segment_track_endpoint(analytics_client: AnalyticsClient, payload: dict[str, Any]) -> None: - """Posts a batch of JSON data to Segment""" + """Posts a batch of JSON data to Segment.""" req = urllib.request.Request( _SEGMENT_BATCH_ENDPOINT, method="POST", @@ -84,7 +84,7 @@ def _post_segment_track_endpoint(analytics_client: AnalyticsClient, payload: dic class AnalyticsClient: - """Non-threadsafe client for sending analytics events, which is a singleton for each Python process""" + """Non-threadsafe client for sending analytics events, which is a singleton for each Python process.""" def __init__( self, @@ -169,7 +169,7 @@ def track_fn_call(self, fn_name: str, duration_seconds: float, error: str | None def init_analytics(daft_version: str, daft_build_type: str, user_opted_out: bool) -> AnalyticsClient: - """Initialize the analytics module + """Initialize the analytics module. 
Returns: AnalyticsClient: initialized singleton AnalyticsClient @@ -187,7 +187,7 @@ def init_analytics(daft_version: str, daft_build_type: str, user_opted_out: bool def time_df_method(method): - """Decorator to track metrics about Dataframe method calls""" + """Decorator to track metrics about Dataframe method calls.""" @functools.wraps(method) def tracked_method(*args, **kwargs): @@ -213,7 +213,7 @@ def tracked_method(*args, **kwargs): def time_func(fn): - """Decorator to track metrics for daft API calls""" + """Decorator to track metrics for daft API calls.""" @functools.wraps(fn) def tracked_fn(*args, **kwargs): diff --git a/daft/api_annotations.py b/daft/api_annotations.py index 50c8cf2494..bf4256fbd0 100644 --- a/daft/api_annotations.py +++ b/daft/api_annotations.py @@ -52,7 +52,6 @@ def type_check_function(func: Callable[..., Any], *args: Any, **kwargs: Any) -> def isinstance_helper(value: Any, T: Any) -> bool: """Like builtins.isinstance, but also accepts typing.* types.""" - if T is Any: return True diff --git a/daft/arrow_utils.py b/daft/arrow_utils.py index 1f211653f2..b01e7ce7da 100644 --- a/daft/arrow_utils.py +++ b/daft/arrow_utils.py @@ -6,29 +6,28 @@ def ensure_array(arr: pa.Array) -> pa.Array: - """Applies all fixes to an Arrow array""" + """Applies all fixes to an Arrow array.""" arr = _FixEmptyStructArrays.ensure_array(arr) arr = _FixSliceOffsets.ensure_array(arr) return arr def ensure_chunked_array(arr: pa.ChunkedArray) -> pa.ChunkedArray: - """Applies all fixes to an Arrow chunked array""" + """Applies all fixes to an Arrow chunked array.""" arr = _FixEmptyStructArrays.ensure_chunked_array(arr) arr = _FixSliceOffsets.ensure_chunked_array(arr) return arr def ensure_table(tbl: pa.Table) -> pa.Table: - """Applies all fixes to an Arrow table""" + """Applies all fixes to an Arrow table.""" tbl = _FixEmptyStructArrays.ensure_table(tbl) tbl = _FixSliceOffsets.ensure_table(tbl) return tbl class _FixEmptyStructArrays: - """Converts StructArrays that are empty (have no fields) to StructArrays with a single field - named "" and with a NullType + """Converts StructArrays that are empty (have no fields) to StructArrays with a single field named "" and with a NullType. This is done because arrow2::ffi cannot handle empty StructArrays and we need to handle this on the Python layer before going through ffi into Rust. 
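The docstring rewrites in the analytics and arrow_utils hunks above all follow one pattern, which comes from the pydocstyle ("D") rules newly enabled in .ruff.toml together with convention = "google": the summary moves onto the same line as the opening quotes, gains a terminating period, and is separated from any body text by a blank line. A minimal before/after sketch of that shape, using an invented ensure_thing helper rather than code from this patch:

# Before: the summary sits on its own line, has no terminating period,
# and runs straight into the body text.
def ensure_thing(value):
    """
    Applies all fixes to a thing
    This is done on the Python side before the data crosses into Rust.
    """
    return value


# After: one-line summary on the opening quotes, ending with a period and
# separated from the body by a blank line (the shape the new rules enforce).
def ensure_thing(value):
    """Applies all fixes to a thing.

    This is done on the Python side before the data crosses into Rust.
    """
    return value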
@@ -62,7 +61,7 @@ def ensure_chunked_array(arr: pa.ChunkedArray) -> pa.ChunkedArray: return pa.chunked_array([_FixEmptyStructArrays.ensure_array(chunk) for chunk in arr.chunks]) def ensure_array(arr: pa.Array) -> pa.Array: - """Recursively converts empty struct arrays to single-field struct arrays""" + """Recursively converts empty struct arrays to single-field struct arrays.""" if arr.type == _FixEmptyStructArrays.get_empty_struct_type(): return pa.array( [ @@ -84,7 +83,7 @@ def ensure_array(arr: pa.Array) -> pa.Array: def remove_empty_struct_placeholders(arr: pa.Array): - """Recursively removes the empty struct placeholders placed by _FixEmptyStructArrays.ensure_array""" + """Recursively removes the empty struct placeholders placed by _FixEmptyStructArrays.ensure_array.""" if arr.type == _FixEmptyStructArrays.get_single_field_struct_type(): return pa.array( [{} if valid.as_py() else None for valid in arr.is_valid()], @@ -118,10 +117,7 @@ class _FixSliceOffsets: @staticmethod def ensure_table(arrow_table: pa.Table) -> pa.Table: - """ - Ensures that table-level slice offsets are properly propagated to child arrays - to prevent them from being dropped upon record batch conversion and FFI transfer. - """ + """Ensures that table-level slice offsets are properly propagated to child arrays to prevent them from being dropped upon record batch conversion and FFI transfer.""" arrow_schema = arrow_table.schema for idx, name in enumerate(arrow_schema.names): field = arrow_schema.field(name) @@ -133,10 +129,7 @@ def ensure_table(arrow_table: pa.Table) -> pa.Table: @staticmethod def ensure_chunked_array(chunked_array: pa.ChunkedArray) -> pa.ChunkedArray: - """ - Ensures that chunked-array-level slice offsets are properly propagated to child arrays - to prevent them from being dropped upon record batch conversion and FFI transfer. - """ + """Ensures that chunked-array-level slice offsets are properly propagated to child arrays to prevent them from being dropped upon record batch conversion and FFI transfer.""" if _FixSliceOffsets._chunked_array_needs_slice_offset_propagation(chunked_array): return _FixSliceOffsets._propagate_chunked_array_slice_offsets(chunked_array) else: @@ -144,10 +137,7 @@ def ensure_chunked_array(chunked_array: pa.ChunkedArray) -> pa.ChunkedArray: @staticmethod def ensure_array(array: pa.Array) -> pa.Array: - """ - Ensures that array-level slice offsets are properly propagated to child arrays - to prevent them from being dropped upon record batch conversion and FFI transfer. - """ + """Ensures that array-level slice offsets are properly propagated to child arrays to prevent them from being dropped upon record batch conversion and FFI transfer.""" if _FixSliceOffsets._array_needs_slice_offset_propagation(array): return _FixSliceOffsets._propagate_array_slice_offsets(array) else: @@ -155,8 +145,7 @@ def ensure_array(array: pa.Array) -> pa.Array: @staticmethod def _chunked_array_needs_slice_offset_propagation(chunked_array: pa.ChunkedArray) -> bool: - """ - Whether an Arrow ChunkedArray needs slice offset propagation. + """Whether an Arrow ChunkedArray needs slice offset propagation. This is currently only true for struct arrays and fixed-size list arrays that contain slice offsets/truncations. @@ -167,8 +156,7 @@ def _chunked_array_needs_slice_offset_propagation(chunked_array: pa.ChunkedArray @staticmethod def _array_needs_slice_offset_propagation(array: pa.Array) -> bool: - """ - Whether an Arrow array needs slice offset propagation. 
+ """Whether an Arrow array needs slice offset propagation. This is currently only true for struct arrays and fixed-size list arrays that contain slice offsets/truncations. @@ -182,9 +170,7 @@ def _array_needs_slice_offset_propagation(array: pa.Array) -> bool: @staticmethod def _struct_array_needs_slice_offset_propagation(array: pa.StructArray) -> bool: - """ - Whether the provided struct array needs slice offset propagation. - """ + """Whether the provided struct array needs slice offset propagation.""" assert isinstance(array, pa.StructArray) # TODO(Clark): Only propagate slice offsets if a slice exists; checking whether the # array length has been truncated is currently difficult since StructArray.field() @@ -198,17 +184,13 @@ def _struct_array_needs_slice_offset_propagation(array: pa.StructArray) -> bool: @staticmethod def _fixed_size_list_array_needs_slice_offset_propagation(array: pa.FixedSizeListArray) -> bool: - """ - Whether the provided fixed-size list array needs slice offset propagation. - """ + """Whether the provided fixed-size list array needs slice offset propagation.""" assert isinstance(array, pa.FixedSizeListArray) return array.offset > 0 or len(array) < array.type.list_size * len(array.values) @staticmethod def _propagate_chunked_array_slice_offsets(chunked_array: pa.ChunkedArray) -> pa.ChunkedArray: - """ - Propagate slice offsets for the provided chunked array to the child arrays of each chunk. - """ + """Propagate slice offsets for the provided chunked array to the child arrays of each chunk.""" new_chunks = [] # Flatten each chunk to propagate slice offsets to child arrays. for chunk in chunked_array.chunks: @@ -218,9 +200,7 @@ def _propagate_chunked_array_slice_offsets(chunked_array: pa.ChunkedArray) -> pa @staticmethod def _propagate_array_slice_offsets(array: pa.Array) -> pa.Array: - """ - Propagate slice offsets for the provided array to its child arrays. - """ + """Propagate slice offsets for the provided array to its child arrays.""" assert _FixSliceOffsets._array_needs_slice_offset_propagation(array) dtype = array.type if pa.types.is_struct(dtype): @@ -246,8 +226,7 @@ def _propagate_array_slice_offsets(array: pa.Array) -> pa.Array: @staticmethod def _slice_bitmap_buffer(buf: pa.Buffer, offset: int, length: int) -> pa.Buffer: - """ - Slice the provided bitpacked boolean bitmap buffer at the given offset and length. + """Slice the provided bitpacked boolean bitmap buffer at the given offset and length. This function takes care of the byte and bit offset bookkeeping required due to the buffer being bitpacked. diff --git a/daft/catalog/__init__.py b/daft/catalog/__init__.py index 438fd369d0..201c1f9b47 100644 --- a/daft/catalog/__init__.py +++ b/daft/catalog/__init__.py @@ -64,8 +64,8 @@ __all__ = [ "read_table", "register_python_catalog", - "unregister_catalog", "register_table", + "unregister_catalog", ] # Forward imports from the native catalog which don't require Python wrappers @@ -73,7 +73,7 @@ def read_table(name: str) -> DataFrame: - """Finds a table with the specified name and reads it as a DataFrame + """Finds a table with the specified name and reads it as a DataFrame. The provided name can be any of the following, and Daft will return them with the following order of priority: @@ -113,7 +113,7 @@ def register_table(name: str, dataframe: DataFrame) -> str: def register_python_catalog(catalog: PyIcebergCatalog | UnityCatalog, name: str | None = None) -> str: - """Registers a Python catalog with Daft + """Registers a Python catalog with Daft. 
Currently supports: diff --git a/daft/catalog/python_catalog.py b/daft/catalog/python_catalog.py index dc911f3766..2a0f942eac 100644 --- a/daft/catalog/python_catalog.py +++ b/daft/catalog/python_catalog.py @@ -8,7 +8,7 @@ class PythonCatalog: - """Wrapper class for various Python implementations of Data Catalogs""" + """Wrapper class for various Python implementations of Data Catalogs.""" @abstractmethod def list_tables(self, prefix: str) -> list[str]: ... @@ -18,7 +18,7 @@ def load_table(self, name: str) -> PythonCatalogTable: ... class PythonCatalogTable: - """Wrapper class for various Python implementations of Data Catalog Tables""" + """Wrapper class for various Python implementations of Data Catalog Tables.""" @abstractmethod def to_dataframe(self) -> DataFrame: ... diff --git a/daft/context.py b/daft/context.py index 7f9b8b1ae6..0b071a431b 100644 --- a/daft/context.py +++ b/daft/context.py @@ -42,7 +42,7 @@ class _RayRunnerConfig(_RunnerConfig): def _get_runner_config_from_env() -> _RunnerConfig: - """Retrieves the appropriate RunnerConfig from environment variables + """Retrieves the appropriate RunnerConfig from environment variables. To use: @@ -121,7 +121,7 @@ def _get_runner_config_from_env() -> _RunnerConfig: @dataclasses.dataclass class DaftContext: - """Global context for the current Daft execution environment""" + """Global context for the current Daft execution environment.""" # When a dataframe is executed, this config is copied into the Runner # which then keeps track of a per-unique-execution-ID copy of the config, using it consistently throughout the execution @@ -204,7 +204,7 @@ def set_runner_ray( max_task_backlog: int | None = None, force_client_mode: bool = False, ) -> DaftContext: - """Set the runner for executing Daft dataframes to a Ray cluster + """Set the runner for executing Daft dataframes to a Ray cluster. Alternatively, users can set this behavior via environment variables: @@ -222,7 +222,6 @@ def set_runner_ray( Returns: DaftContext: Daft context after setting the Ray runner """ - ctx = get_context() with ctx._lock: if ctx._runner is not None: @@ -283,7 +282,7 @@ def set_runner_native() -> DaftContext: @contextlib.contextmanager def planning_config_ctx(**kwargs): - """Context manager that wraps set_planning_config to reset the config to its original setting afternwards""" + """Context manager that wraps set_planning_config to reset the config to its original setting afternwards.""" original_config = get_context().daft_planning_config try: set_planning_config(**kwargs) @@ -296,8 +295,9 @@ def set_planning_config( config: PyDaftPlanningConfig | None = None, default_io_config: IOConfig | None = None, ) -> DaftContext: - """Globally sets various configuration parameters which control Daft plan construction behavior. These configuration values - are used when a Dataframe is being constructed (e.g. calls to create a Dataframe, or to build on an existing Dataframe) + """Globally sets various configuration parameters which control Daft plan construction behavior. + + These configuration values are used when a Dataframe is being constructed (e.g. calls to create a Dataframe, or to build on an existing Dataframe). Args: config: A PyDaftPlanningConfig object to set the config to, before applying other kwargs. 
Defaults to None which indicates @@ -319,7 +319,7 @@ def set_planning_config( @contextlib.contextmanager def execution_config_ctx(**kwargs): - """Context manager that wraps set_execution_config to reset the config to its original setting afternwards""" + """Context manager that wraps set_execution_config to reset the config to its original setting afternwards.""" original_config = get_context().daft_execution_config try: set_execution_config(**kwargs) @@ -352,8 +352,10 @@ def set_execution_config( pre_shuffle_merge_threshold: int | None = None, enable_ray_tracing: bool | None = None, ) -> DaftContext: - """Globally sets various configuration parameters which control various aspects of Daft execution. These configuration values - are used when a Dataframe is executed (e.g. calls to `.write_*`, `.collect()` or `.show()`) + """Globally sets various configuration parameters which control various aspects of Daft execution. + + These configuration values + are used when a Dataframe is executed (e.g. calls to `.write_*`, `.collect()` or `.show()`). Args: config: A PyDaftExecutionConfig object to set the config to, before applying other kwargs. Defaults to None which indicates diff --git a/daft/daft/__init__.pyi b/daft/daft/__init__.pyi index a91477d5e9..47e88d9afd 100644 --- a/daft/daft/__init__.pyi +++ b/daft/daft/__init__.pyi @@ -17,8 +17,7 @@ if TYPE_CHECKING: from pyiceberg.table import TableProperties as IcebergTableProperties class ImageMode(Enum): - """ - Supported image modes for Daft's image type. + """Supported image modes for Daft's image type. .. warning:: Currently, only the 8-bit modes (L, LA, RGB, RGBA) can be stored in a DataFrame. @@ -58,8 +57,7 @@ class ImageMode(Enum): @staticmethod def from_mode_string(mode: str) -> ImageMode: - """ - Create an ImageMode from its string representation. + """Create an ImageMode from its string representation. Args: mode: String representation of the mode. This is the same as the enum @@ -69,9 +67,7 @@ class ImageMode(Enum): ... class ImageFormat(Enum): - """ - Supported image formats for Daft's image I/O. - """ + """Supported image formats for Daft's image I/O.""" PNG: int JPEG: int @@ -81,15 +77,11 @@ class ImageFormat(Enum): @staticmethod def from_format_string(mode: str) -> ImageFormat: - """ - Create an ImageFormat from its string representation. - """ + """Create an ImageFormat from its string representation.""" ... class JoinType(Enum): - """ - Type of a join operation. - """ + """Type of a join operation.""" Inner: int Left: int @@ -100,8 +92,7 @@ class JoinType(Enum): @staticmethod def from_join_type_str(join_type: str) -> JoinType: - """ - Create a JoinType from its string representation. + """Create a JoinType from its string representation. Args: join_type: String representation of the join type. This is the same as the enum @@ -111,9 +102,7 @@ class JoinType(Enum): ... class JoinStrategy(Enum): - """ - Join strategy (algorithm) to use. - """ + """Join strategy (algorithm) to use.""" Hash: int SortMerge: int @@ -121,8 +110,7 @@ class JoinStrategy(Enum): @staticmethod def from_join_strategy_str(join_strategy: str) -> JoinStrategy: - """ - Create a JoinStrategy from its string representation. + """Create a JoinStrategy from its string representation. Args: join_strategy: String representation of the join strategy. This is the same as the enum @@ -136,8 +124,7 @@ class JoinSide(Enum): Right: int class CountMode(Enum): - """ - Supported count modes for Daft's count aggregation. + """Supported count modes for Daft's count aggregation. 
| All - Count both non-null and null values. | Valid - Count only valid values. @@ -150,8 +137,7 @@ class CountMode(Enum): @staticmethod def from_count_mode_str(count_mode: str) -> CountMode: - """ - Create a CountMode from its string representation. + """Create a CountMode from its string representation. Args: count_mode: String representation of the count mode , e.g. "all", "valid", or "null". @@ -159,9 +145,7 @@ class CountMode(Enum): ... class ResourceRequest: - """ - Resource request for a query fragment task. - """ + """Resource request for a query fragment task.""" num_cpus: float | None num_gpus: float | None @@ -188,9 +172,7 @@ class ResourceRequest: def __ne__(self, other: ResourceRequest) -> bool: ... # type: ignore[override] class FileFormat(Enum): - """ - Format of a file, e.g. Parquet, CSV, and JSON. - """ + """Format of a file, e.g. Parquet, CSV, and JSON.""" Parquet: int Csv: int @@ -199,9 +181,7 @@ class FileFormat(Enum): def ext(self): ... class ParquetSourceConfig: - """ - Configuration of a Parquet data source. - """ + """Configuration of a Parquet data source.""" coerce_int96_timestamp_unit: PyTimeUnit | None field_id_mapping: dict[int, PyField] | None @@ -217,9 +197,7 @@ class ParquetSourceConfig: ): ... class CsvSourceConfig: - """ - Configuration of a CSV data source. - """ + """Configuration of a CSV data source.""" delimiter: str | None has_headers: bool @@ -245,9 +223,7 @@ class CsvSourceConfig: ): ... class JsonSourceConfig: - """ - Configuration of a JSON data source. - """ + """Configuration of a JSON data source.""" buffer_size: int | None chunk_size: int | None @@ -259,9 +235,7 @@ class JsonSourceConfig: ): ... class DatabaseSourceConfig: - """ - Configuration of a database data source. - """ + """Configuration of a database data source.""" sql: str conn: SQLConnection @@ -269,53 +243,39 @@ class DatabaseSourceConfig: def __init__(self, sql: str, conn_factory: SQLConnection): ... class FileFormatConfig: - """ - Configuration for parsing a particular file format (Parquet, CSV, JSON). - """ + """Configuration for parsing a particular file format (Parquet, CSV, JSON).""" config: ParquetSourceConfig | CsvSourceConfig | JsonSourceConfig | DatabaseSourceConfig @staticmethod def from_parquet_config(config: ParquetSourceConfig) -> FileFormatConfig: - """ - Create a Parquet file format config. - """ + """Create a Parquet file format config.""" ... @staticmethod def from_csv_config(config: CsvSourceConfig) -> FileFormatConfig: - """ - Create a CSV file format config. - """ + """Create a CSV file format config.""" ... @staticmethod def from_json_config(config: JsonSourceConfig) -> FileFormatConfig: - """ - Create a JSON file format config. - """ + """Create a JSON file format config.""" ... @staticmethod def from_database_config(config: DatabaseSourceConfig) -> FileFormatConfig: - """ - Create a database file format config. - """ + """Create a database file format config.""" ... def file_format(self) -> FileFormat: - """ - Get the file format for this config. - """ + """Get the file format for this config.""" ... def __eq__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] def __ne__(self, other: FileFormatConfig) -> bool: ... # type: ignore[override] class CsvConvertOptions: - """ - Options for converting CSV data to Daft data. - """ + """Options for converting CSV data to Daft data.""" limit: int | None include_columns: list[str] | None @@ -333,9 +293,7 @@ class CsvConvertOptions: ): ... class CsvParseOptions: - """ - Options for parsing CSV files. 
- """ + """Options for parsing CSV files.""" has_header: bool delimiter: str | None @@ -357,9 +315,7 @@ class CsvParseOptions: ): ... class CsvReadOptions: - """ - Options for reading CSV files. - """ + """Options for reading CSV files.""" buffer_size: int | None chunk_size: int | None @@ -371,9 +327,7 @@ class CsvReadOptions: ): ... class JsonConvertOptions: - """ - Options for converting JSON data to Daft data. - """ + """Options for converting JSON data to Daft data.""" limit: int | None include_columns: list[str] | None @@ -387,14 +341,10 @@ class JsonConvertOptions: ): ... class JsonParseOptions: - """ - Options for parsing JSON files. - """ + """Options for parsing JSON files.""" class JsonReadOptions: - """ - Options for reading JSON files. - """ + """Options for reading JSON files.""" buffer_size: int | None chunk_size: int | None @@ -406,18 +356,14 @@ class JsonReadOptions: ): ... class FileInfo: - """ - Metadata for a single file. - """ + """Metadata for a single file.""" file_path: str file_size: int | None num_rows: int | None class FileInfos: - """ - Metadata for a collection of files. - """ + """Metadata for a collection of files.""" file_paths: list[str] file_sizes: list[int | None] @@ -426,27 +372,21 @@ class FileInfos: @staticmethod def from_infos(file_paths: list[str], file_sizes: list[int | None], num_rows: list[int | None]) -> FileInfos: ... def extend(self, new_infos: FileInfos) -> FileInfos: - """ - Concatenate two FileInfos together. - """ + """Concatenate two FileInfos together.""" ... def __getitem__(self, idx: int) -> FileInfo: ... def __len__(self) -> int: ... class HTTPConfig: - """ - I/O configuration for accessing HTTP systems - """ + """I/O configuration for accessing HTTP systems.""" bearer_token: str | None def __init__(self, bearer_token: str | None = None): ... class S3Config: - """ - I/O configuration for accessing an S3-compatible system. - """ + """I/O configuration for accessing an S3-compatible system.""" region_name: str | None endpoint_url: str | None @@ -513,12 +453,12 @@ class S3Config: force_virtual_addressing: bool | None = None, profile_name: str | None = None, ) -> S3Config: - """Replaces values if provided, returning a new S3Config""" + """Replaces values if provided, returning a new S3Config.""" ... @staticmethod def from_env() -> S3Config: - """Creates an S3Config, retrieving credentials and configurations from the current environment""" + """Creates an S3Config, retrieving credentials and configurations from the current environment.""" ... class S3Credentials: @@ -536,9 +476,7 @@ class S3Credentials: ): ... class AzureConfig: - """ - I/O configuration for accessing Azure Blob Storage. - """ + """I/O configuration for accessing Azure Blob Storage.""" storage_account: str | None access_key: str | None @@ -580,13 +518,11 @@ class AzureConfig: endpoint_url: str | None = None, use_ssl: bool | None = None, ) -> AzureConfig: - """Replaces values if provided, returning a new AzureConfig""" + """Replaces values if provided, returning a new AzureConfig.""" ... class GCSConfig: - """ - I/O configuration for accessing Google Cloud Storage. - """ + """I/O configuration for accessing Google Cloud Storage.""" project_id: str | None credentials: str | None @@ -622,13 +558,11 @@ class GCSConfig: read_timeout_ms: int | None = None, num_tries: int | None = None, ) -> GCSConfig: - """Replaces values if provided, returning a new GCSConfig""" + """Replaces values if provided, returning a new GCSConfig.""" ... 
class IOConfig: - """ - Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems. - """ + """Configuration for the native I/O layer, e.g. credentials for accessing cloud storage systems.""" s3: S3Config azure: AzureConfig @@ -649,13 +583,11 @@ class IOConfig: gcs: GCSConfig | None = None, http: HTTPConfig | None = None, ) -> IOConfig: - """Replaces values if provided, returning a new IOConfig""" + """Replaces values if provided, returning a new IOConfig.""" ... class NativeStorageConfig: - """ - Storage configuration for the Rust-native I/O layer. - """ + """Storage configuration for the Rust-native I/O layer.""" # Whether or not to use a multithreaded tokio runtime for processing I/O multithreaded_io: bool @@ -664,52 +596,37 @@ class NativeStorageConfig: def __init__(self, multithreaded_io: bool, io_config: IOConfig): ... class PythonStorageConfig: - """ - Storage configuration for the legacy Python I/O layer. - """ + """Storage configuration for the legacy Python I/O layer.""" io_config: IOConfig def __init__(self, io_config: IOConfig): ... class StorageConfig: - """ - Configuration for interacting with a particular storage backend, using a particular - I/O layer implementation. - """ + """Configuration for interacting with a particular storage backend, using a particular I/O layer implementation.""" @staticmethod def native(config: NativeStorageConfig) -> StorageConfig: - """ - Create from a native storage config. - """ + """Create from a native storage config.""" ... @staticmethod def python(config: PythonStorageConfig) -> StorageConfig: - """ - Create from a Python storage config. - """ + """Create from a Python storage config.""" ... @property def config(self) -> NativeStorageConfig | PythonStorageConfig: ... class ScanTask: - """ - A batch of scan tasks for reading data from an external source. - """ + """A batch of scan tasks for reading data from an external source.""" def num_rows(self) -> int: - """ - Get number of rows that will be scanned by this ScanTask. - """ + """Get number of rows that will be scanned by this ScanTask.""" ... def estimate_in_memory_size_bytes(self, cfg: PyDaftExecutionConfig) -> int: - """ - Estimate the In Memory Size of this ScanTask. - """ + """Estimate the In Memory Size of this ScanTask.""" ... @staticmethod @@ -725,9 +642,7 @@ class ScanTask: partition_values: PyTable | None, stats: PyTable | None, ) -> ScanTask | None: - """ - Create a Catalog Scan Task - """ + """Create a Catalog Scan Task.""" ... @staticmethod @@ -741,9 +656,7 @@ class ScanTask: pushdowns: Pushdowns | None, stats: PyTable | None, ) -> ScanTask: - """ - Create a SQL Scan Task - """ + """Create a SQL Scan Task.""" ... @staticmethod @@ -757,15 +670,11 @@ class ScanTask: pushdowns: Pushdowns | None, stats: PyTable | None, ) -> ScanTask: - """ - Create a Python factory function Scan Task - """ + """Create a Python factory function Scan Task.""" ... class ScanOperatorHandle: - """ - A handle to a scan operator. - """ + """A handle to a scan operator.""" @staticmethod def anonymous_scan( @@ -790,9 +699,7 @@ class ScanOperatorHandle: def logical_plan_table_scan(scan_operator: ScanOperatorHandle) -> LogicalPlanBuilder: ... class PartitionField: - """ - Partitioning Field of a Scan Source such as Hive or Iceberg - """ + """Partitioning Field of a Scan Source such as Hive or Iceberg.""" field: PyField @@ -804,9 +711,7 @@ class PartitionField: ) -> None: ... 
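The reshuffled __all__ lists in the daft/__init__.py and daft/catalog/__init__.py hunks earlier in this diff are the work of RUF022, one of the rules added to extend-select in .ruff.toml, which requires __all__ to be kept in sorted order. A hedged sketch with an invented module's exports rather than Daft's:

# Flagged by RUF022: __all__ is not sorted.
__all__ = [
    "read_csv",
    "DataFrame",
    "col",
]

# After re-sorting (the ordering is case-sensitive, which is why the
# uppercase class names end up ahead of the lowercase function names in
# the daft/__init__.py hunk above):
__all__ = [
    "DataFrame",
    "col",
    "read_csv",
]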
class PartitionTransform: - """ - Partitioning Transform from a Data Catalog source field to a Partitioning Columns - """ + """Partitioning Transform from a Data Catalog source field to a Partitioning Columns.""" @staticmethod def identity() -> PartitionTransform: ... @@ -824,9 +729,7 @@ class PartitionTransform: def iceberg_truncate(w: int) -> PartitionTransform: ... class Pushdowns: - """ - Pushdowns from the query optimizer that can optimize scanning data sources. - """ + """Pushdowns from the query optimizer that can optimize scanning data sources.""" columns: list[str] | None filters: PyExpr | None @@ -1653,9 +1556,7 @@ class PyMicroPartition: ): ... class PhysicalPlanScheduler: - """ - A work scheduler for physical query plans. - """ + """A work scheduler for physical query plans.""" @staticmethod def from_logical_plan_builder( @@ -1672,9 +1573,7 @@ class PhysicalPlanScheduler: def run(self, psets: dict[str, list[PartitionT]]) -> Iterator[PyMicroPartition]: ... class AdaptivePhysicalPlanScheduler: - """ - An adaptive Physical Plan Scheduler. - """ + """An adaptive Physical Plan Scheduler.""" @staticmethod def from_logical_plan_builder( @@ -1695,9 +1594,9 @@ class AdaptivePhysicalPlanScheduler: ) -> None: ... class LogicalPlanBuilder: - """ - A logical plan builder, which simplifies constructing logical plans via - a fluent interface. E.g., LogicalPlanBuilder.table_scan(..).project(..).filter(..). + """A logical plan builder, which simplifies constructing logical plans via a fluent interface. + + E.g., LogicalPlanBuilder.table_scan(..).project(..).filter(..). This builder holds the current root (sink) of the logical plan, and the building methods return a brand new builder holding a new plan; i.e., this is an immutable builder. @@ -1905,9 +1804,7 @@ def io_glob( ) -> list[dict]: ... class SystemInfo: - """ - Accessor for system information. - """ + """Accessor for system information.""" def __init__(self) -> None: ... def total_memory(self) -> int: ... diff --git a/daft/dataframe/dataframe.py b/daft/dataframe/dataframe.py index 74f7a21348..eb07238ceb 100644 --- a/daft/dataframe/dataframe.py +++ b/daft/dataframe/dataframe.py @@ -92,12 +92,16 @@ def to_logical_plan_builder(*parts: MicroPartition) -> LogicalPlanBuilder: class DataFrame: - """A Daft DataFrame is a table of data. It has columns, where each column has a type and the same + """A Daft DataFrame is a table of data. + + It has columns, where each column has a type and the same number of items (rows) as all other columns. """ def __init__(self, builder: LogicalPlanBuilder) -> None: - """Constructs a DataFrame according to a given LogicalPlan. Users are expected instead to call + """Constructs a DataFrame according to a given LogicalPlan. + + Users are expected instead to call the classmethods on DataFrame to create a DataFrame. Args: @@ -157,6 +161,7 @@ def explain( self, show_all: bool = False, format: str = "ascii", simple: bool = False, file: Optional[io.IOBase] = None ) -> Any: """Prints the (logical and physical) plans that will be executed to produce this DataFrame. + Defaults to showing the unoptimized logical plan. Use ``show_all=True`` to show the unoptimized logical plan, the optimized logical plan, and the physical plan. @@ -219,7 +224,7 @@ def num_partitions(self) -> int: @DataframePublicAPI def schema(self) -> Schema: - """Returns the Schema of the DataFrame, which provides information about each column + """Returns the Schema of the DataFrame, which provides information about each column. 
Returns: Schema: schema of the DataFrame @@ -271,7 +276,6 @@ def iter_rows( The default value is the total number of CPUs available on the current machine. Example: - >>> import daft >>> >>> df = daft.from_pydict({"foo": [1, 2, 3], "bar": ["a", "b", "c"]}) @@ -320,9 +324,7 @@ def to_arrow_iter( self, results_buffer_size: Union[Optional[int], Literal["num_cpus"]] = "num_cpus", ) -> Iterator["pyarrow.RecordBatch"]: - """ - Return an iterator of pyarrow recordbatches for this dataframe. - """ + """Return an iterator of pyarrow recordbatches for this dataframe.""" for name in self.schema().column_names(): if self.schema()[name].dtype._is_python_type(): raise ValueError( @@ -552,7 +554,7 @@ def write_parquet( partition_cols: Optional[List[ColumnInputType]] = None, io_config: Optional[IOConfig] = None, ) -> "DataFrame": - """Writes the DataFrame as parquet files, returning a new DataFrame with paths to the files that were written + """Writes the DataFrame as parquet files, returning a new DataFrame with paths to the files that were written. Files will be written to ``/*`` with randomly generated UUIDs as the file names. @@ -624,7 +626,7 @@ def write_csv( partition_cols: Optional[List[ColumnInputType]] = None, io_config: Optional[IOConfig] = None, ) -> "DataFrame": - """Writes the DataFrame as CSV files, returning a new DataFrame with paths to the files that were written + """Writes the DataFrame as CSV files, returning a new DataFrame with paths to the files that were written. Files will be written to ``/*`` with randomly generated UUIDs as the file names. @@ -684,6 +686,7 @@ def write_csv( @DataframePublicAPI def write_iceberg(self, table: "pyiceberg.table.Table", mode: str = "append") -> "DataFrame": """Writes the DataFrame to an `Iceberg `__ table, returning a new DataFrame with the operations that occurred. + Can be run in either `append` or `overwrite` mode which will either appends the rows in the DataFrame or will delete the existing rows and then append the DataFrame rows respectively. .. NOTE:: @@ -696,7 +699,6 @@ def write_iceberg(self, table: "pyiceberg.table.Table", mode: str = "append") -> Returns: DataFrame: The operations that occurred with this write. """ - import pyarrow as pa import pyiceberg from packaging.version import parse @@ -857,7 +859,6 @@ def write_deltalake( Returns: DataFrame: The operations that occurred with this write. """ - import json import deltalake @@ -1028,22 +1029,21 @@ def write_lance( io_config: Optional[IOConfig] = None, **kwargs, ) -> "DataFrame": - """ - Writes the DataFrame to a Lance table + """Writes the DataFrame to a Lance table. + Note: `write_lance` requires python 3.9 or higher Args: uri: The URI of the Lance table to write to mode: The write mode. One of "create", "append", or "overwrite" io_config (IOConfig, optional): configurations to use when interacting with remote storage. - **kwargs: Additional keyword arguments to pass to the Lance writer + **kwargs: Additional keyword arguments to pass to the Lance writer. 
+ Example: -------- - - >>> import daft >>> df = daft.from_pydict({"a": [1, 2, 3, 4]}) - >>> df.write_lance("/tmp/lance/my_table.lance") # doctest: +SKIP + >>> df.write_lance("/tmp/lance/my_table.lance") # doctest: +SKIP ╭───────────────┬──────────────────┬─────────────────┬─────────╮ │ num_fragments ┆ num_deleted_rows ┆ num_small_files ┆ version │ │ --- ┆ --- ┆ --- ┆ --- │ @@ -1054,7 +1054,7 @@ def write_lance( (Showing first 1 of 1 rows) - >>> daft.read_lance("/tmp/lance/my_table.lance").collect() # doctest: +SKIP + >>> daft.read_lance("/tmp/lance/my_table.lance").collect() # doctest: +SKIP ╭───────╮ │ a │ │ --- │ @@ -1074,7 +1074,7 @@ def write_lance( # Pass additional keyword arguments to the Lance writer # All additional keyword arguments are passed to `lance.write_fragments` - >>> df.write_lance("/tmp/lance/my_table.lance", mode="overwrite", max_bytes_per_file=1024) # doctest: +SKIP + >>> df.write_lance("/tmp/lance/my_table.lance", mode="overwrite", max_bytes_per_file=1024) # doctest: +SKIP ╭───────────────┬──────────────────┬─────────────────┬─────────╮ │ num_fragments ┆ num_deleted_rows ┆ num_small_files ┆ version │ │ --- ┆ --- ┆ --- ┆ --- │ @@ -1169,23 +1169,21 @@ def _is_column_input(self, x: Any) -> bool: return isinstance(x, str) or isinstance(x, Expression) def _column_inputs_to_expressions(self, columns: ManyColumnsInputType) -> List[Expression]: - """ - Inputs to dataframe operations can be passed in as individual arguments or an iterable. + """Inputs to dataframe operations can be passed in as individual arguments or an iterable. + In addition, they may be strings or Expressions. This method normalizes the inputs to a list of Expressions. """ - column_iter: Iterable[ColumnInputType] = [columns] if self._is_column_input(columns) else columns # type: ignore return [col(c) if isinstance(c, str) else c for c in column_iter] def _wildcard_inputs_to_expressions(self, columns: Tuple[ManyColumnsInputType, ...]) -> List[Expression]: - """Handles wildcard argument column inputs""" - + """Handles wildcard argument column inputs.""" column_input: Iterable[ColumnInputType] = columns[0] if len(columns) == 1 else columns # type: ignore return self._column_inputs_to_expressions(column_input) def __getitem__(self, item: Union[slice, int, str, Iterable[Union[str, int]]]) -> Union[Expression, "DataFrame"]: - """Gets a column from the DataFrame as an Expression (``df["mycol"]``)""" + """Gets a column from the DataFrame as an Expression (``df["mycol"]``).""" result: Optional[Expression] if isinstance(item, int): @@ -1257,18 +1255,17 @@ def _add_monotonically_increasing_id(self, column_name: Optional[str] = None) -> Returns: DataFrame: DataFrame with a new column of monotonically increasing ids. """ - builder = self._builder.add_monotonically_increasing_id(column_name) return DataFrame(builder) @DataframePublicAPI def select(self, *columns: ColumnInputType) -> "DataFrame": - """Creates a new DataFrame from the provided expressions, similar to a SQL ``SELECT`` + """Creates a new DataFrame from the provided expressions, similar to a SQL ``SELECT``. 
Examples: >>> import daft >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) - >>> df = df.select('x', daft.col('y'), daft.col('z') + 1) + >>> df = df.select("x", daft.col("y"), daft.col("z") + 1) >>> df.show() ╭───────┬───────┬───────╮ │ x ┆ y ┆ z │ @@ -1296,7 +1293,7 @@ def select(self, *columns: ColumnInputType) -> "DataFrame": @DataframePublicAPI def distinct(self) -> "DataFrame": - """Computes unique rows, dropping duplicates + """Computes unique rows, dropping duplicates. Example: >>> import daft @@ -1329,7 +1326,7 @@ def sample( with_replacement: bool = False, seed: Optional[int] = None, ) -> "DataFrame": - """Samples a fraction of rows from the DataFrame + """Samples a fraction of rows from the DataFrame. Example: >>> import daft @@ -1363,14 +1360,14 @@ def sample( @DataframePublicAPI def exclude(self, *names: str) -> "DataFrame": - """Drops columns from the current DataFrame by name + """Drops columns from the current DataFrame by name. This is equivalent of performing a select with all the columns but the ones excluded. Example: >>> import daft >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) - >>> df_without_x = df.exclude('x') + >>> df_without_x = df.exclude("x") >>> df_without_x.show() ╭───────┬───────╮ │ y ┆ z │ @@ -1417,10 +1414,9 @@ def where(self, predicate: Union[Expression, str]) -> "DataFrame": """Filters rows via a predicate expression, similar to SQL ``WHERE``. Example: - >>> import daft >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 6, 6], "z": [7, 8, 9]}) - >>> df.where((col('x') > 1) & (col('y') > 1)).collect() + >>> df.where((col("x") > 1) & (col("y") > 1)).collect() ╭───────┬───────┬───────╮ │ x ┆ y ┆ z │ │ --- ┆ --- ┆ --- │ @@ -1470,13 +1466,12 @@ def with_column( column_name: str, expr: Expression, ) -> "DataFrame": - """Adds a column to the current DataFrame with an Expression, equivalent to a ``select`` - with all current columns and the new one + """Adds a column to the current DataFrame with an Expression, equivalent to a ``select`` with all current columns and the new one. Example: >>> import daft >>> df = daft.from_pydict({"x": [1, 2, 3]}) - >>> new_df = df.with_column('x+1', col('x') + 1) + >>> new_df = df.with_column("x+1", col("x") + 1) >>> new_df.show() ╭───────┬───────╮ │ x ┆ x+1 │ @@ -1506,13 +1501,12 @@ def with_columns( self, columns: Dict[str, Expression], ) -> "DataFrame": - """Adds columns to the current DataFrame with Expressions, equivalent to a ``select`` - with all current columns and the new ones + """Adds columns to the current DataFrame with Expressions, equivalent to a ``select`` with all current columns and the new ones. Example: >>> import daft - >>> df = daft.from_pydict({'x': [1, 2, 3], 'y': [4, 5, 6]}) - >>> new_df = df.with_columns({'foo': df['x'] + 1,'bar': df['y'] - df['x']}) + >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}) + >>> new_df = df.with_columns({"foo": df["x"] + 1, "bar": df["y"] - df["x"]}) >>> new_df.show() ╭───────┬───────┬───────┬───────╮ │ x ┆ y ┆ foo ┆ bar │ @@ -1545,7 +1539,7 @@ def sort( by: Union[ColumnInputType, List[ColumnInputType]], desc: Union[bool, List[bool]] = False, ) -> "DataFrame": - """Sorts DataFrame globally + """Sorts DataFrame globally. Note: * Since this a global sort, this requires an expensive repartition which can be quite slow. 
@@ -1554,7 +1548,7 @@ def sort( Example: >>> import daft >>> df = daft.from_pydict({"x": [3, 2, 1], "y": [6, 4, 5]}) - >>> sorted_df = df.sort(col('x') + col('y')) + >>> sorted_df = df.sort(col("x") + col("y")) >>> sorted_df.show() ╭───────┬───────╮ │ x ┆ y │ @@ -1610,12 +1604,12 @@ def sort( @DataframePublicAPI def limit(self, num: int) -> "DataFrame": - """Limits the rows in the DataFrame to the first ``N`` rows, similar to a SQL ``LIMIT`` + """Limits the rows in the DataFrame to the first ``N`` rows, similar to a SQL ``LIMIT``. Example: >>> import daft >>> df = df = daft.from_pydict({"x": [1, 2, 3, 4, 5, 6, 7]}) - >>> df_limited = df.limit(5) # returns 5 rows + >>> df_limited = df.limit(5) # returns 5 rows >>> df_limited.show() ╭───────╮ │ x │ @@ -1661,7 +1655,7 @@ def count_rows(self) -> int: @DataframePublicAPI def repartition(self, num: Optional[int], *partition_by: ColumnInputType) -> "DataFrame": - """Repartitions DataFrame to ``num`` partitions + """Repartitions DataFrame to ``num`` partitions. If columns are passed in, then DataFrame will be repartitioned by those, otherwise random repartitioning will occur. @@ -1734,7 +1728,7 @@ def join( prefix: Optional[str] = None, suffix: Optional[str] = None, ) -> "DataFrame": - """Column-wise join of the current DataFrame with an ``other`` DataFrame, similar to a SQL ``JOIN`` + """Column-wise join of the current DataFrame with an ``other`` DataFrame, similar to a SQL ``JOIN``. If the two DataFrames have duplicate non-join key column names, "right." will be prepended to the conflicting right columns. You can change the behavior by passing either (or both) `prefix` or `suffix` to the function. If `prefix` is passed, it will be prepended to the conflicting right columns. If `suffix` is passed, it will be appended to the conflicting right columns. 
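The doctest churn throughout the dataframe.py hunks (single quotes becoming double quotes, comment spacing, literals reflowed) is not hand-written; it falls out of the docstring-code-format = true setting added under [format] in .ruff.toml, which makes the ruff formatter also format code embedded in docstrings. A small sketch of the kind of rewrite it performs, on an invented preview helper rather than a real DataFrame method:

# Before formatting: the doctest uses single quotes and ad-hoc comment spacing.
def preview(df):
    """Show the first rows.

    Example:
        >>> df = daft.from_pydict({'x': [1, 2, 3]})
        >>> df.limit(2) # returns 2 rows
    """


# After `ruff format` with docstring-code-format enabled: the code inside the
# docstring is normalized like ordinary code (double quotes, two spaces before
# the trailing comment), while the surrounding prose is left untouched.
def preview(df):
    """Show the first rows.

    Example:
        >>> df = daft.from_pydict({"x": [1, 2, 3]})
        >>> df.limit(2)  # returns 2 rows
    """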
@@ -1746,9 +1740,9 @@ def join( Example: >>> import daft >>> from daft import col - >>> df1 = daft.from_pydict({ "a": ["w", "x", "y"], "b": [1, 2, 3] }) - >>> df2 = daft.from_pydict({ "a": ["x", "y", "z"], "b": [20, 30, 40] }) - >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b")/10]) + >>> df1 = daft.from_pydict({"a": ["w", "x", "y"], "b": [1, 2, 3]}) + >>> df2 = daft.from_pydict({"a": ["x", "y", "z"], "b": [20, 30, 40]}) + >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b") / 10]) >>> joined_df.show() ╭──────┬───────┬─────────╮ │ a ┆ b ┆ right.b │ @@ -1764,9 +1758,9 @@ def join( >>> import daft >>> from daft import col - >>> df1 = daft.from_pydict({ "a": ["w", "x", "y"], "b": [1, 2, 3] }) - >>> df2 = daft.from_pydict({ "a": ["x", "y", "z"], "b": [20, 30, 40] }) - >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b")/10], prefix="right_") + >>> df1 = daft.from_pydict({"a": ["w", "x", "y"], "b": [1, 2, 3]}) + >>> df2 = daft.from_pydict({"a": ["x", "y", "z"], "b": [20, 30, 40]}) + >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b") / 10], prefix="right_") >>> joined_df.show() ╭──────┬───────┬─────────╮ │ a ┆ b ┆ right_b │ @@ -1782,9 +1776,9 @@ def join( >>> import daft >>> from daft import col - >>> df1 = daft.from_pydict({ "a": ["w", "x", "y"], "b": [1, 2, 3] }) - >>> df2 = daft.from_pydict({ "a": ["x", "y", "z"], "b": [20, 30, 40] }) - >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b")/10], suffix="_right") + >>> df1 = daft.from_pydict({"a": ["w", "x", "y"], "b": [1, 2, 3]}) + >>> df2 = daft.from_pydict({"a": ["x", "y", "z"], "b": [20, 30, 40]}) + >>> joined_df = df1.join(df2, left_on=[col("a"), col("b")], right_on=[col("a"), col("b") / 10], suffix="_right") >>> joined_df.show() ╭──────┬───────┬─────────╮ │ a ┆ b ┆ b_right │ @@ -1816,7 +1810,6 @@ def join( Returns: DataFrame: Joined DataFrame. """ - if how == "cross": if any(side_on is not None for side_on in [on, left_on, right_on]): raise ValueError("In a cross join, `on`, `left_on`, and `right_on` cannot be set") @@ -1855,7 +1848,9 @@ def join( @DataframePublicAPI def concat(self, other: "DataFrame") -> "DataFrame": - """Concatenates two DataFrames together in a "vertical" concatenation. The resulting DataFrame + """Concatenates two DataFrames together in a "vertical" concatenation. + + The resulting DataFrame has number of rows equal to the sum of the number of rows of the input DataFrames. .. NOTE:: @@ -1879,12 +1874,13 @@ def concat(self, other: "DataFrame") -> "DataFrame": @DataframePublicAPI def drop_nan(self, *cols: ColumnInputType): """Drops rows that contains NaNs. If cols is None it will drop rows with any NaN value. + If column names are supplied, it will drop only those rows that contains NaNs in one of these columns. Example: >>> import daft >>> df = daft.from_pydict({"a": [1.0, 2.2, 3.5, float("nan")]}) - >>> df.drop_nan().collect() # drops rows where any column contains NaN values + >>> df.drop_nan().collect() # drops rows where any column contains NaN values ╭─────────╮ │ a │ │ --- │ @@ -1945,6 +1941,7 @@ def drop_nan(self, *cols: ColumnInputType): @DataframePublicAPI def drop_null(self, *cols: ColumnInputType): """Drops rows that contains NaNs or NULLs. If cols is None it will drop rows with any NULL value. + If column names are supplied, it will drop only those rows that contains NULLs in one of these columns. 
Example: @@ -1979,8 +1976,7 @@ def drop_null(self, *cols: ColumnInputType): @DataframePublicAPI def explode(self, *columns: ColumnInputType) -> "DataFrame": - """Explodes a List column, where every element in each row's List becomes its own row, and all - other columns in the DataFrame are duplicated across rows + """Explodes a List column, where every element in each row's List becomes its own row, and all other columns in the DataFrame are duplicated across rows. If multiple columns are specified, each row must contain the same number of items in each specified column. @@ -2036,11 +2032,13 @@ def unpivot( Example: >>> import daft - >>> df = daft.from_pydict({ - ... "year": [2020, 2021, 2022], - ... "Jan": [10, 30, 50], - ... "Feb": [20, 40, 60], - ... }) + >>> df = daft.from_pydict( + ... { + ... "year": [2020, 2021, 2022], + ... "Jan": [10, 30, 50], + ... "Feb": [20, 40, 60], + ... } + ... ) >>> df = df.unpivot("year", ["Jan", "Feb"], variable_name="month", value_name="inventory") >>> df = df.sort("year") >>> df.show() @@ -2073,7 +2071,7 @@ def unpivot( Returns: DataFrame: Unpivoted DataFrame - See also: + See Also: `melt` """ ids_exprs = self._column_inputs_to_expressions(ids) @@ -2090,9 +2088,9 @@ def melt( variable_name: str = "variable", value_name: str = "value", ) -> "DataFrame": - """Alias for unpivot + """Alias for unpivot. - See also: + See Also: `unpivot` """ return self.unpivot(ids, values, variable_name, value_name) @@ -2100,19 +2098,18 @@ def melt( @DataframePublicAPI def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": """Apply a function that takes and returns a DataFrame. + Allow splitting your transformation into different units of work (functions) while preserving the syntax for chaining transformations. Example: >>> import daft - >>> df = daft.from_pydict({"col_a":[1,2,3,4]}) + >>> df = daft.from_pydict({"col_a": [1, 2, 3, 4]}) >>> def add_1(df): ... df = df.select(daft.col("col_a") + 1) ... return df - ... >>> def multiply_x(df, x): ... df = df.select(daft.col("col_a") * x) ... return df - ... >>> df = df.transform(add_1).transform(multiply_x, 4) >>> df.show() ╭───────╮ @@ -2135,6 +2132,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) func: A function that takes and returns a DataFrame. *args: Positional arguments to pass to func. **kwargs: Keyword arguments to pass to func. + Returns: DataFrame: Transformed DataFrame. """ @@ -2192,7 +2190,7 @@ def _map_groups(self, udf: Expression, group_by: Optional[ExpressionsProjection] @DataframePublicAPI def sum(self, *cols: ManyColumnsInputType) -> "DataFrame": - """Performs a global sum on the DataFrame + """Performs a global sum on the DataFrame. Args: *cols (Union[str, Expression]): columns to sum @@ -2203,7 +2201,7 @@ def sum(self, *cols: ManyColumnsInputType) -> "DataFrame": @DataframePublicAPI def mean(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global mean on the DataFrame + """Performs a global mean on the DataFrame. Args: *cols (Union[str, Expression]): columns to mean @@ -2214,11 +2212,11 @@ def mean(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def stddev(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global standard deviation on the DataFrame + """Performs a global standard deviation on the DataFrame. 
Example: >>> import daft - >>> df = daft.from_pydict({"col_a":[0,1,2]}) + >>> df = daft.from_pydict({"col_a": [0, 1, 2]}) >>> df = df.stddev("col_a") >>> df.show() ╭───────────────────╮ @@ -2241,7 +2239,7 @@ def stddev(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def min(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global min on the DataFrame + """Performs a global min on the DataFrame. Args: *cols (Union[str, Expression]): columns to min @@ -2252,7 +2250,7 @@ def min(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def max(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global max on the DataFrame + """Performs a global max on the DataFrame. Args: *cols (Union[str, Expression]): columns to max @@ -2264,6 +2262,7 @@ def max(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def any_value(self, *cols: ColumnInputType) -> "DataFrame": """Returns an arbitrary value on this DataFrame. + Values for each column are not guaranteed to be from the same row. Args: @@ -2275,7 +2274,7 @@ def any_value(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def count(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global count on the DataFrame + """Performs a global count on the DataFrame. If no columns are specified (i.e. in the case you call `df.count()`), or only the literal string "*", this functions very similarly to a COUNT(*) operation in SQL and will return a new dataframe with a @@ -2340,7 +2339,7 @@ def count(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def agg_list(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global list agg on the DataFrame + """Performs a global list agg on the DataFrame. Args: *cols (Union[str, Expression]): columns to form into a list @@ -2351,7 +2350,7 @@ def agg_list(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def agg_concat(self, *cols: ColumnInputType) -> "DataFrame": - """Performs a global list concatenation agg on the DataFrame + """Performs a global list concatenation agg on the DataFrame. Args: *cols (Union[str, Expression]): columns that are lists to concatenate @@ -2362,7 +2361,9 @@ def agg_concat(self, *cols: ColumnInputType) -> "DataFrame": @DataframePublicAPI def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": - """Perform aggregations on this DataFrame. Allows for mixed aggregations for multiple columns + """Perform aggregations on this DataFrame. + + Allows for mixed aggregations for multiple columns. Will return a single row that aggregated the entire DataFrame. For a full list of aggregation expressions, see :ref:`Aggregation Expressions ` @@ -2370,16 +2371,14 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": Example: >>> import daft >>> from daft import col - >>> df = daft.from_pydict({ - ... "student_id": [1, 2, 3, 4], - ... "test1": [0.5, 0.4, 0.6, 0.7], - ... "test2": [0.9, 0.8, 0.7, 1.0] - ... }) + >>> df = daft.from_pydict( + ... {"student_id": [1, 2, 3, 4], "test1": [0.5, 0.4, 0.6, 0.7], "test2": [0.9, 0.8, 0.7, 1.0]} + ... ) >>> agg_df = df.agg( ... col("test1").mean(), ... col("test2").mean(), - ... ((col("test1") + col("test2"))/2).min().alias("total_min"), - ... ((col("test1") + col("test2"))/2).max().alias("total_max"), + ... ((col("test1") + col("test2")) / 2).min().alias("total_min"), + ... ((col("test1") + col("test2")) / 2).max().alias("total_max"), ... 
) >>> agg_df.show() ╭─────────┬────────────────────┬────────────────────┬───────────╮ @@ -2401,7 +2400,7 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": to_agg_list = ( list(to_agg[0]) if (len(to_agg) == 1 and not isinstance(to_agg[0], Expression)) - else list(typing.cast(Tuple[Expression], to_agg)) + else list(typing.cast("Tuple[Expression]", to_agg)) ) for expr in to_agg_list: @@ -2412,21 +2411,19 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": @DataframePublicAPI def groupby(self, *group_by: ManyColumnsInputType) -> "GroupedDataFrame": - """Performs a GroupBy on the DataFrame for aggregation + """Performs a GroupBy on the DataFrame for aggregation. Example: >>> import daft >>> from daft import col - >>> df = daft.from_pydict({ - ... "pet": ["cat", "dog", "dog", "cat"], - ... "age": [1, 2, 3, 4], - ... "name": ["Alex", "Jordan", "Sam", "Riley"] - ... }) + >>> df = daft.from_pydict( + ... {"pet": ["cat", "dog", "dog", "cat"], "age": [1, 2, 3, 4], "name": ["Alex", "Jordan", "Sam", "Riley"]} + ... ) >>> grouped_df = df.groupby("pet").agg( ... col("age").min().alias("min_age"), ... col("age").max().alias("max_age"), ... col("pet").count().alias("count"), - ... col("name").any_value() + ... col("name").any_value(), ... ) >>> grouped_df.show() ╭──────┬─────────┬─────────┬────────┬────────╮ @@ -2550,7 +2547,7 @@ def _materialize_results(self) -> None: @DataframePublicAPI def collect(self, num_preview_rows: Optional[int] = 8) -> "DataFrame": - """Executes the entire DataFrame and materializes the results + """Executes the entire DataFrame and materializes the results. .. NOTE:: This call is **blocking** and will execute the DataFrame when called @@ -2572,7 +2569,7 @@ def collect(self, num_preview_rows: Optional[int] = 8) -> "DataFrame": return self def _construct_show_display(self, n: int) -> "DataFrameDisplay": - """Helper for .show() which will construct the underlying DataFrameDisplay object""" + """Helper for .show() which will construct the underlying DataFrameDisplay object.""" preview_partition = self._preview.preview_partition total_rows = self._preview.dataframe_num_rows @@ -2621,7 +2618,7 @@ def _construct_show_display(self, n: int) -> "DataFrameDisplay": @DataframePublicAPI def show(self, n: int = 8) -> None: - """Executes enough of the DataFrame in order to display the first ``n`` rows + """Executes enough of the DataFrame in order to display the first ``n`` rows. If IPython is installed, this will use IPython's `display` utility to pretty-print in a notebook/REPL environment. Otherwise, this will fall back onto a naive Python `print`. @@ -2643,13 +2640,13 @@ def show(self, n: int = 8) -> None: def __len__(self): """Returns the count of rows when dataframe is materialized. + If dataframe is not materialized yet, raises a runtime error. Returns: int: count of rows. """ - if self._result is not None: return len(self._result) @@ -2666,7 +2663,7 @@ def __contains__(self, col_name: str) -> bool: Example: >>> import daft >>> df = daft.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}) - >>> 'x' in df + >>> "x" in df True Args: @@ -2680,6 +2677,7 @@ def __contains__(self, col_name: str) -> bool: @DataframePublicAPI def to_pandas(self, coerce_temporal_nanoseconds: bool = False) -> "pandas.DataFrame": """Converts the current DataFrame to a `pandas DataFrame `__. + If results have not computed yet, collect will be called. 
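A minimal sketch of the collect-on-demand behaviour noted above for to_pandas (assumes pandas is installed; values are illustrative):

import daft

df = daft.from_pydict({"x": [1, 2, 3]})
pdf = df.to_pandas()  # DataFrame is not materialized yet, so this triggers a collect first
print(type(pdf), len(pdf))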
Args: @@ -2704,6 +2702,7 @@ def to_pandas(self, coerce_temporal_nanoseconds: bool = False) -> "pandas.DataFr @DataframePublicAPI def to_arrow(self) -> "pyarrow.Table": """Converts the current DataFrame to a `pyarrow Table `__. + If results have not computed yet, collect will be called. Returns: @@ -2737,6 +2736,7 @@ def to_pydict(self) -> Dict[str, List[Any]]: @DataframePublicAPI def to_pylist(self) -> List[Any]: """Converts the current Dataframe into a python list. + .. WARNING:: This is a convenience method over :meth:`DataFrame.iter_rows() `. Users should prefer using `.iter_rows()` directly instead for lower memory utilization if they are streaming rows out of a DataFrame and don't require full materialization of the Python list. @@ -2757,9 +2757,7 @@ def to_pylist(self) -> List[Any]: @DataframePublicAPI def to_torch_map_dataset(self) -> "torch.utils.data.Dataset": - """Convert the current DataFrame into a map-style - `Torch Dataset `__ - for use with PyTorch. + """Convert the current DataFrame into a map-style `Torch Dataset `__ for use with PyTorch. This method will materialize the entire DataFrame and block on completion. @@ -2779,9 +2777,7 @@ def to_torch_map_dataset(self) -> "torch.utils.data.Dataset": @DataframePublicAPI def to_torch_iter_dataset(self) -> "torch.utils.data.IterableDataset": - """Convert the current DataFrame into a - `Torch IterableDataset `__ - for use with PyTorch. + """Convert the current DataFrame into a `Torch IterableDataset `__ for use with PyTorch. Begins execution of the DataFrame if it is not yet executed. @@ -2804,7 +2800,7 @@ def to_torch_iter_dataset(self) -> "torch.utils.data.IterableDataset": @DataframePublicAPI def to_ray_dataset(self) -> "ray.data.dataset.DataSet": - """Converts the current DataFrame to a `Ray Dataset `__ which is useful for running distributed ML model training in Ray + """Converts the current DataFrame to a `Ray Dataset `__ which is useful for running distributed ML model training in Ray. .. NOTE:: This function can only work if Daft is running using the RayRunner @@ -2982,7 +2978,7 @@ def __post_init__(self): raise ExpressionTypeError(f"Cannot groupby on null type expression: {e}") def __getitem__(self, item: Union[slice, int, str, Iterable[Union[str, int]]]) -> Union[Expression, "DataFrame"]: - """Gets a column from the DataFrame as an Expression""" + """Gets a column from the DataFrame as an Expression.""" return self.df.__getitem__(item) def sum(self, *cols: ColumnInputType) -> "DataFrame": @@ -3012,7 +3008,7 @@ def stddev(self, *cols: ColumnInputType) -> "DataFrame": Example: >>> import daft - >>> df = daft.from_pydict({"keys": ["a", "a", "a", "b"], "col_a": [0,1,2,100]}) + >>> df = daft.from_pydict({"keys": ["a", "a", "a", "b"], "col_a": [0, 1, 2, 100]}) >>> df = df.groupby("keys").stddev() >>> df.show() ╭──────┬───────────────────╮ @@ -3059,6 +3055,7 @@ def max(self, *cols: ColumnInputType) -> "DataFrame": def any_value(self, *cols: ColumnInputType) -> "DataFrame": """Returns an arbitrary value on this GroupedDataFrame. + Values for each column are not guaranteed to be from the same row. Args: @@ -3101,16 +3098,14 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": Example: >>> import daft >>> from daft import col - >>> df = daft.from_pydict({ - ... "pet": ["cat", "dog", "dog", "cat"], - ... "age": [1, 2, 3, 4], - ... "name": ["Alex", "Jordan", "Sam", "Riley"] - ... }) + >>> df = daft.from_pydict( + ... 
{"pet": ["cat", "dog", "dog", "cat"], "age": [1, 2, 3, 4], "name": ["Alex", "Jordan", "Sam", "Riley"]} + ... ) >>> grouped_df = df.groupby("pet").agg( ... col("age").min().alias("min_age"), ... col("age").max().alias("max_age"), ... col("pet").count().alias("count"), - ... col("name").any_value() + ... col("name").any_value(), ... ) >>> grouped_df.show() ╭──────┬─────────┬─────────┬────────┬────────╮ @@ -3134,7 +3129,7 @@ def agg(self, *to_agg: Union[Expression, Iterable[Expression]]) -> "DataFrame": to_agg_list = ( list(to_agg[0]) if (len(to_agg) == 1 and not isinstance(to_agg[0], Expression)) - else list(typing.cast(Tuple[Expression], to_agg)) + else list(typing.cast("Tuple[Expression]", to_agg)) ) for expr in to_agg_list: diff --git a/daft/datatype.py b/daft/datatype.py index 3ce448d4c4..646dd231ec 100644 --- a/daft/datatype.py +++ b/daft/datatype.py @@ -73,7 +73,7 @@ def __str__(self) -> str: class DataType: - """A Daft DataType defines the type of all the values in an Expression or DataFrame column""" + """A Daft DataType defines the type of all the values in an Expression or DataFrame column.""" _dtype: PyDataType @@ -91,79 +91,79 @@ def _from_pydatatype(pydt: PyDataType) -> DataType: @classmethod def int8(cls) -> DataType: - """Create an 8-bit integer DataType""" + """Create an 8-bit integer DataType.""" return cls._from_pydatatype(PyDataType.int8()) @classmethod def int16(cls) -> DataType: - """Create an 16-bit integer DataType""" + """Create an 16-bit integer DataType.""" return cls._from_pydatatype(PyDataType.int16()) @classmethod def int32(cls) -> DataType: - """Create an 32-bit integer DataType""" + """Create an 32-bit integer DataType.""" return cls._from_pydatatype(PyDataType.int32()) @classmethod def int64(cls) -> DataType: - """Create an 64-bit integer DataType""" + """Create an 64-bit integer DataType.""" return cls._from_pydatatype(PyDataType.int64()) @classmethod def uint8(cls) -> DataType: - """Create an unsigned 8-bit integer DataType""" + """Create an unsigned 8-bit integer DataType.""" return cls._from_pydatatype(PyDataType.uint8()) @classmethod def uint16(cls) -> DataType: - """Create an unsigned 16-bit integer DataType""" + """Create an unsigned 16-bit integer DataType.""" return cls._from_pydatatype(PyDataType.uint16()) @classmethod def uint32(cls) -> DataType: - """Create an unsigned 32-bit integer DataType""" + """Create an unsigned 32-bit integer DataType.""" return cls._from_pydatatype(PyDataType.uint32()) @classmethod def uint64(cls) -> DataType: - """Create an unsigned 64-bit integer DataType""" + """Create an unsigned 64-bit integer DataType.""" return cls._from_pydatatype(PyDataType.uint64()) @classmethod def float32(cls) -> DataType: - """Create a 32-bit float DataType""" + """Create a 32-bit float DataType.""" return cls._from_pydatatype(PyDataType.float32()) @classmethod def float64(cls) -> DataType: - """Create a 64-bit float DataType""" + """Create a 64-bit float DataType.""" return cls._from_pydatatype(PyDataType.float64()) @classmethod def string(cls) -> DataType: - """Create a String DataType: A string of UTF8 characters""" + """Create a String DataType: A string of UTF8 characters.""" return cls._from_pydatatype(PyDataType.string()) @classmethod def bool(cls) -> DataType: - """Create the Boolean DataType: Either ``True`` or ``False``""" + """Create the Boolean DataType: Either ``True`` or ``False``.""" return cls._from_pydatatype(PyDataType.bool()) @classmethod def binary(cls) -> DataType: - """Create a Binary DataType: A string of bytes""" + 
"""Create a Binary DataType: A string of bytes.""" return cls._from_pydatatype(PyDataType.binary()) @classmethod def fixed_size_binary(cls, size: int) -> DataType: - """Create a FixedSizeBinary DataType: A fixed-size string of bytes""" + """Create a FixedSizeBinary DataType: A fixed-size string of bytes.""" if not isinstance(size, int) or size <= 0: raise ValueError("The size for a fixed-size binary must be a positive integer, but got: ", size) return cls._from_pydatatype(PyDataType.fixed_size_binary(size)) @classmethod def null(cls) -> DataType: - """Creates the Null DataType: Always the ``Null`` value""" + """Creates the Null DataType: Always the ``Null`` value.""" return cls._from_pydatatype(PyDataType.null()) @classmethod @@ -173,7 +173,7 @@ def decimal128(cls, precision: int, scale: int) -> DataType: @classmethod def date(cls) -> DataType: - """Create a Date DataType: A date with a year, month and day""" + """Create a Date DataType: A date with a year, month and day.""" return cls._from_pydatatype(PyDataType.date()) @classmethod @@ -204,7 +204,7 @@ def interval(cls) -> DataType: @classmethod def list(cls, dtype: DataType) -> DataType: - """Create a List DataType: Variable-length list, where each element in the list has type ``dtype`` + """Create a List DataType: Variable-length list, where each element in the list has type ``dtype``. Args: dtype: DataType of each element in the list @@ -213,8 +213,7 @@ def list(cls, dtype: DataType) -> DataType: @classmethod def fixed_size_list(cls, dtype: DataType, size: int) -> DataType: - """Create a FixedSizeList DataType: Fixed-size list, where each element in the list has type ``dtype`` - and each list has length ``size``. + """Create a FixedSizeList DataType: Fixed-size list, where each element in the list has type ``dtype`` and each list has length ``size``. Args: dtype: DataType of each element in the list @@ -227,6 +226,7 @@ def fixed_size_list(cls, dtype: DataType, size: int) -> DataType: @classmethod def map(cls, key_type: DataType, value_type: DataType) -> DataType: """Create a Map DataType: A map is a nested type of key-value pairs that is implemented as a list of structs with two fields, key and value. + Args: key_type: DataType of the keys in the map value_type: DataType of the values in the map @@ -235,7 +235,7 @@ def map(cls, key_type: DataType, value_type: DataType) -> DataType: @classmethod def struct(cls, fields: dict[str, DataType]) -> DataType: - """Create a Struct DataType: a nested type which has names mapped to child types + """Create a Struct DataType: a nested type which has names mapped to child types. Example: >>> DataType.struct({"name": DataType.string(), "age": DataType.int64()}) @@ -251,8 +251,7 @@ def extension(cls, name: str, storage_dtype: DataType, metadata: str | None = No @classmethod def embedding(cls, dtype: DataType, size: int) -> DataType: - """Create an Embedding DataType: embeddings are fixed size arrays, where each element - in the array has a **numeric** ``dtype`` and each array has a fixed length of ``size``. + """Create an Embedding DataType: embeddings are fixed size arrays, where each element in the array has a **numeric** ``dtype`` and each array has a fixed length of ``size``. Args: dtype: DataType of each element in the list (must be numeric) @@ -309,8 +308,7 @@ def tensor( dtype: DataType, shape: tuple[int, ...] | None = None, ) -> DataType: - """Create a tensor DataType: tensor arrays contain n-dimensional arrays of data of the provided ``dtype`` as elements, each of the provided - ``shape``. 
+ """Create a tensor DataType: tensor arrays contain n-dimensional arrays of data of the provided ``dtype`` as elements, each of the provided ``shape``. If a ``shape`` is given, each ndarray in the column will have this shape. @@ -333,8 +331,7 @@ def sparse_tensor( dtype: DataType, shape: tuple[int, ...] | None = None, ) -> DataType: - """Create a SparseTensor DataType: SparseTensor arrays implemented as 'COO Sparse Tensor' representation of n-dimensional arrays of data of the provided ``dtype`` as elements, each of the provided - ``shape``. + """Create a SparseTensor DataType: SparseTensor arrays implemented as 'COO Sparse Tensor' representation of n-dimensional arrays of data of the provided ``dtype`` as elements, each of the provided ``shape``. If a ``shape`` is given, each ndarray in the column will have this shape. @@ -353,7 +350,7 @@ def sparse_tensor( @classmethod def from_arrow_type(cls, arrow_type: pa.lib.DataType) -> DataType: - """Maps a PyArrow DataType to a Daft DataType""" + """Maps a PyArrow DataType to a Daft DataType.""" if pa.types.is_int8(arrow_type): return cls.int8() elif pa.types.is_int16(arrow_type): @@ -461,7 +458,7 @@ def from_arrow_type(cls, arrow_type: pa.lib.DataType) -> DataType: @classmethod def from_numpy_dtype(cls, np_type: np.dtype) -> DataType: - """Maps a Numpy datatype to a Daft DataType""" + """Maps a Numpy datatype to a Daft DataType.""" arrow_type = pa.from_numpy_dtype(np_type) return cls.from_arrow_type(arrow_type) @@ -470,7 +467,7 @@ def to_arrow_dtype(self) -> pa.DataType: @classmethod def python(cls) -> DataType: - """Create a Python DataType: a type which refers to an arbitrary Python object""" + """Create a Python DataType: a type which refers to an arbitrary Python object.""" return cls._from_pydatatype(PyDataType.python()) def _is_python_type(self) -> builtins.bool: diff --git a/daft/delta_lake/delta_lake_scan.py b/daft/delta_lake/delta_lake_scan.py index a287a90b8c..20dd2e93ae 100644 --- a/daft/delta_lake/delta_lake_scan.py +++ b/daft/delta_lake/delta_lake_scan.py @@ -137,7 +137,7 @@ def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]: add_actions: pa.RecordBatch = self._table.get_add_actions() if len(self.partitioning_keys()) > 0 and pushdowns.partition_filters is None: - logging.warning( + logger.warning( "%s has partitioning keys = %s, but no partition filter was specified. This will result in a full table scan.", self.display_name(), self.partitioning_keys(), diff --git a/daft/delta_lake/delta_lake_write.py b/daft/delta_lake/delta_lake_write.py index c4487e9d8a..e762c059cc 100644 --- a/daft/delta_lake/delta_lake_write.py +++ b/daft/delta_lake/delta_lake_write.py @@ -32,10 +32,7 @@ def sanitize_table_for_deltalake( def partitioned_table_to_deltalake_iter( partitioned: PartitionedTable, large_dtypes: bool ) -> Iterator[Tuple[pa.Table, str, Dict[str, Optional[str]]]]: - """ - Iterates over partitions, yielding each partition as an Arrow table, along with their respective paths and partition values. 
- """ - + """Iterates over partitions, yielding each partition as an Arrow table, along with their respective paths and partition values.""" partition_values = partitioned.partition_values() if partition_values: diff --git a/daft/exceptions.py b/daft/exceptions.py index 121b06938b..79d7d2ad38 100644 --- a/daft/exceptions.py +++ b/daft/exceptions.py @@ -3,19 +3,20 @@ class DaftCoreException(ValueError): - """DaftCore Base Exception""" + """DaftCore Base Exception.""" pass class DaftTypeError(DaftCoreException): - """Type Error that occurred in Daft Core""" + """Type Error that occurred in Daft Core.""" pass class DaftTransientError(DaftCoreException): - """Daft Transient Error + """Daft Transient Error. + This is typically raised when there is a network issue such as timeout or throttling. This can usually be retried. """ @@ -23,7 +24,8 @@ class DaftTransientError(DaftCoreException): class ConnectTimeoutError(DaftTransientError): - """Daft Connection Timeout Error + """Daft Connection Timeout Error. + Daft client was not able to make a connection to the server in the connect timeout time. """ @@ -31,7 +33,8 @@ class ConnectTimeoutError(DaftTransientError): class ReadTimeoutError(DaftTransientError): - """Daft Read Timeout Error + """Daft Read Timeout Error. + Daft client was not able to read bytes from server under the read timeout time. """ @@ -39,7 +42,8 @@ class ReadTimeoutError(DaftTransientError): class ByteStreamError(DaftTransientError): - """Daft Byte Stream Error + """Daft Byte Stream Error. + Daft client had an error while reading bytes in a stream from the server. """ @@ -47,7 +51,8 @@ class ByteStreamError(DaftTransientError): class SocketError(DaftTransientError): - """Daft Socket Error + """Daft Socket Error. + Daft client had a socket error while reading bytes in a stream from the server. """ @@ -55,7 +60,8 @@ class SocketError(DaftTransientError): class ThrottleError(DaftTransientError): - """Daft Throttle Error + """Daft Throttle Error. + Daft client had a throttle error while making request to server. """ @@ -63,7 +69,8 @@ class ThrottleError(DaftTransientError): class MiscTransientError(DaftTransientError): - """Daft Misc Transient Error + """Daft Misc Transient Error. + Daft client had a Misc Transient Error while making request to server. """ diff --git a/daft/execution/actor_pool_udf.py b/daft/execution/actor_pool_udf.py index a60e2da7ed..9ecadc3f91 100644 --- a/daft/execution/actor_pool_udf.py +++ b/daft/execution/actor_pool_udf.py @@ -16,8 +16,7 @@ def actor_event_loop(uninitialized_projection: ExpressionsProjection, conn: Connection) -> None: - """ - Event loop that runs in a actor process and receives MicroPartitions to evaluate with an initialized UDF projection. + """Event loop that runs in a actor process and receives MicroPartitions to evaluate with an initialized UDF projection. Terminates once it receives None. """ diff --git a/daft/execution/execution_step.py b/daft/execution/execution_step.py index b370d6c774..364e577547 100644 --- a/daft/execution/execution_step.py +++ b/daft/execution/execution_step.py @@ -234,7 +234,9 @@ def __repr__(self) -> str: @dataclass class MultiOutputPartitionTask(PartitionTask[PartitionT]): - """A PartitionTask that is ready to run. More instructions cannot be added. + """A PartitionTask that is ready to run. + + More instructions cannot be added. This PartitionTask will return a list of any number of partitions. 
""" @@ -570,10 +572,7 @@ def run_partial_metadata(self, input_metadatas: list[PartialPartitionMetadata]) def _prune_boundaries(boundaries: Boundaries, projection: ExpressionsProjection) -> Boundaries | None: - """ - If projection expression is a nontrivial computation (i.e. not a direct col() reference and not an alias) on top of a boundary - expression, then invalidate the boundary. - """ + """If projection expression is a nontrivial computation (i.e. not a direct col() reference and not an alias) on top of a boundary expression, then invalidate the boundary.""" proj_all_names = projection.to_name_set() proj_names_needing_compute = proj_all_names - projection.input_mapping().keys() for i, e in enumerate(boundaries.sort_by): @@ -951,7 +950,6 @@ def calculate_cross_join_stats( left_meta: PartialPartitionMetadata, right_meta: PartialPartitionMetadata ) -> tuple[int | None, int | None]: """Given the left and right partition metadata, returns the expected (num rows, size bytes) of the cross join output.""" - left_rows, left_bytes = left_meta.num_rows, left_meta.size_bytes right_rows, right_bytes = right_meta.num_rows, right_meta.size_bytes diff --git a/daft/execution/physical_plan.py b/daft/execution/physical_plan.py index f7084a7412..be3f6739c3 100644 --- a/daft/execution/physical_plan.py +++ b/daft/execution/physical_plan.py @@ -1,5 +1,5 @@ -""" -This file contains physical plan building blocks. +"""This file contains physical plan building blocks. + To get a physical plan for a logical plan, see physical_plan_factory.py. Conceptually, a physical plan decides what steps, and the order of steps, to run to build some target. @@ -100,7 +100,6 @@ def file_write( io_config: IOConfig | None, ) -> InProgressPhysicalPlan[PartitionT]: """Write the results of `child_plan` into files described by `write_info`.""" - yield from ( step.add_instruction( execution_step.WriteFile( @@ -128,7 +127,6 @@ def iceberg_write( io_config: IOConfig | None, ) -> InProgressPhysicalPlan[PartitionT]: """Write the results of `child_plan` into pyiceberg data files described by `write_info`.""" - yield from ( step.add_instruction( execution_step.WriteIceberg( @@ -155,7 +153,6 @@ def deltalake_write( io_config: IOConfig | None, ) -> InProgressPhysicalPlan[PartitionT]: """Write the results of `child_plan` into pyiceberg data files described by `write_info`.""" - yield from ( step.add_instruction( execution_step.WriteDeltaLake( @@ -180,7 +177,6 @@ def lance_write( kwargs: dict | None, ) -> InProgressPhysicalPlan[PartitionT]: """Write the results of `child_plan` into lance data files described by `write_info`.""" - yield from ( step.add_instruction( execution_step.WriteLance( @@ -202,7 +198,6 @@ def pipeline_instruction( resource_request: execution_step.ResourceRequest, ) -> InProgressPhysicalPlan[PartitionT]: """Apply an instruction to the results of `child_plan`.""" - yield from ( step.add_instruction(pipeable_instruction, resource_request) if isinstance(step, PartitionTaskBuilder) else step for step in child_plan @@ -327,7 +322,6 @@ def monotonically_increasing_id( child_plan: InProgressPhysicalPlan[PartitionT], column_name: str ) -> InProgressPhysicalPlan[PartitionT]: """Apply a monotonically_increasing_id instruction to the results of `child_plan`.""" - partition_counter = ( 0 # This counter gives each partition a monotonically increasing int to use as the leftmost 28 bits of the id ) @@ -350,7 +344,6 @@ def hash_join( how: JoinType, ) -> InProgressPhysicalPlan[PartitionT]: """Hash-based pairwise join the partitions from 
`left_child_plan` and `right_child_plan` together.""" - # Materialize the steps from the left and right sources to get partitions. # As the materializations complete, emit new steps to join each left and right partition. left_requests: deque[SingleOutputPartitionTask[PartitionT]] = deque() @@ -497,7 +490,6 @@ def broadcast_join( is_swapped: bool, ) -> InProgressPhysicalPlan[PartitionT]: """Broadcast join all partitions from the broadcaster child plan to each partition in the receiver child plan.""" - # Materialize the steps from the broadcaster and receiver sources to get partitions. # As the receiver-side materializations complete, emit new steps to join each broadcaster and receiver partition. stage_id = next(stage_id_counter) @@ -636,8 +628,7 @@ def cross_join( class MergeJoinTaskTracker(Generic[PartitionT]): - """ - Tracks merge-join tasks for each larger-side partition. + """Tracks merge-join tasks for each larger-side partition. Merge-join tasks are added to the tracker, and the tracker handles empty tasks, finalizing PartitionTaskBuilders, determining whether tasks are ready to be executed, checking whether tasks are done, and deciding whether a coalesce @@ -663,8 +654,7 @@ def __init__(self, stage_id: int): self._stage_id = stage_id def add_task(self, part_id: str, task: PartitionTaskBuilder[PartitionT]) -> None: - """ - Add a merge-join task to the tracker for the provided larger-side partition. + """Add a merge-join task to the tracker for the provided larger-side partition. This task needs to be unfinalized, i.e. a PartitionTaskBuilder. """ @@ -687,9 +677,7 @@ def add_task(self, part_id: str, task: PartitionTaskBuilder[PartitionT]) -> None self._finalized_tasks[part_id].append(task.finalize_partition_task_single_output(self._stage_id)) def finalize(self, part_id: str) -> None: - """ - Indicates to the tracker that we are done adding merge-join tasks for this partition. - """ + """Indicates to the tracker that we are done adding merge-join tasks for this partition.""" # All finalized tasks should have been yielded before the tracker.finalize() call. finalized_tasks = self._finalized_tasks.pop(part_id, deque()) assert len(finalized_tasks) == 0 @@ -699,8 +687,9 @@ def finalize(self, part_id: str) -> None: def yield_ready( self, part_id: str ) -> Iterator[SingleOutputPartitionTask[PartitionT] | PartitionTaskBuilder[PartitionT]]: - """ - Returns an iterator of all tasks for this partition that are ready for execution. Each merge-join task will be + """Returns an iterator of all tasks for this partition that are ready for execution. + + Each merge-join task will be yielded once, even across multiple calls. """ assert self._is_contained(part_id) @@ -716,8 +705,9 @@ def yield_ready( yield self._task_staging.pop(part_id) def pop_uncoalesced(self, part_id: str) -> deque[SingleOutputPartitionTask[PartitionT]] | None: - """ - Returns all tasks for this partition that need to be coalesced. If this partition only involved a single + """Returns all tasks for this partition that need to be coalesced. + + If this partition only involved a single merge-join task (i.e. we don't need to coalesce), this this function will return None. NOTE: tracker.finalize(part_id) must be called before this function. @@ -726,9 +716,7 @@ def pop_uncoalesced(self, part_id: str) -> deque[SingleOutputPartitionTask[Parti return self._uncoalesced_tasks.pop(part_id, None) def all_tasks_done_for_partition(self, part_id: str) -> bool: - """ - Return whether all merge-join tasks for this partition are done. 
- """ + """Return whether all merge-join tasks for this partition are done.""" assert self._is_contained(part_id) if part_id in self._task_staging: # Unfinalized tasks are trivially "done". @@ -741,9 +729,7 @@ def all_tasks_done_for_partition(self, part_id: str) -> bool: ) def all_tasks_done(self) -> bool: - """ - Return whether all merge-join tasks for all partitions are done. - """ + """Return whether all merge-join tasks for all partitions are done.""" return all( self.all_tasks_done_for_partition(part_id) for part_id in itertools.chain( @@ -752,9 +738,7 @@ def all_tasks_done(self) -> bool: ) def _is_contained(self, part_id: str) -> bool: - """ - Return whether the provided partition is being tracked by this tracker. - """ + """Return whether the provided partition is being tracked by this tracker.""" return part_id in self._task_staging or part_id in self._finalized_tasks or part_id in self._uncoalesced_tasks @@ -768,9 +752,7 @@ def _emit_merge_joins_on_window( right_on: ExpressionsProjection, how: JoinType, ) -> Iterator[PartitionTaskBuilder[PartitionT] | PartitionTask[PartitionT]]: - """ - Emits merge-join steps of next_part with each partition in other_window. - """ + """Emits merge-join steps of next_part with each partition in other_window.""" # Emit a merge-join step for all partitions in the other window that intersect with this new partition. for other_next_part in other_window: memory_bytes = _memory_bytes_for_merge(next_part, other_next_part) @@ -829,13 +811,11 @@ def merge_join_sorted( how: JoinType, left_is_larger: bool, ) -> InProgressPhysicalPlan[PartitionT]: - """ - Merge the sorted partitions from `left_plan` and `right_plan` together. + """Merge the sorted partitions from `left_plan` and `right_plan` together. This assumes that `left_plan` and `right_plan` are both sorted on the join key(s), although with potentially different range partitionings (partition boundaries). """ - # Large vs. smaller side of join. larger_plan = left_plan if left_is_larger else right_plan smaller_plan = right_plan if left_is_larger else left_plan @@ -1025,10 +1005,7 @@ def merge_join_sorted( def _is_strictly_bounded_above_by( lower_part: SingleOutputPartitionTask[PartitionT], upper_part: SingleOutputPartitionTask[PartitionT] ) -> bool: - """ - Returns whether lower_part is strictly bounded above by upper part; i.e., whether lower_part's upper bound is - strictly less than upper_part's upper bound. - """ + """Returns whether lower_part is strictly bounded above by upper part.""" lower_boundaries = lower_part.partition_metadata().boundaries upper_boundaries = upper_part.partition_metadata().boundaries assert lower_boundaries is not None and upper_boundaries is not None @@ -1063,8 +1040,7 @@ def sort_merge_join_aligned_boundaries( num_partitions: int, left_is_larger: bool, ) -> InProgressPhysicalPlan[PartitionT]: - """ - Sort-merge join the partitions from `left_plan` and `right_plan` together. + """Sort-merge join the partitions from `left_plan` and `right_plan` together. This assumes that both `left_plan` and `right_plan` need to be sorted, and will be sorted using the same partitioning boundaries. 
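The one-line summary of _is_strictly_bounded_above_by above leaves the comparison implicit; the removed lines spell it out as lower_part's upper bound being strictly less than upper_part's upper bound. A standalone sketch of that comparison, with plain tuples standing in for Daft's partition boundary metadata (the helper name and values are illustrative):

def is_strictly_bounded_above_by(lower_upper_bound, upper_upper_bound) -> bool:
    # True when the lower partition's largest sort key is strictly less than
    # the other partition's largest sort key.
    return lower_upper_bound < upper_upper_bound

assert is_strictly_bounded_above_by((3, "c"), (5, "a"))
assert not is_strictly_bounded_above_by((5, "a"), (5, "a"))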
@@ -1261,7 +1237,7 @@ def _to_per_partition_bounds(boundaries: MicroPartition, num_partitions: int) -> def concat( top_plan: InProgressPhysicalPlan[PartitionT], bottom_plan: InProgressPhysicalPlan[PartitionT] ) -> InProgressPhysicalPlan[PartitionT]: - """Vertical concat of the partitions in `top_plan` and `bottom_plan`""" + """Vertical concat of the partitions in `top_plan` and `bottom_plan`.""" # Yield steps in order from the top_plan to bottom_plan yield from top_plan yield from bottom_plan @@ -1297,7 +1273,6 @@ def global_limit( num_partitions: int, ) -> InProgressPhysicalPlan[PartitionT]: """Return the first n rows from the `child_plan`.""" - remaining_rows = limit_rows assert remaining_rows >= 0, f"Invalid value for limit: {remaining_rows}" remaining_partitions = num_partitions @@ -1395,7 +1370,6 @@ def global_limit( def flatten_plan(child_plan: InProgressPhysicalPlan[PartitionT]) -> InProgressPhysicalPlan[PartitionT]: """Wrap a plan that emits multi-output tasks to a plan that emits single-output tasks.""" - materializations: deque[MultiOutputPartitionTask[PartitionT]] = deque() stage_id = next(stage_id_counter) while True: @@ -1463,7 +1437,6 @@ def coalesce( The current implementation only does partition merging, no rebalancing. """ - assert ( to_num_partitions <= from_num_partitions ), f"Cannot coalesce upwards from {from_num_partitions} to {to_num_partitions} partitions." @@ -1539,7 +1512,6 @@ def reduce( Then, the reduce instruction is applied to each `i`th slice across the child lists. """ - materializations = list() stage_id = next(stage_id_counter) @@ -1585,7 +1557,6 @@ def sort( num_partitions: int, ) -> InProgressPhysicalPlan[PartitionT]: """Sort the result of `child_plan` according to `sort_info`.""" - # First, materialize the child plan. source_materializations: deque[SingleOutputPartitionTask[PartitionT]] = deque() stage_id_children = next(stage_id_counter) @@ -1683,7 +1654,7 @@ def sort( def fanout_random(child_plan: InProgressPhysicalPlan[PartitionT], num_partitions: int): - """Splits the results of `child_plan` randomly into a list of `node.num_partitions()` number of partitions""" + """Splits the results of `child_plan` randomly into a list of `node.num_partitions()` number of partitions.""" seed = 0 for step in child_plan: if isinstance(step, PartitionTaskBuilder): @@ -1696,7 +1667,7 @@ def fanout_random(child_plan: InProgressPhysicalPlan[PartitionT], num_partitions def _best_effort_next_step( stage_id: int, child_plan: InProgressPhysicalPlan[PartitionT] ) -> tuple[PartitionTask[PartitionT] | None, bool]: - """Performs a best-effort attempt at retrieving the next step from a child plan + """Performs a best-effort attempt at retrieving the next step from a child plan. Returns None in cases where there is nothing to run, or the plan has been exhausted. 
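The concat building block above is plain generator chaining; a minimal standalone sketch of the same pattern, with integers standing in for partition tasks (the function name below is illustrative, not a Daft API):

from typing import Iterator

def concat_plans(top_plan: Iterator[int], bottom_plan: Iterator[int]) -> Iterator[int]:
    # Yield every step from the top plan first, then every step from the bottom plan.
    yield from top_plan
    yield from bottom_plan

print(list(concat_plans(iter([1, 2]), iter([3, 4]))))  # [1, 2, 3, 4]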
diff --git a/daft/execution/rust_physical_plan_shim.py b/daft/execution/rust_physical_plan_shim.py index a304574894..cd316ae431 100644 --- a/daft/execution/rust_physical_plan_shim.py +++ b/daft/execution/rust_physical_plan_shim.py @@ -52,7 +52,7 @@ def scan_with_tasks( def empty_scan( schema: Schema, ) -> physical_plan.InProgressPhysicalPlan[PartitionT]: - """yield a plan to create an empty Partition""" + """Yield a plan to create an empty Partition.""" scan_step = execution_step.PartitionTaskBuilder[PartitionT]( inputs=[], partial_metadatas=None, diff --git a/daft/execution/shuffles/pre_shuffle_merge.py b/daft/execution/shuffles/pre_shuffle_merge.py index 1ecaaeef33..b0580a3c86 100644 --- a/daft/execution/shuffles/pre_shuffle_merge.py +++ b/daft/execution/shuffles/pre_shuffle_merge.py @@ -19,8 +19,7 @@ def pre_shuffle_merge( map_plan: InProgressPhysicalPlan[PartitionT], pre_shuffle_merge_threshold: int, ) -> InProgressPhysicalPlan[PartitionT]: - """ - Merges intermediate partitions from the map_plan based on memory constraints. + """Merges intermediate partitions from the map_plan based on memory constraints. The function processes incoming map tasks and merges their outputs when: 1. There are multiple materialized maps available @@ -32,7 +31,6 @@ def pre_shuffle_merge( Yields: Merged partition tasks or processed child steps """ - NUM_MAPS_THRESHOLD = 4 stage_id = next(stage_id_counter) diff --git a/daft/expressions/__init__.py b/daft/expressions/__init__.py index bc28ed1925..93eb173b90 100644 --- a/daft/expressions/__init__.py +++ b/daft/expressions/__init__.py @@ -2,4 +2,4 @@ from .expressions import Expression, ExpressionsProjection, col, lit, interval, coalesce -__all__ = ["Expression", "ExpressionsProjection", "col", "lit", "interval", "coalesce"] +__all__ = ["Expression", "ExpressionsProjection", "coalesce", "col", "interval", "lit"] diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index 45fd329250..1dfd4730a7 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -70,7 +70,7 @@ def __get__( # type: ignore[override] def lit(value: object) -> Expression: - """Creates an Expression representing a column with every value set to the provided value + """Creates an Expression representing a column with every value set to the provided value. Example: >>> import daft @@ -174,10 +174,7 @@ def interval( millis: int | None = None, nanos: int | None = None, ) -> Expression: - """ - Creates an Expression representing an interval. 
- - """ + """Creates an Expression representing an interval.""" lit_value = native.interval_lit( years=years, months=months, days=days, hours=hours, minutes=minutes, seconds=seconds, millis=millis, nanos=nanos ) @@ -223,57 +220,57 @@ def __init__(self) -> None: @property def str(self) -> ExpressionStringNamespace: - """Access methods that work on columns of strings""" + """Access methods that work on columns of strings.""" return ExpressionStringNamespace.from_expression(self) @property def dt(self) -> ExpressionDatetimeNamespace: - """Access methods that work on columns of datetimes""" + """Access methods that work on columns of datetimes.""" return ExpressionDatetimeNamespace.from_expression(self) @property def embedding(self) -> ExpressionEmbeddingNamespace: - """Access methods that work on columns of embeddings""" + """Access methods that work on columns of embeddings.""" return ExpressionEmbeddingNamespace.from_expression(self) @property def float(self) -> ExpressionFloatNamespace: - """Access methods that work on columns of floats""" + """Access methods that work on columns of floats.""" return ExpressionFloatNamespace.from_expression(self) @property def url(self) -> ExpressionUrlNamespace: - """Access methods that work on columns of URLs""" + """Access methods that work on columns of URLs.""" return ExpressionUrlNamespace.from_expression(self) @property def list(self) -> ExpressionListNamespace: - """Access methods that work on columns of lists""" + """Access methods that work on columns of lists.""" return ExpressionListNamespace.from_expression(self) @property def struct(self) -> ExpressionStructNamespace: - """Access methods that work on columns of structs""" + """Access methods that work on columns of structs.""" return ExpressionStructNamespace.from_expression(self) @property def map(self) -> ExpressionMapNamespace: - """Access methods that work on columns of maps""" + """Access methods that work on columns of maps.""" return ExpressionMapNamespace.from_expression(self) @property def image(self) -> ExpressionImageNamespace: - """Access methods that work on columns of images""" + """Access methods that work on columns of images.""" return ExpressionImageNamespace.from_expression(self) @property def partitioning(self) -> ExpressionPartitioningNamespace: - """Access methods that support partitioning operators""" + """Access methods that support partitioning operators.""" return ExpressionPartitioningNamespace.from_expression(self) @property def json(self) -> ExpressionJsonNamespace: - """Access methods that work on columns of json""" + """Access methods that work on columns of json.""" return ExpressionJsonNamespace.from_expression(self) @staticmethod @@ -323,7 +320,7 @@ def to_struct(*inputs: Expression | builtins.str) -> Expression: >>> import daft >>> from daft import col >>> df = daft.from_pydict({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - >>> df.select(daft.to_struct(col("a")*2, col("b"))).show() + >>> df.select(daft.to_struct(col("a") * 2, col("b"))).show() ╭───────────────────────────╮ │ struct │ │ --- │ @@ -367,15 +364,15 @@ def __bool__(self) -> bool: ) def __abs__(self) -> Expression: - """Absolute of a numeric expression (``abs(expr)``)""" + """Absolute of a numeric expression.""" return self.abs() def abs(self) -> Expression: - """Absolute of a numeric expression (``expr.abs()``)""" + """Absolute of a numeric expression.""" return Expression._from_pyexpr(native.abs(self._expr)) def __add__(self, other: object) -> Expression: - """Adds two numeric expressions or 
concatenates two string expressions (``e1 + e2``)""" + """Adds two numeric expressions or concatenates two string expressions (``e1 + e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr + expr._expr) @@ -384,7 +381,7 @@ def __radd__(self, other: object) -> Expression: return Expression._from_pyexpr(expr._expr + self._expr) def __sub__(self, other: object) -> Expression: - """Subtracts two numeric expressions (``e1 - e2``)""" + """Subtracts two numeric expressions (``e1 - e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr - expr._expr) @@ -393,7 +390,7 @@ def __rsub__(self, other: object) -> Expression: return Expression._from_pyexpr(expr._expr - self._expr) def __mul__(self, other: object) -> Expression: - """Multiplies two numeric expressions (``e1 * e2``)""" + """Multiplies two numeric expressions (``e1 * e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr * expr._expr) @@ -402,7 +399,7 @@ def __rmul__(self, other: object) -> Expression: return Expression._from_pyexpr(expr._expr * self._expr) def __truediv__(self, other: object) -> Expression: - """True divides two numeric expressions (``e1 / e2``)""" + """True divides two numeric expressions (``e1 / e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr / expr._expr) @@ -411,72 +408,73 @@ def __rtruediv__(self, other: object) -> Expression: return Expression._from_pyexpr(expr._expr / self._expr) def __mod__(self, other: Expression) -> Expression: - """Takes the mod of two numeric expressions (``e1 % e2``)""" + """Takes the mod of two numeric expressions (``e1 % e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr % expr._expr) def __rmod__(self, other: Expression) -> Expression: - """Takes the mod of two numeric expressions (``e1 % e2``)""" + """Takes the mod of two numeric expressions (``e1 % e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(expr._expr % self._expr) def __and__(self, other: Expression) -> Expression: - """Takes the logical AND of two boolean expressions, or bitwise AND of two integer expressions (``e1 & e2``)""" + """Takes the logical AND of two boolean expressions, or bitwise AND of two integer expressions (``e1 & e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr & expr._expr) def __rand__(self, other: Expression) -> Expression: - """Takes the logical reverse AND of two boolean expressions (``e1 & e2``)""" + """Takes the logical reverse AND of two boolean expressions (``e1 & e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(expr._expr & self._expr) def __or__(self, other: Expression) -> Expression: - """Takes the logical OR of two boolean or integer expressions, or bitwise OR of two integer expressions (``e1 | e2``)""" + """Takes the logical OR of two boolean or integer expressions, or bitwise OR of two integer expressions (``e1 | e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr | expr._expr) def __xor__(self, other: Expression) -> Expression: - """Takes the logical XOR of two boolean or integer expressions, or bitwise XOR of two integer expressions (``e1 ^ e2``)""" + """Takes the logical XOR of two boolean or integer expressions, or bitwise XOR of two integer expressions (``e1 ^ e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr ^ 
expr._expr) def __ror__(self, other: Expression) -> Expression: - """Takes the logical reverse OR of two boolean expressions (``e1 | e2``)""" + """Takes the logical reverse OR of two boolean expressions (``e1 | e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(expr._expr | self._expr) def __lt__(self, other: Expression) -> Expression: - """Compares if an expression is less than another (``e1 < e2``)""" + """Compares if an expression is less than another (``e1 < e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr < expr._expr) def __le__(self, other: Expression) -> Expression: - """Compares if an expression is less than or equal to another (``e1 <= e2``)""" + """Compares if an expression is less than or equal to another (``e1 <= e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr <= expr._expr) def __eq__(self, other: Expression) -> Expression: # type: ignore - """Compares if an expression is equal to another (``e1 == e2``)""" + """Compares if an expression is equal to another (``e1 == e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr == expr._expr) def __ne__(self, other: Expression) -> Expression: # type: ignore - """Compares if an expression is not equal to another (``e1 != e2``)""" + """Compares if an expression is not equal to another (``e1 != e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr != expr._expr) def __gt__(self, other: Expression) -> Expression: - """Compares if an expression is greater than another (``e1 > e2``)""" + """Compares if an expression is greater than another (``e1 > e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr > expr._expr) def __ge__(self, other: Expression) -> Expression: - """Compares if an expression is greater than or equal to another (``e1 >= e2``)""" + """Compares if an expression is greater than or equal to another (``e1 >= e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr >= expr._expr) def __lshift__(self, other: Expression) -> Expression: - """Shifts the bits of an integer expression to the left (``e1 << e2``) + """Shifts the bits of an integer expression to the left (``e1 << e2``). + Args: other: The number of bits to shift the expression to the left """ @@ -484,7 +482,7 @@ def __lshift__(self, other: Expression) -> Expression: return Expression._from_pyexpr(self._expr << expr._expr) def __rshift__(self, other: Expression) -> Expression: - """Shifts the bits of an integer expression to the right (``e1 >> e2``) + """Shifts the bits of an integer expression to the right (``e1 >> e2``). .. 
NOTE:: @@ -498,23 +496,22 @@ def __rshift__(self, other: Expression) -> Expression: return Expression._from_pyexpr(self._expr >> expr._expr) def __invert__(self) -> Expression: - """Inverts a boolean expression (``~e``)""" + """Inverts a boolean expression (``~e``).""" expr = self._expr.__invert__() return Expression._from_pyexpr(expr) def __floordiv__(self, other: Expression) -> Expression: - """Floor divides two numeric expressions (``e1 / e2``)""" + """Floor divides two numeric expressions (``e1 / e2``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr // expr._expr) def __rfloordiv__(self, other: object) -> Expression: - """Reverse floor divides two numeric expressions (``e2 / e1``)""" + """Reverse floor divides two numeric expressions (``e2 / e1``).""" expr = Expression._to_expression(other) return Expression._from_pyexpr(expr._expr // self._expr) def alias(self, name: builtins.str) -> Expression: - """Gives the expression a new name, which is its column's name in the DataFrame schema and the name - by which subsequent expressions can refer to the results of this expression. + """Gives the expression a new name. Example: >>> import daft @@ -630,17 +627,17 @@ def cast(self, dtype: DataType) -> Expression: return Expression._from_pyexpr(expr) def ceil(self) -> Expression: - """The ceiling of a numeric expression (``expr.ceil()``)""" + """The ceiling of a numeric expression.""" expr = native.ceil(self._expr) return Expression._from_pyexpr(expr) def floor(self) -> Expression: - """The floor of a numeric expression (``expr.floor()``)""" + """The floor of a numeric expression.""" expr = native.floor(self._expr) return Expression._from_pyexpr(expr) def clip(self, min: Expression | None = None, max: Expression | None = None) -> Expression: - """Clips an expression to the given minimum and maximum values (``expr.clip(min, max)``). + """Clips an expression to the given minimum and maximum values. Args: min: Minimum value to clip to. If None (or column value is Null), no lower clipping is applied. @@ -652,12 +649,12 @@ def clip(self, min: Expression | None = None, max: Expression | None = None) -> return Expression._from_pyexpr(native.clip(self._expr, min_expr._expr, max_expr._expr)) def sign(self) -> Expression: - """The sign of a numeric expression (``expr.sign()``)""" + """The sign of a numeric expression.""" expr = native.sign(self._expr) return Expression._from_pyexpr(expr) def round(self, decimals: int = 0) -> Expression: - """The round of a numeric expression (``expr.round(decimals = 0)``) + """The round of a numeric expression. Args: decimals: number of decimal places to round to. Defaults to 0. 
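A short usage sketch combining the numeric expression helpers touched above in a single select (the column name and values are illustrative):

import daft
from daft import col

df = daft.from_pydict({"v": [-1.4, 0.0, 2.7, 9.9]})
df.select(
    col("v").clip(0, 5).alias("clipped"),  # clamp into [0, 5]; a None bound skips that side
    col("v").round(1).alias("rounded"),  # round to 1 decimal place
    col("v").sign().alias("sign"),
).show()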
@@ -667,96 +664,98 @@ def round(self, decimals: int = 0) -> Expression: return Expression._from_pyexpr(expr) def sqrt(self) -> Expression: - """The square root of a numeric expression (``expr.sqrt()``)""" + """The square root of a numeric expression.""" expr = native.sqrt(self._expr) return Expression._from_pyexpr(expr) def cbrt(self) -> Expression: - """The cube root of a numeric expression (``expr.cbrt()``)""" + """The cube root of a numeric expression.""" return Expression._from_pyexpr(native.cbrt(self._expr)) def sin(self) -> Expression: - """The elementwise sine of a numeric expression (``expr.sin()``)""" + """The elementwise sine of a numeric expression.""" expr = native.sin(self._expr) return Expression._from_pyexpr(expr) def cos(self) -> Expression: - """The elementwise cosine of a numeric expression (``expr.cos()``)""" + """The elementwise cosine of a numeric expression.""" expr = native.cos(self._expr) return Expression._from_pyexpr(expr) def tan(self) -> Expression: - """The elementwise tangent of a numeric expression (``expr.tan()``)""" + """The elementwise tangent of a numeric expression.""" expr = native.tan(self._expr) return Expression._from_pyexpr(expr) def cot(self) -> Expression: - """The elementwise cotangent of a numeric expression (``expr.cot()``)""" + """The elementwise cotangent of a numeric expression.""" expr = native.cot(self._expr) return Expression._from_pyexpr(expr) def arcsin(self) -> Expression: - """The elementwise arc sine of a numeric expression (``expr.arcsin()``)""" + """The elementwise arc sine of a numeric expression.""" expr = native.arcsin(self._expr) return Expression._from_pyexpr(expr) def arccos(self) -> Expression: - """The elementwise arc cosine of a numeric expression (``expr.arccos()``)""" + """The elementwise arc cosine of a numeric expression.""" expr = native.arccos(self._expr) return Expression._from_pyexpr(expr) def arctan(self) -> Expression: - """The elementwise arc tangent of a numeric expression (``expr.arctan()``)""" + """The elementwise arc tangent of a numeric expression.""" expr = native.arctan(self._expr) return Expression._from_pyexpr(expr) def arctan2(self, other: Expression) -> Expression: - """Calculates the four quadrant arctangent of coordinates (y, x), in radians (``expr_y.arctan2(expr_x)``) + """Calculates the four quadrant arctangent of coordinates (y, x), in radians. 
* ``x = 0``, ``y = 0``: ``0`` * ``x >= 0``: ``[-pi/2, pi/2]`` * ``y >= 0``: ``(pi/2, pi]`` - * ``y < 0``: ``(-pi, -pi/2)``""" + * ``y < 0``: ``(-pi, -pi/2)`` + """ expr = Expression._to_expression(other) return Expression._from_pyexpr(native.arctan2(self._expr, expr._expr)) def arctanh(self) -> Expression: - """The elementwise inverse hyperbolic tangent of a numeric expression (``expr.arctanh()``)""" + """The elementwise inverse hyperbolic tangent of a numeric expression.""" expr = native.arctanh(self._expr) return Expression._from_pyexpr(expr) def arccosh(self) -> Expression: - """The elementwise inverse hyperbolic cosine of a numeric expression (``expr.arccosh()``)""" + """The elementwise inverse hyperbolic cosine of a numeric expression.""" expr = native.arccosh(self._expr) return Expression._from_pyexpr(expr) def arcsinh(self) -> Expression: - """The elementwise inverse hyperbolic sine of a numeric expression (``expr.arcsinh()``)""" + """The elementwise inverse hyperbolic sine of a numeric expression.""" expr = native.arcsinh(self._expr) return Expression._from_pyexpr(expr) def radians(self) -> Expression: - """The elementwise radians of a numeric expression (``expr.radians()``)""" + """The elementwise radians of a numeric expression.""" expr = native.radians(self._expr) return Expression._from_pyexpr(expr) def degrees(self) -> Expression: - """The elementwise degrees of a numeric expression (``expr.degrees()``)""" + """The elementwise degrees of a numeric expression.""" expr = native.degrees(self._expr) return Expression._from_pyexpr(expr) def log2(self) -> Expression: - """The elementwise log base 2 of a numeric expression (``expr.log2()``)""" + """The elementwise log base 2 of a numeric expression.""" expr = native.log2(self._expr) return Expression._from_pyexpr(expr) def log10(self) -> Expression: - """The elementwise log base 10 of a numeric expression (``expr.log10()``)""" + """The elementwise log base 10 of a numeric expression.""" expr = native.log10(self._expr) return Expression._from_pyexpr(expr) def log(self, base: float = math.e) -> Expression: # type: ignore - """The elementwise log with given base, of a numeric expression (``expr.log(base = math.e)``) + """The elementwise log with given base, of a numeric expression. + Args: base: The base of the logarithm. Defaults to e. 
""" @@ -765,32 +764,33 @@ def log(self, base: float = math.e) -> Expression: # type: ignore return Expression._from_pyexpr(expr) def ln(self) -> Expression: - """The elementwise natural log of a numeric expression (``expr.ln()``)""" + """The elementwise natural log of a numeric expression.""" expr = native.ln(self._expr) return Expression._from_pyexpr(expr) def exp(self) -> Expression: - """The e^self of a numeric expression (``expr.exp()``)""" + """The e^self of a numeric expression.""" expr = native.exp(self._expr) return Expression._from_pyexpr(expr) def bitwise_and(self, other: Expression) -> Expression: - """Bitwise AND of two integer expressions (``expr.bitwise_and(other)``)""" + """Bitwise AND of two integer expressions.""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr & expr._expr) def bitwise_or(self, other: Expression) -> Expression: - """Bitwise OR of two integer expressions (``expr.bitwise_or(other)``)""" + """Bitwise OR of two integer expressions.""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr | expr._expr) def bitwise_xor(self, other: Expression) -> Expression: - """Bitwise XOR of two integer expressions (``expr.bitwise_xor(other)``)""" + """Bitwise XOR of two integer expressions.""" expr = Expression._to_expression(other) return Expression._from_pyexpr(self._expr ^ expr._expr) def shift_left(self, other: Expression) -> Expression: - """Shifts the bits of an integer expression to the left (``expr << other``) + """Shifts the bits of an integer expression to the left (``expr << other``). + Args: other: The number of bits to shift the expression to the left """ @@ -798,7 +798,7 @@ def shift_left(self, other: Expression) -> Expression: return Expression._from_pyexpr(self._expr << expr._expr) def shift_right(self, other: Expression) -> Expression: - """Shifts the bits of an integer expression to the right (``expr >> other``) + """Shifts the bits of an integer expression to the right (``expr >> other``). .. NOTE:: For unsigned integers, this expression perform a logical right shift. @@ -826,13 +826,12 @@ def count_distinct(self) -> Expression: return Expression._from_pyexpr(expr) def sum(self) -> Expression: - """Calculates the sum of the values in the expression""" + """Calculates the sum of the values in the expression.""" expr = self._expr.sum() return Expression._from_pyexpr(expr) def approx_count_distinct(self) -> Expression: - """ - Calculates the approximate number of non-`NULL` unique values in the expression. + """Calculates the approximate number of non-`NULL` unique values in the expression. Approximation is performed using the `HyperLogLog `_ algorithm. @@ -859,7 +858,7 @@ def approx_count_distinct(self) -> Expression: return Expression._from_pyexpr(expr) def approx_percentiles(self, percentiles: builtins.float | builtins.list[builtins.float]) -> Expression: - """Calculates the approximate percentile(s) for a column of numeric values + """Calculates the approximate percentile(s) for a column of numeric values. For numeric columns, we use the `sketches_ddsketch crate `_. This is a Rust implementation of the paper `DDSketch: A Fast and Fully-Mergeable Quantile Sketch with Relative-Error Guarantees (Masson et al.) 
`_ @@ -891,7 +890,7 @@ def approx_percentiles(self, percentiles: builtins.float | builtins.list[builtin A grouped calculation of approximate percentiles: - >>> df = daft.from_pydict({"class": ["a", "a", "a", "b", "c"], "scores": [1, 2, 3, 1, None]}) + >>> df = daft.from_pydict({"class": ["a", "a", "a", "b", "c"], "scores": [1, 2, 3, 1, None]}) >>> df = df.groupby("class").agg( ... df["scores"].approx_percentiles(0.5).alias("approx_median_score"), ... df["scores"].approx_percentiles([0.25, 0.5, 0.75]).alias("approx_percentiles_scores"), @@ -922,27 +921,27 @@ def approx_percentiles(self, percentiles: builtins.float | builtins.list[builtin return Expression._from_pyexpr(expr) def mean(self) -> Expression: - """Calculates the mean of the values in the expression""" + """Calculates the mean of the values in the expression.""" expr = self._expr.mean() return Expression._from_pyexpr(expr) def stddev(self) -> Expression: - """Calculates the standard deviation of the values in the expression""" + """Calculates the standard deviation of the values in the expression.""" expr = self._expr.stddev() return Expression._from_pyexpr(expr) def min(self) -> Expression: - """Calculates the minimum value in the expression""" + """Calculates the minimum value in the expression.""" expr = self._expr.min() return Expression._from_pyexpr(expr) def max(self) -> Expression: - """Calculates the maximum value in the expression""" + """Calculates the maximum value in the expression.""" expr = self._expr.max() return Expression._from_pyexpr(expr) def any_value(self, ignore_nulls=False) -> Expression: - """Returns any value in the expression + """Returns any value in the expression. Args: ignore_nulls: whether to ignore null values when selecting the value. Defaults to False. @@ -951,12 +950,12 @@ def any_value(self, ignore_nulls=False) -> Expression: return Expression._from_pyexpr(expr) def agg_list(self) -> Expression: - """Aggregates the values in the expression into a list""" + """Aggregates the values in the expression into a list.""" expr = self._expr.agg_list() return Expression._from_pyexpr(expr) def agg_concat(self) -> Expression: - """Aggregates the values in the expression into a single string by concatenating them""" + """Aggregates the values in the expression into a single string by concatenating them.""" expr = self._expr.agg_concat() return Expression._from_pyexpr(expr) @@ -965,12 +964,15 @@ def _explode(self) -> Expression: return Expression._from_pyexpr(expr) def if_else(self, if_true: Expression, if_false: Expression) -> Expression: - """Conditionally choose values between two expressions using the current boolean expression as a condition + """Conditionally choose values between two expressions using the current boolean expression as a condition. Example: >>> import daft >>> df = daft.from_pydict({"A": [1, 2, 3], "B": [0, 2, 4]}) - >>> df = df.with_column("A_if_bigger_else_B", (df["A"] > df["B"]).if_else(df["A"], df["B"]),) + >>> df = df.with_column( + ... "A_if_bigger_else_B", + ... (df["A"] > df["B"]).if_else(df["A"], df["B"]), + ... ) >>> df.collect() ╭───────┬───────┬────────────────────╮ │ A ┆ B ┆ A_if_bigger_else_B │ @@ -998,7 +1000,7 @@ def if_else(self, if_true: Expression, if_false: Expression) -> Expression: return Expression._from_pyexpr(self._expr.if_else(if_true._expr, if_false._expr)) def apply(self, func: Callable, return_dtype: DataType) -> Expression: - """Apply a function on each value in a given expression + """Apply a function on each value in a given expression. .. 
NOTE:: This is just syntactic sugar on top of a UDF and is convenient to use when your function only operates @@ -1013,7 +1015,7 @@ def apply(self, func: Callable, return_dtype: DataType) -> Expression: ... return int(x_val) ... else: ... return 0 - >>> df.with_column("num_x", df['x'].apply(f, return_dtype=daft.DataType.int64())).collect() + >>> df.with_column("num_x", df["x"].apply(f, return_dtype=daft.DataType.int64())).collect() ╭──────┬───────╮ │ x ┆ num_x │ │ --- ┆ --- │ @@ -1052,12 +1054,12 @@ def batch_func(self_series): )(self) def is_null(self) -> Expression: - """Checks if values in the Expression are Null (a special value indicating missing data) + """Checks if values in the Expression are Null (a special value indicating missing data). Example: >>> import daft - >>> df = daft.from_pydict({"x": [1., None, float("nan")]}) - >>> df = df.select(df['x'].is_null()) + >>> df = daft.from_pydict({"x": [1.0, None, float("nan")]}) + >>> df = df.select(df["x"].is_null()) >>> df.collect() ╭─────────╮ │ x │ @@ -1080,12 +1082,12 @@ def is_null(self) -> Expression: return Expression._from_pyexpr(expr) def not_null(self) -> Expression: - """Checks if values in the Expression are not Null (a special value indicating missing data) + """Checks if values in the Expression are not Null (a special value indicating missing data). Example: >>> import daft - >>> df = daft.from_pydict({"x": [1., None, float("nan")]}) - >>> df = df.select(df['x'].not_null()) + >>> df = daft.from_pydict({"x": [1.0, None, float("nan")]}) + >>> df = df.select(df["x"].not_null()) >>> df.collect() ╭─────────╮ │ x │ @@ -1108,7 +1110,7 @@ def not_null(self) -> Expression: return Expression._from_pyexpr(expr) def fill_null(self, fill_value: Expression) -> Expression: - """Fills null values in the Expression with the provided fill_value + """Fills null values in the Expression with the provided fill_value. Example: >>> import daft @@ -1132,13 +1134,12 @@ def fill_null(self, fill_value: Expression) -> Expression: Returns: Expression: Expression with null values filled with the provided fill_value """ - fill_value = Expression._to_expression(fill_value) expr = self._expr.fill_null(fill_value._expr) return Expression._from_pyexpr(expr) def is_in(self, other: Any) -> Expression: - """Checks if values in the Expression are in the provided list + """Checks if values in the Expression are in the provided list. Example: >>> import daft @@ -1162,7 +1163,6 @@ def is_in(self, other: Any) -> Expression: Returns: Expression: Boolean Expression indicating whether values are in the provided list """ - if not isinstance(other, Expression): series = item_to_series("items", other) other = Expression._to_expression(series) @@ -1204,8 +1204,7 @@ def between(self, lower: Any, upper: Any) -> Expression: return Expression._from_pyexpr(expr) def hash(self, seed: Any | None = None) -> Expression: - """ - Hashes the values in the Expression. + """Hashes the values in the Expression. Uses the `XXH3_64bits `_ non-cryptographic hash function to hash the values in the expression. @@ -1230,8 +1229,7 @@ def minhash( seed: int = 1, hash_function: Literal["murmurhash3", "xxhash", "sha1"] = "murmurhash3", ) -> Expression: - """ - Runs the MinHash algorithm on the series. + """Runs the MinHash algorithm on the series. For a string, calculates the minimum hash over all its ngrams, repeating with `num_hashes` permutations. Returns as a list of 32-bit unsigned integers. 
@@ -1299,7 +1297,7 @@ def from_expression(cls: type[SomeExpressionNamespace], expr: Expression) -> Som class ExpressionUrlNamespace(ExpressionNamespace): @staticmethod def _should_use_multithreading_tokio_runtime() -> bool: - """Whether or not our expression should use the multithreaded tokio runtime under the hood, or a singlethreaded one + """Whether or not our expression should use the multithreaded tokio runtime under the hood, or a singlethreaded one. This matters because for distributed workloads, each process has its own tokio I/O runtime. if each distributed process is multithreaded (by default we spin up `N_CPU` threads) then we will be running `(N_CPU * N_PROC)` number of threads, and @@ -1316,7 +1314,7 @@ def _should_use_multithreading_tokio_runtime() -> bool: @staticmethod def _override_io_config_max_connections(max_connections: int, io_config: IOConfig | None) -> IOConfig: - """Use a user-provided `max_connections` argument to override the value in S3Config + """Use a user-provided `max_connections` argument to override the value in S3Config. This is because our Rust code under the hood actually does `min(S3Config's max_connections, url_download's max_connections)` to determine how many connections to allow per-thread. Thus we need to override the io_config here to ensure that the user's max_connections @@ -1333,7 +1331,7 @@ def download( io_config: IOConfig | None = None, use_native_downloader: bool = True, ) -> Expression: - """Treats each string as a URL, and downloads the bytes contents as a bytes column + """Treats each string as a URL, and downloads the bytes contents as a bytes column. .. NOTE:: If you are observing excessive S3 issues (such as timeouts, DNS errors or slowdown errors) during URL downloads, @@ -1387,13 +1385,13 @@ def upload( max_connections: int = 32, io_config: IOConfig | None = None, ) -> Expression: - """Uploads a column of binary data to the provided location (also supports S3, local etc) + """Uploads a column of binary data to the provided location (also supports S3, local etc). Files will be written into the location (folder) with a generated UUID filename, and the result will be returned as a column of string paths that is compatible with the ``.url.download()`` Expression. Example: - >>> col("data").url.upload("s3://my-bucket/my-folder") # doctest: +SKIP + >>> col("data").url.upload("s3://my-bucket/my-folder") # doctest: +SKIP Args: location: a folder location to upload data into @@ -1415,14 +1413,14 @@ def upload( class ExpressionFloatNamespace(ExpressionNamespace): def is_nan(self) -> Expression: - """Checks if values are NaN (a special float value indicating not-a-number) + """Checks if values are NaN (a special float value indicating not-a-number). .. NOTE:: Nulls will be propagated! I.e. this operation will return a null for null values. 
Example: >>> import daft - >>> df = daft.from_pydict({"data": [1., None, float("nan")]}) + >>> df = daft.from_pydict({"data": [1.0, None, float("nan")]}) >>> df = df.select(df["data"].float.is_nan()) >>> df.collect() ╭─────────╮ @@ -1452,7 +1450,7 @@ def is_inf(self) -> Expression: Example: >>> import daft - >>> df = daft.from_pydict({"data": [-float("inf"), 0., float("inf"), None]}) + >>> df = daft.from_pydict({"data": [-float("inf"), 0.0, float("inf"), None]}) >>> df = df.select(df["data"].float.is_inf()) >>> df.collect() ╭─────────╮ @@ -1477,7 +1475,7 @@ def is_inf(self) -> Expression: return Expression._from_pyexpr(native.is_inf(self._expr)) def not_nan(self) -> Expression: - """Checks if values are not NaN (a special float value indicating not-a-number) + """Checks if values are not NaN (a special float value indicating not-a-number). .. NOTE:: Nulls will be propagated! I.e. this operation will return a null for null values. @@ -1507,7 +1505,7 @@ def not_nan(self) -> Expression: return Expression._from_pyexpr(native.not_nan(self._expr)) def fill_nan(self, fill_value: Expression) -> Expression: - """Fills NaN values in the Expression with the provided fill_value + """Fills NaN values in the Expression with the provided fill_value. Example: >>> import daft @@ -1531,7 +1529,6 @@ def fill_nan(self, fill_value: Expression) -> Expression: Returns: Expression: Expression with Nan values filled with the provided fill_value """ - fill_value = Expression._to_expression(fill_value) expr = native.fill_nan(self._expr, fill_value._expr) return Expression._from_pyexpr(expr) @@ -1539,7 +1536,7 @@ def fill_nan(self, fill_value: Expression) -> Expression: class ExpressionDatetimeNamespace(ExpressionNamespace): def date(self) -> Expression: - """Retrieves the date for a datetime column + """Retrieves the date for a datetime column. Example: >>> import daft, datetime @@ -1574,7 +1571,7 @@ def date(self) -> Expression: return Expression._from_pyexpr(native.dt_date(self._expr)) def day(self) -> Expression: - """Retrieves the day for a datetime column + """Retrieves the day for a datetime column. Example: >>> import daft, datetime @@ -1609,7 +1606,7 @@ def day(self) -> Expression: return Expression._from_pyexpr(native.dt_day(self._expr)) def hour(self) -> Expression: - """Retrieves the day for a datetime column + """Retrieves the hour for a datetime column. Example: >>> import daft, datetime @@ -1644,7 +1641,7 @@ def hour(self) -> Expression: return Expression._from_pyexpr(native.dt_hour(self._expr)) def minute(self) -> Expression: - """Retrieves the minute for a datetime column + """Retrieves the minute for a datetime column. Example: >>> import daft, datetime @@ -1679,7 +1676,7 @@ def minute(self) -> Expression: return Expression._from_pyexpr(native.dt_minute(self._expr)) def second(self) -> Expression: - """Retrieves the second for a datetime column + """Retrieves the second for a datetime column. Example: >>> import daft, datetime @@ -1714,7 +1711,7 @@ def second(self) -> Expression: return Expression._from_pyexpr(native.dt_second(self._expr)) def time(self) -> Expression: - """Retrieves the time for a datetime column + """Retrieves the time for a datetime column. Example: >>> import daft, datetime @@ -1749,11 +1746,12 @@ def time(self) -> Expression: return Expression._from_pyexpr(native.dt_time(self._expr)) def month(self) -> Expression: - """Retrieves the month for a datetime column + """Retrieves the month for a datetime column.
Example: >>> import daft, datetime - >>> df = daft.from_pydict({ + >>> df = daft.from_pydict( + ... { ... "datetime": [ ... datetime.datetime(2024, 7, 3, 0, 0, 0), ... datetime.datetime(2024, 6, 4, 0, 0, 0), @@ -1782,11 +1780,12 @@ def month(self) -> Expression: return Expression._from_pyexpr(native.dt_month(self._expr)) def year(self) -> Expression: - """Retrieves the year for a datetime column + """Retrieves the year for a datetime column. Example: >>> import daft, datetime - >>> df = daft.from_pydict({ + >>> df = daft.from_pydict( + ... { ... "datetime": [ ... datetime.datetime(2024, 7, 3, 0, 0, 0), ... datetime.datetime(2023, 7, 4, 0, 0, 0), @@ -1816,11 +1815,12 @@ def year(self) -> Expression: return Expression._from_pyexpr(native.dt_year(self._expr)) def day_of_week(self) -> Expression: - """Retrieves the day of the week for a datetime column, starting at 0 for Monday and ending at 6 for Sunday + """Retrieves the day of the week for a datetime column, starting at 0 for Monday and ending at 6 for Sunday. Example: >>> import daft, datetime - >>> df = daft.from_pydict({ + >>> df = daft.from_pydict( + ... { ... "datetime": [ ... datetime.datetime(2024, 7, 3, 0, 0, 0), ... datetime.datetime(2024, 7, 4, 0, 0, 0), @@ -1849,11 +1849,12 @@ def day_of_week(self) -> Expression: return Expression._from_pyexpr(native.dt_day_of_week(self._expr)) def truncate(self, interval: str, relative_to: Expression | None = None) -> Expression: - """Truncates the datetime column to the specified interval + """Truncates the datetime column to the specified interval. Example: >>> import daft, datetime - >>> df = daft.from_pydict({ + >>> df = daft.from_pydict( + ... { ... "datetime": [ ... datetime.datetime(2021, 1, 1, 0, 1, 1), ... datetime.datetime(2021, 1, 1, 0, 1, 59), @@ -1889,7 +1890,7 @@ def truncate(self, interval: str, relative_to: Expression | None = None) -> Expr class ExpressionStringNamespace(ExpressionNamespace): def contains(self, substr: str | Expression) -> Expression: - """Checks whether each string contains the given pattern in a string column + """Checks whether each string contains the given pattern in a string column. Example: >>> import daft @@ -1920,7 +1921,7 @@ def contains(self, substr: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_contains(self._expr, substr_expr._expr)) def match(self, pattern: str | Expression) -> Expression: - """Checks whether each string matches the given regular expression pattern in a string column + """Checks whether each string matches the given regular expression pattern in a string column. Example: >>> import daft @@ -1950,7 +1951,7 @@ def match(self, pattern: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_match(self._expr, pattern_expr._expr)) def endswith(self, suffix: str | Expression) -> Expression: - """Checks whether each string ends with the given pattern in a string column + """Checks whether each string ends with the given pattern in a string column. Example: >>> import daft @@ -1980,7 +1981,7 @@ def endswith(self, suffix: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_endswith(self._expr, suffix_expr._expr)) def startswith(self, prefix: str | Expression) -> Expression: - """Checks whether each string starts with the given pattern in a string column + """Checks whether each string starts with the given pattern in a string column. 
Example: >>> import daft @@ -2061,7 +2062,7 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression: return Expression._from_pyexpr(native.utf8_split(self._expr, pattern_expr._expr, regex)) def concat(self, other: str | Expression) -> Expression: - """Concatenates two string expressions together + """Concatenates two string expressions together. .. NOTE:: Another (easier!) way to invoke this functionality is using the Python `+` operator which is @@ -2145,7 +2146,7 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression: Returns: Expression: a String expression with the extracted regex match - See also: + See Also: `extract_all` """ pattern_expr = Expression._to_expression(pattern) @@ -2201,7 +2202,7 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression: Returns: Expression: a List[Utf8] expression with the extracted regex matches - See also: + See Also: `extract` """ pattern_expr = Expression._to_expression(pattern) @@ -2267,7 +2268,7 @@ def replace( ) def length(self) -> Expression: - """Retrieves the length for a UTF-8 string column + """Retrieves the length for a UTF-8 string column. Example: >>> import daft @@ -2321,7 +2322,7 @@ def length_bytes(self) -> Expression: return Expression._from_pyexpr(native.utf8_length_bytes(self._expr)) def lower(self) -> Expression: - """Convert UTF-8 string to all lowercase + """Convert UTF-8 string to all lowercase. Example: >>> import daft @@ -2348,7 +2349,7 @@ def lower(self) -> Expression: return Expression._from_pyexpr(native.utf8_lower(self._expr)) def upper(self) -> Expression: - """Convert UTF-8 string to all upper + """Convert UTF-8 string to all upper. Example: >>> import daft @@ -2375,7 +2376,7 @@ def upper(self) -> Expression: return Expression._from_pyexpr(native.utf8_upper(self._expr)) def lstrip(self) -> Expression: - """Strip whitespace from the left side of a UTF-8 string + """Strip whitespace from the left side of a UTF-8 string. Example: >>> import daft @@ -2402,7 +2403,7 @@ def lstrip(self) -> Expression: return Expression._from_pyexpr(native.utf8_lstrip(self._expr)) def rstrip(self) -> Expression: - """Strip whitespace from the right side of a UTF-8 string + """Strip whitespace from the right side of a UTF-8 string. Example: >>> import daft @@ -2429,7 +2430,7 @@ def rstrip(self) -> Expression: return Expression._from_pyexpr(native.utf8_rstrip(self._expr)) def reverse(self) -> Expression: - """Reverse a UTF-8 string + """Reverse a UTF-8 string. Example: >>> import daft @@ -2456,7 +2457,7 @@ def reverse(self) -> Expression: return Expression._from_pyexpr(native.utf8_reverse(self._expr)) def capitalize(self) -> Expression: - """Capitalize a UTF-8 string + """Capitalize a UTF-8 string. Example: >>> import daft @@ -2483,7 +2484,7 @@ def capitalize(self) -> Expression: return Expression._from_pyexpr(native.utf8_capitalize(self._expr)) def left(self, nchars: int | Expression) -> Expression: - """Gets the n (from nchars) left-most characters of each string + """Gets the n (from nchars) left-most characters of each string. Example: >>> import daft @@ -2511,7 +2512,7 @@ def left(self, nchars: int | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_left(self._expr, nchars_expr._expr)) def right(self, nchars: int | Expression) -> Expression: - """Gets the n (from nchars) right-most characters of each string + """Gets the n (from nchars) right-most characters of each string. 
Example: >>> import daft @@ -2539,7 +2540,7 @@ def right(self, nchars: int | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_right(self._expr, nchars_expr._expr)) def find(self, substr: str | Expression) -> Expression: - """Returns the index of the first occurrence of the substring in each string + """Returns the index of the first occurrence of the substring in each string. .. NOTE:: The returned index is 0-based. @@ -2571,7 +2572,7 @@ def find(self, substr: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_find(self._expr, substr_expr._expr)) def rpad(self, length: int | Expression, pad: str | Expression) -> Expression: - """Right-pads each string by truncating or padding with the character + """Right-pads each string by truncating or padding with the character. .. NOTE:: If the string is longer than the specified length, it will be truncated. @@ -2604,7 +2605,7 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_rpad(self._expr, length_expr._expr, pad_expr._expr)) def lpad(self, length: int | Expression, pad: str | Expression) -> Expression: - """Left-pads each string by truncating on the right or padding with the character + """Left-pads each string by truncating on the right or padding with the character. .. NOTE:: If the string is longer than the specified length, it will be truncated on the right. @@ -2637,7 +2638,7 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_lpad(self._expr, length_expr._expr, pad_expr._expr)) def repeat(self, n: int | Expression) -> Expression: - """Repeats each string n times + """Repeats each string n times. Example: >>> import daft @@ -2665,7 +2666,7 @@ def repeat(self, n: int | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_repeat(self._expr, n_expr._expr)) def like(self, pattern: str | Expression) -> Expression: - """Checks whether each string matches the given SQL LIKE pattern, case sensitive + """Checks whether each string matches the given SQL LIKE pattern, case sensitive. .. NOTE:: Use % as a multiple-character wildcard or _ as a single-character wildcard. @@ -2696,7 +2697,7 @@ def like(self, pattern: str | Expression) -> Expression: return Expression._from_pyexpr(native.utf8_like(self._expr, pattern_expr._expr)) def ilike(self, pattern: str | Expression) -> Expression: - """Checks whether each string matches the given SQL LIKE pattern, case insensitive + """Checks whether each string matches the given SQL LIKE pattern, case insensitive. .. NOTE:: Use % as a multiple-character wildcard or _ as a single-character wildcard. @@ -2735,7 +2736,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None Example: >>> import daft >>> df = daft.from_pydict({"x": ["daft", "query", "engine"]}) - >>> df = df.select(df["x"].str.substr(2,4)) + >>> df = df.select(df["x"].str.substr(2, 4)) >>> df.show() ╭──────╮ │ x │ @@ -2759,7 +2760,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None return Expression._from_pyexpr(native.utf8_substr(self._expr, start_expr._expr, length_expr._expr)) def to_date(self, format: str) -> Expression: - """Converts a string to a date using the specified format + """Converts a string to a date using the specified format. .. NOTE:: The format must be a valid date format string. 
@@ -2790,7 +2791,7 @@ def to_date(self, format: str) -> Expression: return Expression._from_pyexpr(native.utf8_to_date(self._expr, format)) def to_datetime(self, format: str, timezone: str | None = None) -> Expression: - """Converts a string to a datetime using the specified format and timezone + """Converts a string to a datetime using the specified format and timezone. .. NOTE:: The format must be a valid datetime format string. @@ -2818,7 +2819,9 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression: If a timezone is provided, the datetime will be parsed in that timezone >>> df = daft.from_pydict({"x": ["2021-01-01 00:00:00.123 +0800", "2021-01-02 12:30:00.456 +0800", None]}) - >>> df = df.with_column("datetime", df["x"].str.to_datetime("%Y-%m-%d %H:%M:%S%.3f %z", timezone="Asia/Shanghai")) + >>> df = df.with_column( + ... "datetime", df["x"].str.to_datetime("%Y-%m-%d %H:%M:%S%.3f %z", timezone="Asia/Shanghai") + ... ) >>> df.show() ╭───────────────────────────────┬────────────────────────────────────────────────╮ │ x ┆ datetime │ @@ -2847,15 +2850,17 @@ def normalize( nfd_unicode: bool = False, white_space: bool = False, ): - """Normalizes a string for more useful deduplication. + r"""Normalizes a string for more useful deduplication. .. NOTE:: All processing options are off by default. Example: >>> import daft - >>> df = daft.from_pydict({"x": ["hello world", "Hello, world!", "HELLO, \\nWORLD!!!!"]}) - >>> df = df.with_column("normalized", df["x"].str.normalize(remove_punct=True, lowercase=True, white_space=True)) + >>> df = daft.from_pydict({"x": ["hello world", "Hello, world!", "HELLO, \nWORLD!!!!"]}) + >>> df = df.with_column( + ... "normalized", df["x"].str.normalize(remove_punct=True, lowercase=True, white_space=True) + ... ) >>> df.show() ╭───────────────┬─────────────╮ │ x ┆ normalized │ @@ -2915,7 +2920,6 @@ def tokenize_encode( Returns: Expression: An expression with the encodings of the strings as lists of unsigned 32-bit integers. """ - # if special tokens are passed in, enable using special tokens if use_special_tokens is None: use_special_tokens = special_tokens is not None @@ -2963,8 +2967,7 @@ def count_matches( whole_words: bool = False, case_sensitive: bool = True, ): - """ - Counts the number of times a pattern, or multiple patterns, appear in a string. + """Counts the number of times a pattern, or multiple patterns, appear in a string. .. NOTE:: If a pattern is a substring of another pattern, the longest pattern is matched first. @@ -2995,7 +2998,7 @@ def count_matches( class ExpressionListNamespace(ExpressionNamespace): def join(self, delimiter: str | Expression) -> Expression: - """Joins every element of a list using the specified string delimiter + """Joins every element of a list using the specified string delimiter. Args: delimiter (str | Expression): the delimiter to use to join lists with @@ -3041,7 +3044,7 @@ def value_counts(self) -> Expression: return Expression._from_pyexpr(native.list_value_counts(self._expr)) def count(self, mode: Literal["all", "valid", "null"] | CountMode = CountMode.Valid) -> Expression: - """Counts the number of elements in each list + """Counts the number of elements in each list. Args: mode: A string ("all", "valid", or "null") that represents whether to count all values, non-null (valid) values, or null values. Defaults to "valid". 
@@ -3054,7 +3057,7 @@ def count(self, mode: Literal["all", "valid", "null"] | CountMode = CountMode.Va return Expression._from_pyexpr(native.list_count(self._expr, mode)) def lengths(self) -> Expression: - """Gets the length of each list + """Gets the length of each list. (DEPRECATED) Please use Expression.list.length instead @@ -3069,7 +3072,7 @@ def lengths(self) -> Expression: return Expression._from_pyexpr(native.list_count(self._expr, CountMode.All)) def length(self) -> Expression: - """Gets the length of each list + """Gets the length of each list. Returns: Expression: a UInt64 expression which is the length of each list @@ -3077,7 +3080,7 @@ def length(self) -> Expression: return Expression._from_pyexpr(native.list_count(self._expr, CountMode.All)) def get(self, idx: int | Expression, default: object = None) -> Expression: - """Gets the element at an index in each list + """Gets the element at an index in each list. Args: idx: index or indices to retrieve from each list @@ -3091,7 +3094,7 @@ def get(self, idx: int | Expression, default: object = None) -> Expression: return Expression._from_pyexpr(native.list_get(self._expr, idx_expr._expr, default_expr._expr)) def slice(self, start: int | Expression, end: int | Expression | None = None) -> Expression: - """Gets a subset of each list + """Gets a subset of each list. Args: start: index or column of indices. The slice will include elements starting from this index. If `start` is negative, it represents an offset from the end of the list @@ -3105,7 +3108,7 @@ def slice(self, start: int | Expression, end: int | Expression | None = None) -> return Expression._from_pyexpr(native.list_slice(self._expr, start_expr._expr, end_expr._expr)) def chunk(self, size: int) -> Expression: - """Splits each list into chunks of the given size + """Splits each list into chunks of the given size. Args: size: size of chunks to split the list into. Must be greater than 0 @@ -3186,7 +3189,7 @@ def sort(self, desc: bool | Expression = False, nulls_first: bool | Expression | class ExpressionStructNamespace(ExpressionNamespace): def get(self, name: str) -> Expression: - """Retrieves one field from a struct column + """Retrieves one field from a struct column. Args: name: the name of the field to retrieve @@ -3199,12 +3202,12 @@ def get(self, name: str) -> Expression: class ExpressionMapNamespace(ExpressionNamespace): def get(self, key: Expression) -> Expression: - """Retrieves the value for a key in a map column + """Retrieves the value for a key in a map column. Example: >>> import pyarrow as pa >>> import daft - >>> pa_array = pa.array([[("a", 1)],[],[("b", 2)]], type=pa.map_(pa.string(), pa.int64())) + >>> pa_array = pa.array([[("a", 1)], [], [("b", 2)]], type=pa.map_(pa.string(), pa.int64())) >>> df = daft.from_arrow(pa.table({"map_col": pa_array})) >>> df = df.with_column("a", df["map_col"].map.get("a")) >>> df.show() @@ -3237,7 +3240,7 @@ def get(self, key: Expression) -> Expression: class ExpressionsProjection(Iterable[Expression]): - """A collection of Expressions that can be projected onto a Table to produce another Table + """A collection of Expressions that can be projected onto a Table to produce another Table. Invariants: 1. 
All Expressions have names @@ -3315,7 +3318,7 @@ def to_name_set(self) -> set[str]: return {e.name() for e in self} def input_mapping(self) -> dict[str, str]: - """Returns a map of {output_name: input_name} for all expressions that are just no-ops/aliases of an existing input""" + """Returns a map of {output_name: input_name} for all expressions that are just no-ops/aliases of an existing input.""" result = {} for e in self: input_map = e._input_mapping() @@ -3350,8 +3353,7 @@ def decode( on_error: Literal["raise", "null"] = "raise", mode: str | ImageMode | None = None, ) -> Expression: - """ - Decodes the binary data in this column into images. + """Decodes the binary data in this column into images. This can only be applied to binary columns that contain encoded images (e.g. PNG, JPEG, etc.) @@ -3379,9 +3381,7 @@ def decode( return Expression._from_pyexpr(native.image_decode(self._expr, raise_on_error=raise_on_error, mode=mode)) def encode(self, image_format: str | ImageFormat) -> Expression: - """ - Encode an image column as the provided image file format, returning a binary column - of encoded bytes. + """Encode an image column as the provided image file format, returning a binary column of encoded bytes. Args: image_format: The image file format into which the images will be encoded. @@ -3396,8 +3396,7 @@ def encode(self, image_format: str | ImageFormat) -> Expression: return Expression._from_pyexpr(native.image_encode(self._expr, image_format)) def resize(self, w: int, h: int) -> Expression: - """ - Resize image into the provided width and height. + """Resize image into the provided width and height. Args: w: Desired width of the resized image. @@ -3413,8 +3412,7 @@ def resize(self, w: int, h: int) -> Expression: return Expression._from_pyexpr(native.image_resize(self._expr, w, h)) def crop(self, bbox: tuple[int, int, int, int] | Expression) -> Expression: - """ - Crops images with the provided bounding box + """Crops images with the provided bounding box. Args: bbox (tuple[float, float, float, float] | Expression): Either a tuple of (x, y, width, height) @@ -3443,7 +3441,7 @@ def to_mode(self, mode: str | ImageMode) -> Expression: class ExpressionPartitioningNamespace(ExpressionNamespace): def days(self) -> Expression: - """Partitioning Transform that returns the number of days since epoch (1970-01-01) + """Partitioning Transform that returns the number of days since epoch (1970-01-01). Returns: Expression: Int32 Expression in days @@ -3451,7 +3449,7 @@ def days(self) -> Expression: return Expression._from_pyexpr(self._expr.partitioning_days()) def hours(self) -> Expression: - """Partitioning Transform that returns the number of hours since epoch (1970-01-01) + """Partitioning Transform that returns the number of hours since epoch (1970-01-01). Returns: Expression: Int32 Expression in hours @@ -3459,26 +3457,25 @@ def hours(self) -> Expression: return Expression._from_pyexpr(self._expr.partitioning_hours()) def months(self) -> Expression: - """Partitioning Transform that returns the number of months since epoch (1970-01-01) + """Partitioning Transform that returns the number of months since epoch (1970-01-01). Returns: Expression: Int32 Expression in months """ - return Expression._from_pyexpr(self._expr.partitioning_months()) def years(self) -> Expression: - """Partitioning Transform that returns the number of years since epoch (1970-01-01) + """Partitioning Transform that returns the number of years since epoch (1970-01-01). 
Returns: Expression: Int32 Expression in years """ - return Expression._from_pyexpr(self._expr.partitioning_years()) def iceberg_bucket(self, n: int) -> Expression: - """Partitioning Transform that returns the Hash Bucket following the Iceberg Specification of murmur3_32_x86 - https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements + """Partitioning Transform that returns the Hash Bucket following the Iceberg Specification of murmur3_32_x86. + + See https://iceberg.apache.org/spec/#appendix-b-32-bit-hash-requirements for more details. Args: n (int): Number of buckets @@ -3490,7 +3487,8 @@ def iceberg_bucket(self, n: int) -> Expression: def iceberg_truncate(self, w: int) -> Expression: """Partitioning Transform that truncates the input to a standard width `w` following the Iceberg Specification. - https://iceberg.apache.org/spec/#truncate-transform-details + + https://iceberg.apache.org/spec/#truncate-transform-details. Args: w (int): width of the truncation @@ -3503,7 +3501,8 @@ def iceberg_truncate(self, w: int) -> Expression: class ExpressionJsonNamespace(ExpressionNamespace): def query(self, jq_query: str) -> Expression: - """Query JSON data in a column using a JQ-style filter https://jqlang.github.io/jq/manual/ + """Query JSON data in a column using a JQ-style filter https://jqlang.github.io/jq/manual/. + This expression uses jaq as the underlying executor, see https://github.com/01mf02/jaq for the full list of supported filters. Example: @@ -3530,11 +3529,10 @@ def query(self, jq_query: str) -> Expression: Returns: Expression: Expression representing the result of the JQ query as a column of JSON-compatible strings """ - return Expression._from_pyexpr(native.json_query(self._expr, jq_query)) class ExpressionEmbeddingNamespace(ExpressionNamespace): def cosine_distance(self, other: Expression) -> Expression: - """Compute the cosine distance between two embeddings""" + """Compute the cosine distance between two embeddings.""" return Expression._from_pyexpr(native.cosine_distance(self._expr, other._expr)) diff --git a/daft/expressions/testing.py b/daft/expressions/testing.py index d3c7f0e70b..8d64121755 100644 --- a/daft/expressions/testing.py +++ b/daft/expressions/testing.py @@ -9,9 +9,11 @@ def expr_structurally_equal(e1: Expression, e2: Expression) -> bool: - """Returns a boolean indicating whether two Expressions are structurally equal: + """Returns a boolean indicating whether two Expressions are structurally equal. + + Structurally equal is defined as: 1. Expressions' local parameters are value-wise equal 2. Expressions have the same number of children Expressions - 3. (Recursive) Expressions' children are structurally equal to each other as well + 3. (Recursive) Expressions' children are structurally equal to each other as well. """ return _eq(e1._expr, e2._expr) diff --git a/daft/filesystem.py b/daft/filesystem.py index 22438d6365..88ed7a9985 100644 --- a/daft/filesystem.py +++ b/daft/filesystem.py @@ -23,8 +23,7 @@ def _get_fs_from_cache(protocol: str, io_config: IOConfig | None) -> pafs.FileSystem | None: - """ - Get an instantiated pyarrow filesystem from the cache based on the URI protocol. + """Get an instantiated pyarrow filesystem from the cache based on the URI protocol. Returns None if no such cache entry exists. 
""" @@ -106,10 +105,7 @@ def get_protocol_from_path(path: str) -> str: def canonicalize_protocol(protocol: str) -> str: - """ - Return the canonical protocol from the provided protocol, such that there's a 1:1 - mapping between protocols and pyarrow/fsspec filesystem implementations. - """ + """Return the canonical protocol from the provided protocol, such that there's a 1:1 mapping between protocols and pyarrow/fsspec filesystem implementations.""" return _CANONICAL_PROTOCOLS.get(protocol, protocol) @@ -117,9 +113,9 @@ def _resolve_paths_and_filesystem( paths: str | pathlib.Path | list[str], io_config: IOConfig | None = None, ) -> tuple[list[str], pafs.FileSystem]: - """ - Resolves and normalizes all provided paths, infers a filesystem from the - paths, and ensures that all paths use the same filesystem. + """Resolves and normalizes the provided path and infers it's filesystem. + + Also ensures that the inferred filesystem is compatible with the passed filesystem, if provided. Args: paths: A single file/directory path or a list of file/directory paths. @@ -191,10 +187,9 @@ def _infer_filesystem( path: str, io_config: IOConfig | None, ) -> tuple[str, pafs.FileSystem]: - """ - Resolves and normalizes the provided path, infers a filesystem from the - path, and ensures that the inferred filesystem is compatible with the passed - filesystem, if provided. + """Resolves and normalizes the provided path and infers it's filesystem. + + Also ensures that the inferred filesystem is compatible with the passedfilesystem, if provided. Args: path: A single file/directory path. @@ -205,7 +200,7 @@ def _infer_filesystem( translated_kwargs: dict[str, Any] def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None): - """Helper method used when setting kwargs for pyarrow""" + """Helper method used when setting kwargs for pyarrow.""" if val is not None: kwargs[key] = val @@ -302,9 +297,7 @@ def _set_if_not_none(kwargs: dict[str, Any], key: str, val: Any | None): def _unwrap_protocol(path): - """ - Slice off any protocol prefixes on path. - """ + """Slice off any protocol prefixes on path.""" parsed = urllib.parse.urlparse(path, allow_fragments=False) # support '#' in path query = "?" + parsed.query if parsed.query else "" # support '?' in path return parsed.netloc + parsed.path + query @@ -350,10 +343,7 @@ def glob_path_with_stats( def join_path(fs: pafs.FileSystem, base_path: str, *sub_paths: str) -> str: - """ - Join a base path with sub-paths using the appropriate path separator - for the given filesystem. - """ + """Join a base path with sub-paths using the appropriate path separator for the given filesystem.""" if isinstance(fs, pafs.LocalFileSystem): return os.path.join(base_path, *sub_paths) else: diff --git a/daft/hudi/hudi_scan.py b/daft/hudi/hudi_scan.py index 5c10476947..d68cfd99bb 100644 --- a/daft/hudi/hudi_scan.py +++ b/daft/hudi/hudi_scan.py @@ -61,7 +61,7 @@ def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]: files_metadata = hudi_table_metadata.files_metadata if len(self.partitioning_keys()) > 0 and pushdowns.partition_filters is None: - logging.warning( + logger.warning( "%s has partitioning keys = %s, but no partition filter was specified. 
This will result in a full table scan.", self.display_name(), self.partitioning_keys(), diff --git a/daft/iceberg/iceberg_scan.py b/daft/iceberg/iceberg_scan.py index 474eba36a4..261e162b80 100644 --- a/daft/iceberg/iceberg_scan.py +++ b/daft/iceberg/iceberg_scan.py @@ -157,7 +157,7 @@ def to_scan_tasks(self, pushdowns: Pushdowns) -> Iterator[ScanTask]: limit_files = limit is not None and pushdowns.filters is None and pushdowns.partition_filters is None if len(self.partitioning_keys()) > 0 and pushdowns.partition_filters is None: - logging.warning( + logger.warning( "%s has Partitioning Keys: %s but no partition filter was specified. This will result in a full table scan.", self.display_name(), self.partitioning_keys(), diff --git a/daft/iceberg/iceberg_write.py b/daft/iceberg/iceberg_write.py index 07c7bea9f8..4888b83351 100644 --- a/daft/iceberg/iceberg_write.py +++ b/daft/iceberg/iceberg_write.py @@ -36,7 +36,7 @@ def get_missing_columns(data_schema: "pa.Schema", iceberg_schema: "IcebergSchema def coerce_pyarrow_table_to_schema(pa_table: "pa.Table", schema: "pa.Schema") -> "pa.Table": - """Coerces a PyArrow table to the supplied schema + """Coerces a PyArrow table to the supplied schema. 1. For each field in `pa_table`, cast it to the field in `input_schema` if one with a matching name is available @@ -114,11 +114,10 @@ def partition_field_to_expr(field: "IcebergPartitionField", schema: "IcebergSche def to_partition_representation(value: Any): - """ - Converts a partition value to the format expected by Iceberg metadata. + """Converts a partition value to the format expected by Iceberg metadata. + Most transforms already do this, but the identity transforms preserve the original value type so we need to convert it. """ - if value is None: return None diff --git a/daft/iceberg/schema_field_id_mapping_visitor.py b/daft/iceberg/schema_field_id_mapping_visitor.py index d48b85510b..f7b6258fdb 100644 --- a/daft/iceberg/schema_field_id_mapping_visitor.py +++ b/daft/iceberg/schema_field_id_mapping_visitor.py @@ -19,7 +19,7 @@ def _nested_field_to_daft_pyfield(field: NestedField) -> PyField: class SchemaFieldIdMappingVisitor(SchemaVisitor[FieldIdMapping]): - """Extracts a mapping of {field_id: PyField} from an Iceberg schema""" + """Extracts a mapping of {field_id: PyField} from an Iceberg schema.""" def schema(self, schema: Schema, struct_result: FieldIdMapping) -> FieldIdMapping: """Visit a Schema.""" diff --git a/daft/internal/gpu.py b/daft/internal/gpu.py index 1c7a527c4d..7497839e91 100644 --- a/daft/internal/gpu.py +++ b/daft/internal/gpu.py @@ -4,8 +4,7 @@ def _raw_device_count_nvml() -> int: - """ - Return number of devices as reported by NVML or zero if NVML discovery/initialization failed. + """Return number of devices as reported by NVML or zero if NVML discovery/initialization failed. 
Inspired by PyTorch: https://github.com/pytorch/pytorch/blob/88e54de21976aa504e797e47f06b480b9108ef5c/torch/cuda/__init__.py#L711 """ diff --git a/daft/io/__init__.py b/daft/io/__init__.py index 09647f21f2..1b7dfb402d 100644 --- a/daft/io/__init__.py +++ b/daft/io/__init__.py @@ -20,21 +20,21 @@ from daft.io.file_path import from_glob_path __all__ = [ - "read_csv", - "read_json", + "AzureConfig", + "DataCatalogTable", + "DataCatalogType", + "GCSConfig", + "HTTPConfig", + "IOConfig", + "S3Config", + "S3Credentials", "from_glob_path", - "read_parquet", + "read_csv", + "read_deltalake", "read_hudi", "read_iceberg", - "read_deltalake", + "read_json", "read_lance", + "read_parquet", "read_sql", - "IOConfig", - "S3Config", - "S3Credentials", - "AzureConfig", - "GCSConfig", - "HTTPConfig", - "DataCatalogType", - "DataCatalogTable", ] diff --git a/daft/io/_csv.py b/daft/io/_csv.py index 54d00eb164..27ebd2a096 100644 --- a/daft/io/_csv.py +++ b/daft/io/_csv.py @@ -37,7 +37,7 @@ def read_csv( _buffer_size: Optional[int] = None, _chunk_size: Optional[int] = None, ) -> DataFrame: - """Creates a DataFrame from CSV file(s) + """Creates a DataFrame from CSV file(s). Example: >>> df = daft.read_csv("/path/to/file.csv") diff --git a/daft/io/_generator.py b/daft/io/_generator.py index 41781c479b..e5c185e464 100644 --- a/daft/io/_generator.py +++ b/daft/io/_generator.py @@ -61,7 +61,6 @@ def read_generator( Returns: DataFrame: a DataFrame containing the generated data """ - generator_scan_operator = GeneratorScanOperator( generators=generators, schema=schema, diff --git a/daft/io/_iceberg.py b/daft/io/_iceberg.py index 63ef90779e..62f47babba 100644 --- a/daft/io/_iceberg.py +++ b/daft/io/_iceberg.py @@ -88,7 +88,7 @@ def read_iceberg( snapshot_id: Optional[int] = None, io_config: Optional["IOConfig"] = None, ) -> DataFrame: - """Create a DataFrame from an Iceberg table + """Create a DataFrame from an Iceberg table. Example: >>> import pyiceberg diff --git a/daft/io/_json.py b/daft/io/_json.py index 294a6bb212..3626d775a1 100644 --- a/daft/io/_json.py +++ b/daft/io/_json.py @@ -30,7 +30,7 @@ def read_json( _buffer_size: Optional[int] = None, _chunk_size: Optional[int] = None, ) -> DataFrame: - """Creates a DataFrame from line-delimited JSON file(s) + """Creates a DataFrame from line-delimited JSON file(s). Example: >>> df = daft.read_json("/path/to/file.json") diff --git a/daft/io/_lance.py b/daft/io/_lance.py index 008fc70ded..bce5bb527e 100644 --- a/daft/io/_lance.py +++ b/daft/io/_lance.py @@ -26,7 +26,7 @@ def _lancedb_table_factory_function( @PublicAPI def read_lance(url: str, io_config: Optional["IOConfig"] = None) -> DataFrame: - """Create a DataFrame from a LanceDB table + """Create a DataFrame from a LanceDB table. .. 
NOTE:: This function requires the use of `LanceDB `_, which is the Python @@ -35,7 +35,6 @@ def read_lance(url: str, io_config: Optional["IOConfig"] = None) -> DataFrame: To ensure that this is installed with Daft, you may install: ``pip install getdaft[lance]`` Example: - >>> df = daft.read_lance("s3://my-lancedb-bucket/data/") >>> df.show() @@ -46,7 +45,6 @@ def read_lance(url: str, io_config: Optional["IOConfig"] = None) -> DataFrame: Returns: DataFrame: a DataFrame with the schema converted from the specified LanceDB table """ - try: import lance except ImportError as e: diff --git a/daft/io/_parquet.py b/daft/io/_parquet.py index 0ae9b9a0f0..b65c9b5441 100644 --- a/daft/io/_parquet.py +++ b/daft/io/_parquet.py @@ -32,7 +32,7 @@ def read_parquet( _multithreaded_io: Optional[bool] = None, _chunk_size: Optional[int] = None, # A hidden parameter for testing purposes. ) -> DataFrame: - """Creates a DataFrame from Parquet file(s) + """Creates a DataFrame from Parquet file(s). Example: >>> df = daft.read_parquet("/path/to/file.parquet") diff --git a/daft/io/_sql.py b/daft/io/_sql.py index 47b8710435..1fa714fcc5 100644 --- a/daft/io/_sql.py +++ b/daft/io/_sql.py @@ -79,22 +79,12 @@ def read_sql( Read data from a SQL query and partition the data by a column: - >>> df = daft.read_sql( - ... "SELECT * FROM my_table", - ... "sqlite:///my_database.db", - ... partition_col="id" - ... ) + >>> df = daft.read_sql("SELECT * FROM my_table", "sqlite:///my_database.db", partition_col="id") Read data from a SQL query and partition the data into 3 partitions: - >>> df = daft.read_sql( - ... "SELECT * FROM my_table", - ... "sqlite:///my_database.db", - ... partition_col="id", - ... num_partitions=3 - ... ) + >>> df = daft.read_sql("SELECT * FROM my_table", "sqlite:///my_database.db", partition_col="id", num_partitions=3) """ - if num_partitions is not None and partition_col is None: raise ValueError("Failed to execute sql: partition_col must be specified when num_partitions is specified") diff --git a/daft/io/catalog.py b/daft/io/catalog.py index 859e69c6c0..246eeb67ab 100644 --- a/daft/io/catalog.py +++ b/daft/io/catalog.py @@ -23,8 +23,7 @@ class DataCatalogType(Enum): @dataclass class DataCatalogTable: - """ - A reference to a table in some database in some data catalog. + """A reference to a table in some database in some data catalog. See :class:`~.DataCatalog` """ @@ -44,8 +43,7 @@ def __post_init__(self): ) def table_uri(self, io_config: IOConfig) -> str: - """ - Get the URI of the table in the data catalog. + """Get the URI of the table in the data catalog. Returns: str: The URI of the table. diff --git a/daft/io/object_store_options.py b/daft/io/object_store_options.py index 65855455b7..72858bc835 100644 --- a/daft/io/object_store_options.py +++ b/daft/io/object_store_options.py @@ -8,9 +8,9 @@ def io_config_to_storage_options(io_config: IOConfig, table_uri: str) -> dict[str, str] | None: - """ - Converts the Daft IOConfig to a storage options dict that the object_store crate - understands. The object_store crate is used by many Rust-backed Python libraries such as + """Converts the Daft IOConfig to a storage options dict that the object_store crate understands. + + The object_store crate is used by many Rust-backed Python libraries such as delta-rs and lance. This function takes as input the table_uri, which it uses to determine the backend to be used. 
diff --git a/daft/lazy_import.py b/daft/lazy_import.py index cfeac7ac48..63234da25a 100644 --- a/daft/lazy_import.py +++ b/daft/lazy_import.py @@ -17,7 +17,8 @@ class LazyImport: - """Lazy importer + """Manages optional dependency imports. + There are certain large imports (e.g. Ray, daft.unity_catalog.UnityCatalogTable, etc.) that do not need to be top-level imports. For example, Ray should only be imported when the ray runner is used, or specific ray data extension types are needed. We can lazily import these diff --git a/daft/logical/builder.py b/daft/logical/builder.py index 412e400cfd..b7316a0a80 100644 --- a/daft/logical/builder.py +++ b/daft/logical/builder.py @@ -31,7 +31,7 @@ def _apply_daft_planning_config_to_initializer(classmethod_func: Callable[..., LogicalPlanBuilder]): - """Decorator to be applied to any @classmethod instantiation method on LogicalPlanBuilder + """Decorator to be applied to any @classmethod instantiation method on LogicalPlanBuilder. This decorator ensures that the current DaftPlanningConfig is applied to the instantiated LogicalPlanBuilder """ @@ -50,17 +50,15 @@ def wrapper(cls: type[LogicalPlanBuilder], *args, **kwargs): class LogicalPlanBuilder: - """ - A logical plan builder for the Daft DataFrame. - """ + """A logical plan builder for the Daft DataFrame.""" def __init__(self, builder: _LogicalPlanBuilder) -> None: self._builder = builder def to_physical_plan_scheduler(self, daft_execution_config: PyDaftExecutionConfig) -> PhysicalPlanScheduler: - """ - Convert the underlying logical plan to a physical plan scheduler, which is - used to generate executable tasks for the physical plan. + """Convert the underlying logical plan to a physical plan scheduler. + + The physical plan scheduler is used to generate executable tasks for the physical plan. This should be called after triggering optimization with self.optimize(). @@ -87,16 +85,12 @@ def to_adaptive_physical_plan_scheduler( ) def schema(self) -> Schema: - """ - The schema of the current logical plan. - """ + """The schema of the current logical plan.""" pyschema = self._builder.schema() return Schema._from_pyschema(pyschema) def pretty_print(self, simple: bool = False, format: str = "ascii") -> str: - """ - Pretty prints the current underlying logical plan. - """ + """Pretty prints the current underlying logical plan.""" from daft.dataframe.display import MermaidOptions if format == "ascii": @@ -110,9 +104,7 @@ def __repr__(self) -> str: return self._builder.repr_ascii(simple=False) def optimize(self) -> LogicalPlanBuilder: - """ - Optimize the underlying logical plan.
- """ + """Optimize the underlying logical plan.""" builder = self._builder.optimize() return LogicalPlanBuilder(builder) diff --git a/daft/logical/map_partition_ops.py b/daft/logical/map_partition_ops.py index b77ca97352..935894c861 100644 --- a/daft/logical/map_partition_ops.py +++ b/daft/logical/map_partition_ops.py @@ -13,11 +13,11 @@ class MapPartitionOp: @abstractmethod def get_output_schema(self) -> Schema: - """Returns the output schema after running this MapPartitionOp""" + """Returns the output schema after running this MapPartitionOp.""" @abstractmethod def run(self, input_partition: MicroPartition) -> MicroPartition: - """Runs this MapPartitionOp on the supplied vPartition""" + """Runs this MapPartitionOp on the supplied vPartition.""" class ExplodeOp(MapPartitionOp): diff --git a/daft/logical/schema.py b/daft/logical/schema.py index 5ee40dfb4b..4335d0d3d2 100644 --- a/daft/logical/schema.py +++ b/daft/logical/schema.py @@ -64,7 +64,7 @@ def _from_pyschema(schema: _PySchema) -> Schema: @classmethod def from_pyarrow_schema(cls, pa_schema: pa.Schema) -> Schema: - """Creates a Daft Schema from a PyArrow Schema + """Creates a Daft Schema from a PyArrow Schema. Args: pa_schema (pa.Schema): PyArrow schema to convert @@ -77,7 +77,7 @@ def from_pyarrow_schema(cls, pa_schema: pa.Schema) -> Schema: ) def to_pyarrow_schema(self) -> pa.Schema: - """Converts a Daft Schema to a PyArrow Schema + """Converts a Daft Schema to a PyArrow Schema. Returns: pa.Schema: PyArrow schema that corresponds to the provided Daft schema diff --git a/daft/pickle/cloudpickle.py b/daft/pickle/cloudpickle.py index 872e2edd4e..1cc1dc6cec 100644 --- a/daft/pickle/cloudpickle.py +++ b/daft/pickle/cloudpickle.py @@ -349,9 +349,13 @@ def _find_imported_submodules(code, top_level_dependencies): ``` import concurrent.futures import cloudpickle + + def func(): x = concurrent.futures.ThreadPoolExecutor - if __name__ == '__main__': + + + if __name__ == "__main__": cloudpickle.dumps(func) ``` The globals extracted by cloudpickle in the function's state include the @@ -415,6 +419,7 @@ def cell_set(cell, value): >>> def f(var): ... def g(): ... var += 1 + ... ... return g will not modify the closure variable ``var```inplace, but instead try to diff --git a/daft/plan_scheduler/physical_plan_scheduler.py b/daft/plan_scheduler/physical_plan_scheduler.py index fd8249dccb..6391d9d4d3 100644 --- a/daft/plan_scheduler/physical_plan_scheduler.py +++ b/daft/plan_scheduler/physical_plan_scheduler.py @@ -20,9 +20,7 @@ class PhysicalPlanScheduler: - """ - Generates executable tasks for an underlying physical plan. - """ + """Generates executable tasks for an underlying physical plan.""" def __init__(self, scheduler: _PhysicalPlanScheduler): self._scheduler = scheduler @@ -38,9 +36,7 @@ def num_partitions(self) -> int: return self._scheduler.num_partitions() def pretty_print(self, simple: bool = False, format: str = "ascii") -> str: - """ - Pretty prints the current underlying physical plan. - """ + """Pretty prints the current underlying physical plan.""" from daft.dataframe.display import MermaidOptions if format == "ascii": diff --git a/daft/runners/partitioning.py b/daft/runners/partitioning.py index c7a58cad4e..502dabcff8 100644 --- a/daft/runners/partitioning.py +++ b/daft/runners/partitioning.py @@ -22,7 +22,7 @@ @dataclass(frozen=True) class TableReadOptions: - """Options for reading a vPartition + """Options for reading a vPartition. 
Args: num_rows: Number of rows to read, or None to read all rows @@ -35,7 +35,7 @@ class TableReadOptions: @dataclass(frozen=True) class TableParseCSVOptions: - """Options for parsing CSVs + """Options for parsing CSVs. Args: delimiter: The delimiter to use when parsing CSVs, defaults to "," @@ -59,7 +59,7 @@ class TableParseCSVOptions: @dataclass(frozen=True) class TableParseParquetOptions: - """Options for parsing Parquet files + """Options for parsing Parquet files. Args: coerce_int96_timestamp_unit: TimeUnit to use when parsing Int96 fields @@ -196,7 +196,8 @@ def cancel(self) -> None: @abstractmethod def _noop(self, _: PartitionT) -> None: """Implement this as a no-op. - https://peps.python.org/pep-0544/#overriding-inferred-variance-of-protocol-classes + + https://peps.python.org/pep-0544/#overriding-inferred-variance-of-protocol-classes. """ ... @@ -229,10 +230,7 @@ def to_arrow(self) -> pa.Table: return merged_partition.to_arrow() def items(self) -> list[tuple[PartID, MaterializedResult[PartitionT]]]: - """ - Returns all (partition id, partition) in this PartitionSet, - ordered by partition ID. - """ + """Returns all (partition id, partition) in this PartitionSet ordered by partition ID.""" raise NotImplementedError() def values(self) -> list[MaterializedResult[PartitionT]]: diff --git a/daft/runners/pyrunner.py b/daft/runners/pyrunner.py index e386d0ea99..3fb6b61277 100644 --- a/daft/runners/pyrunner.py +++ b/daft/runners/pyrunner.py @@ -69,8 +69,7 @@ def try_acquire(self, resource_request: ResourceRequest) -> AcquiredResources | return resources[0] if resources is not None else None def try_acquire_multiple(self, resource_requests: list[ResourceRequest]) -> list[AcquiredResources] | None: - """ - Attempts to acquire the requested resources. + """Attempts to acquire the requested resources. If the requested resources are available, returns a list of `AcquiredResources` with the amount of acquired CPUs and memory, as well as the specific GPUs that were acquired per request. @@ -166,8 +165,7 @@ def release(self, resources: AcquiredResources | list[AcquiredResources]): class PyActorSingleton: - """ - This class stores the singleton `initialized_projection` that is isolated to each Python process. It stores the projection with initialized actor pool UDF objects of a single actor. + """This class stores the singleton `initialized_projection` that is isolated to each Python process. It stores the projection with initialized actor pool UDF objects of a single actor. Currently, only one actor pool UDF per actor is supported, but we allow multiple here in case we want to support multiple actor pool UDFs in the future. @@ -447,10 +445,7 @@ def actor_pool_context( del self._actor_pools[actor_pool_id] def _create_resource_release_callback(self, resources: AcquiredResources) -> Callable[[futures.Future], None]: - """ - This higher order function is used so that the `resources` released by the callback - are from the ones stored in the variable at the creation of the callback instead of during its call. 
- """ + """This higher order function is used so that the `resources` released by the callback are from the ones stored in the variable at the creation of the callback instead of during its call.""" return lambda _: self._resources.release(resources) def _physical_plan_to_partitions( diff --git a/daft/runners/ray_metrics.py b/daft/runners/ray_metrics.py index 0b2497a082..df542446c6 100644 --- a/daft/runners/ray_metrics.py +++ b/daft/runners/ray_metrics.py @@ -26,7 +26,7 @@ class TaskEvent: @dataclasses.dataclass(frozen=True) class StartTaskEvent(TaskEvent): - """Marks the start of a task, along with available metadata""" + """Marks the start of a task, along with available metadata.""" # Start Unix timestamp start: float @@ -47,14 +47,14 @@ class StartTaskEvent(TaskEvent): @dataclasses.dataclass(frozen=True) class EndTaskEvent(TaskEvent): - """Marks the end of a task, along with available metadata""" + """Marks the end of a task, along with available metadata.""" # End Unix timestamp end: float class _NodeInfo: - """Information about nodes and their workers""" + """Information about nodes and their workers.""" def __init__(self): self.node_to_workers = {} @@ -62,7 +62,7 @@ def __init__(self): self.worker_idxs = {} def get_node_and_worker_idx(self, node_id: str, worker_id: str) -> tuple[int, int]: - """Returns a node and worker index for the provided IDs""" + """Returns a node and worker index for the provided IDs.""" # Truncate to save space node_id = node_id[:8] worker_id = worker_id[:8] @@ -80,7 +80,7 @@ def get_node_and_worker_idx(self, node_id: str, worker_id: str) -> tuple[int, in return node_idx, worker_idx def collect_node_info(self) -> dict[str, list[str]]: - """Returns a dictionary of {node_id: [worker_ids...]}""" + """Returns a dictionary of {node_id: [worker_ids...]}.""" return self.node_to_workers.copy() @@ -91,7 +91,7 @@ def __init__(self): self._node_info: dict[str, _NodeInfo] = defaultdict(_NodeInfo) def ready(self): - """Returns when the metrics actor is ready""" + """Returns when the metrics actor is ready.""" # Discussion on how to check if an actor is ready: https://github.com/ray-project/ray/issues/14923 return None @@ -106,7 +106,7 @@ def mark_task_start( ray_assigned_resources: dict, ray_task_id: str, ): - """Records a task start event""" + """Records a task start event.""" # Update node info node_idx, worker_idx = self._node_info[execution_id].get_node_and_worker_idx(node_id, worker_id) @@ -132,7 +132,7 @@ def get_task_events(self, execution_id: str, idx: int) -> tuple[list[TaskEvent], return (events[idx:], len(events)) def collect_and_close(self, execution_id: str) -> dict[str, list[str]]: - """Collect the metrics associated with this execution, cleaning up the memory used for this execution ID""" + """Collect the metrics associated with this execution, cleaning up the memory used for this execution ID.""" # Data about the available nodes and worker IDs in those nodes node_data = self._node_info[execution_id].collect_node_info() @@ -149,7 +149,7 @@ class MetricsActorHandle: actor: ray.actor.ActorHandle def wait(self) -> None: - """Call to block until the underlying actor is ready""" + """Call to block until the underlying actor is ready.""" return ray.wait([self.actor.ready.remote()], fetch_local=False) def mark_task_start( @@ -185,14 +185,14 @@ def mark_task_end( ) def get_task_events(self, idx: int) -> tuple[list[TaskEvent], int]: - """Collect task metrics from a given logical event index + """Collect task metrics from a given logical event index. 
Returns the task metrics and the new logical event index (to be used as a pagination offset token on subsequent requests) """ return ray.get(self.actor.get_task_events.remote(self.execution_id, idx)) def collect_and_close(self) -> dict[str, set[str]]: - """Collect node metrics and close the metrics actor for this execution""" + """Collect node metrics and close the metrics actor for this execution.""" return ray.get(self.actor.collect_and_close.remote(self.execution_id)) @@ -207,7 +207,7 @@ def collect_and_close(self) -> dict[str, set[str]]: def get_metrics_actor(execution_id: str) -> MetricsActorHandle: - """Retrieves a handle to the Actor for a given job_id""" + """Retrieves a handle to the Actor for a given job_id.""" with _metrics_actor_lock: actor = _MetricsActor.options( # type: ignore[attr-defined] name="daft_metrics_actor", diff --git a/daft/runners/ray_runner.py b/daft/runners/ray_runner.py index b852bfa3b0..7893fc7c0c 100644 --- a/daft/runners/ray_runner.py +++ b/daft/runners/ray_runner.py @@ -555,7 +555,7 @@ def get_metas(*partitions: MicroPartition) -> list[PartitionMetadata]: def _ray_num_cpus_provider(ttl_seconds: int = 1) -> Generator[int, None, None]: - """Helper that gets the number of CPUs from Ray + """Helper that gets the number of CPUs from Ray. Used as a generator as it provides a guard against calling ray.cluster_resources() more than once per `ttl_seconds`. @@ -578,10 +578,7 @@ def _ray_num_cpus_provider(ttl_seconds: int = 1) -> Generator[int, None, None]: class Scheduler(ActorPoolManager): def __init__(self, max_task_backlog: int | None, use_ray_tqdm: bool) -> None: - """ - max_task_backlog: Max number of inflight tasks waiting for cores. - """ - + """max_task_backlog: Max number of inflight tasks waiting for cores.""" # As of writing, Ray does not seem to be guaranteed to support # more than this number of pending scheduling tasks. # Ray has an internal proto that reports backlogged tasks [1], @@ -683,10 +680,9 @@ def _construct_dispatch_batch( dispatches_allowed: int, runner_tracer: RunnerTracer, ) -> tuple[list[PartitionTask], bool]: - """Constructs a batch of PartitionTasks that should be dispatched + """Constructs a batch of PartitionTasks that should be dispatched. Args: - execution_id: The ID of the current execution. tasks: The iterator over the physical plan. dispatches_allowed (int): The maximum number of tasks that can be dispatched in this batch. @@ -758,7 +754,7 @@ def _dispatch_tasks( daft_execution_config_objref: ray.ObjectRef, runner_tracer: RunnerTracer, ) -> Iterator[tuple[PartitionTask, list[ray.ObjectRef]]]: - """Iteratively Dispatches a batch of tasks to the Ray backend""" + """Iteratively Dispatches a batch of tasks to the Ray backend.""" with runner_tracer.dispatching(): for task in tasks_to_dispatch: if task.actor_pool_id is None: @@ -814,14 +810,15 @@ def _await_tasks( return readies def _is_active(self, execution_id: str): - """Checks if the execution for the provided `execution_id` is still active""" + """Checks if the execution for the provided `execution_id` is still active.""" return self.active_by_df.get(execution_id, False) def _place_in_queue(self, execution_id: str, item: ray.ObjectRef): - """Places a result into the queue for the provided `execution_id + """Places a result into the queue for the provided `execution_id. 
NOTE: This will block and poll busily until space is available on the queue - `""" + ` + """ while self._is_active(execution_id): try: self.results_by_df[execution_id].put(item, timeout=0.1) @@ -1113,7 +1110,7 @@ def run( class RayRoundRobinActorPool: - """Naive implementation of an ActorPool that performs round-robin task submission to the actors""" + """Naive implementation of an ActorPool that performs round-robin task submission to the actors.""" def __init__( self, diff --git a/daft/runners/ray_tracing.py b/daft/runners/ray_tracing.py index baa1d4969b..b200651a76 100644 --- a/daft/runners/ray_tracing.py +++ b/daft/runners/ray_tracing.py @@ -1,4 +1,4 @@ -"""This module contains utilities and wrappers that instrument tracing over our RayRunner's task scheduling + execution +"""This module contains utilities and wrappers that instrument tracing over our RayRunner's task scheduling + execution. These utilities are meant to provide light wrappers on top of Ray functionality (e.g. remote functions, actors, ray.get/ray.wait) which allow us to intercept these calls and perform the necessary actions for tracing the interaction between Daft and Ray. @@ -46,7 +46,7 @@ def get_daft_trace_location(log_location: pathlib.Path) -> pathlib.Path: @contextlib.contextmanager def ray_tracer(execution_id: str, daft_execution_config: PyDaftExecutionConfig) -> Iterator[RunnerTracer]: - """Instantiates a RunnerTracer for the duration of the code block""" + """Instantiates a RunnerTracer for the duration of the code block.""" # Dump the RayRunner trace if we detect an active Ray session, otherwise we give up and do not write the trace ray_logs_location = get_log_location() filepath: pathlib.Path | None @@ -82,19 +82,19 @@ def ray_tracer(execution_id: str, daft_execution_config: PyDaftExecutionConfig) @dataclasses.dataclass class TraceWriter: - """Handles writing trace events to a JSON file in Chrome Trace Event Format""" + """Handles writing trace events to a JSON file in Chrome Trace Event Format.""" file: TextIO | None start: float has_written_event: bool = False def write_header(self) -> None: - """Initialize the JSON file with an opening bracket""" + """Initialize the JSON file with an opening bracket.""" if self.file is not None: self.file.write("[") def write_metadata(self, event: dict[str, Any]) -> None: - """Write a metadata event to the trace file + """Write a metadata event to the trace file. Args: event: The metadata event to write @@ -106,7 +106,7 @@ def write_metadata(self, event: dict[str, Any]) -> None: self.has_written_event = True def write_event(self, event: dict[str, Any], ts: int | None = None) -> int: - """Write a single trace event to the file + """Write a single trace event to the file. Args: event: The event data to write @@ -308,7 +308,7 @@ def _write_process_and_thread_names( process_meta: list[tuple[int, str]], thread_meta: list[tuple[int, int, str]], ): - """Writes metadata for the file + """Writes metadata for the file. Args: process_meta: Pass in custom names for PIDs as a list of (pid, name). 
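`TraceWriter` above serializes events in the Chrome Trace Event Format named in its docstring. For orientation only, a hedged sketch of what a metadata event and a timed event in that format generally look like (the names and values here are invented, not the exact events Daft emits):

```
import json

# "ph": "M" marks a metadata event (e.g. naming a process lane); "ph": "X" is a
# complete event with a start timestamp "ts" (microseconds) and a duration "dur".
# "pid" and "tid" pick the process and thread rows the event is drawn on.
metadata_event = {"name": "process_name", "ph": "M", "pid": 1, "args": {"name": "Scheduler"}}
timed_event = {"name": "task-dispatch", "ph": "X", "ts": 0, "dur": 1500, "pid": 1, "tid": 2}

print(json.dumps([metadata_event, timed_event]))
```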
@@ -581,7 +581,7 @@ def task_received_as_ready(self, task_id: str, stage_id: int): @dataclasses.dataclass(frozen=True) class _RayFunctionWrapper: - """Wrapper around a Ray remote function that allows us to intercept calls and record the call for a given task ID""" + """Wrapper around a Ray remote function that allows us to intercept calls and record the call for a given task ID.""" f: ray.remote_function.RemoteFunction @@ -593,7 +593,7 @@ def options(self, *args, **kwargs) -> _RayFunctionWrapper: def ray_remote_traced(f: ray.remote_function.RemoteFunction): - """Decorates a Ray Remote function to ensure that we can trace it + """Decorates a Ray Remote function to ensure that we can trace it. Usage: @@ -609,7 +609,7 @@ def ray_remote_traced(f: ray.remote_function.RemoteFunction): @dataclasses.dataclass(frozen=True) class _RayRunnableFunctionWrapper: - """Runnable variant of RayFunctionWrapper that supports `.remote` calls""" + """Runnable variant of RayFunctionWrapper that supports `.remote` calls.""" f: ray.remote_function.RemoteFunction runner_tracer: RunnerTracer @@ -625,7 +625,7 @@ def remote(self, *args, **kwargs): @dataclasses.dataclass(frozen=True) class MaterializedPhysicalPlanWrapper: - """Wrapper around MaterializedPhysicalPlan that hooks into tracing capabilities""" + """Wrapper around MaterializedPhysicalPlan that hooks into tracing capabilities.""" plan: MaterializedPhysicalPlan runner_tracer: RunnerTracer @@ -654,7 +654,7 @@ def __next__(self): @contextlib.contextmanager def collect_ray_task_metrics(execution_id: str, task_id: str, stage_id: int, execution_config: PyDaftExecutionConfig): - """Context manager that will ping the metrics actor to record various execution metrics about a given task""" + """Context manager that will ping the metrics actor to record various execution metrics about a given task.""" if execution_config.enable_ray_tracing: import time diff --git a/daft/series.py b/daft/series.py index 7e6581a3c1..9be77c401a 100644 --- a/daft/series.py +++ b/daft/series.py @@ -11,9 +11,7 @@ class Series: - """ - A Daft Series is an array of data of a single type, and is usually a column in a DataFrame. - """ + """A Daft Series is an array of data of a single type, and is usually a column in a DataFrame.""" _series: PySeries @@ -28,14 +26,12 @@ def _from_pyseries(pyseries: PySeries) -> Series: @staticmethod def from_arrow(array: pa.Array | pa.ChunkedArray, name: str = "arrow_series") -> Series: - """ - Construct a Series from an pyarrow array or chunked array. + """Construct a Series from an pyarrow array or chunked array. Args: array: The pyarrow (chunked) array whose data we wish to put in the Series. name: The name associated with the Series; this is usually the column name. """ - _ensure_registered_super_ext_type() if DataType.from_arrow_type(array.type) == DataType.python(): # If the Arrow type is not natively supported, go through the Python list path. @@ -76,7 +72,6 @@ def from_pylist(data: list, name: str = "list_series", pyobj: str = "allow") -> falling back to Python type representation, or ``"force"`` the data to only have a Python type representation. Default is ``"allow"``. """ - if not isinstance(data, list): raise TypeError(f"expected a python list, got {type(data)}") @@ -98,8 +93,7 @@ def from_pylist(data: list, name: str = "list_series", pyobj: str = "allow") -> @classmethod def from_numpy(cls, data: np.ndarray, name: str = "numpy_series") -> Series: - """ - Construct a Series from a NumPy ndarray. + """Construct a Series from a NumPy ndarray. 
If the provided NumPy ndarray is 1-dimensional, Daft will attempt to store the ndarray in a pyarrow Array. If the ndarray has more than 1 dimension OR storing the 1D array in Arrow failed, @@ -125,8 +119,7 @@ def from_numpy(cls, data: np.ndarray, name: str = "numpy_series") -> Series: @classmethod def from_pandas(cls, data: pd.Series, name: str = "pd_series") -> Series: - """ - Construct a Series from a pandas Series. + """Construct a Series from a pandas Series. This will first try to convert the series into a pyarrow array, then will fall back to converting the series to a NumPy ndarray and going through that construction path, @@ -210,9 +203,7 @@ def datatype(self) -> DataType: return DataType._from_pydatatype(self._series.data_type()) def to_arrow(self) -> pa.Array: - """ - Convert this Series to an pyarrow array. - """ + """Convert this Series to an pyarrow array.""" _ensure_registered_super_ext_type() dtype = self.datatype() @@ -228,9 +219,7 @@ def to_arrow(self) -> pa.Array: return arrow_arr def to_pylist(self) -> list: - """ - Convert this Series to a Python list. - """ + """Convert this Series to a Python list.""" if self.datatype()._is_python_type(): return self._series.to_pylist() elif self.datatype()._should_cast_to_python(): @@ -340,68 +329,69 @@ def tan(self) -> Series: return Series._from_pyseries(self._series.tan()) def cot(self) -> Series: - """The elementwise cotangent of a numeric series""" + """The elementwise cotangent of a numeric series.""" return Series._from_pyseries(self._series.cot()) def arcsin(self) -> Series: - """The elementwise arc sine of a numeric series""" + """The elementwise arc sine of a numeric series.""" return Series._from_pyseries(self._series.arcsin()) def arccos(self) -> Series: - """The elementwise arc cosine of a numeric series""" + """The elementwise arc cosine of a numeric series.""" return Series._from_pyseries(self._series.arccos()) def arctan(self) -> Series: - """The elementwise arc tangent of a numeric series""" + """The elementwise arc tangent of a numeric series.""" return Series._from_pyseries(self._series.arctan()) def arctan2(self, other: Series) -> Series: - """Calculates the four quadrant arctangent of coordinates (y, x)""" + """Calculates the four quadrant arctangent of coordinates (y, x).""" if not isinstance(other, Series): raise TypeError(f"expected another Series but got {type(other)}") return Series._from_pyseries(self._series.arctan2(other._series)) def arctanh(self) -> Series: - """The elementwise inverse hyperbolic tangent of a numeric series""" + """The elementwise inverse hyperbolic tangent of a numeric series.""" return Series._from_pyseries(self._series.arctanh()) def arccosh(self) -> Series: - """The elementwise inverse hyperbolic cosine of a numeric series""" + """The elementwise inverse hyperbolic cosine of a numeric series.""" return Series._from_pyseries(self._series.arccosh()) def arcsinh(self) -> Series: - """The elementwise inverse hyperbolic sine of a numeric series""" + """The elementwise inverse hyperbolic sine of a numeric series.""" return Series._from_pyseries(self._series.arcsinh()) def radians(self) -> Series: - """The elementwise radians of a numeric series""" + """The elementwise radians of a numeric series.""" return Series._from_pyseries(self._series.radians()) def degrees(self) -> Series: - """The elementwise degrees of a numeric series""" + """The elementwise degrees of a numeric series.""" return Series._from_pyseries(self._series.degrees()) def log2(self) -> Series: - """The elementwise log2 of 
a numeric series""" + """The elementwise log2 of a numeric series.""" return Series._from_pyseries(self._series.log2()) def log10(self) -> Series: - """The elementwise log10 of a numeric series""" + """The elementwise log10 of a numeric series.""" return Series._from_pyseries(self._series.log10()) def log(self, base: float) -> Series: """The elementwise log with given base, of a numeric series. + Args: base: The base of the logarithm. """ return Series._from_pyseries(self._series.log(base)) def ln(self) -> Series: - """The elementwise ln of a numeric series""" + """The elementwise ln of a numeric series.""" return Series._from_pyseries(self._series.ln()) def exp(self) -> Series: - """The e^self of a numeric series""" + """The e^self of a numeric series.""" return Series._from_pyseries(self._series.exp()) def __add__(self, other: object) -> Series: @@ -576,8 +566,7 @@ def minhash( seed: int = 1, hash_function: Literal["murmurhash3", "xxhash", "sha1"] = "murmurhash3", ) -> Series: - """ - Runs the MinHash algorithm on the series. + """Runs the MinHash algorithm on the series. For a string, calculates the minimum hash over all its ngrams, repeating with `num_hashes` permutations. Returns as a list of 32-bit unsigned integers. diff --git a/daft/sql/_sql_funcs.py b/daft/sql/_sql_funcs.py index 030cd3b53f..5a8a1f790b 100644 --- a/daft/sql/_sql_funcs.py +++ b/daft/sql/_sql_funcs.py @@ -1,6 +1,4 @@ -"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow -Sphinx to generate documentation pages for every SQL function. -""" +"""This module is used for Sphinx documentation only. We procedurally generate Python functions to allow Sphinx to generate documentation pages for every SQL function.""" from __future__ import annotations diff --git a/daft/sql/sql.py b/daft/sql/sql.py index 76c9cc9d53..31f9016fe9 100644 --- a/daft/sql/sql.py +++ b/daft/sql/sql.py @@ -15,8 +15,7 @@ class SQLCatalog: - """ - SQLCatalog is a simple map from table names to dataframes used in query planning. + """SQLCatalog is a simple map from table names to dataframes used in query planning. EXPERIMENTAL: This features is early in development and will change. """ @@ -41,7 +40,7 @@ def _copy_from(self, other: "SQLCatalog") -> None: @PublicAPI def sql_expr(sql: str) -> Expression: - """Parses a SQL string into a Daft Expression + """Parses a SQL string into a Daft Expression. This function allows you to create Daft Expressions from SQL snippets, which can then be used in Daft operations or combined with other Daft Expressions. @@ -99,13 +98,12 @@ def sql_expr(sql: str) -> Expression: @PublicAPI def sql(sql: str, catalog: Optional[SQLCatalog] = None, register_globals: bool = True) -> DataFrame: - """Run a SQL query, returning the results as a DataFrame + """Run a SQL query, returning the results as a DataFrame. .. WARNING:: This features is early in development and will likely experience API changes. Examples: - A simple example joining 2 dataframes together using a SQL statement, relying on Daft to detect the names of SQL tables using their corresponding Python variable names. diff --git a/daft/sql/sql_scan.py b/daft/sql/sql_scan.py index 9f014bdbd1..48ee7fe4d6 100644 --- a/daft/sql/sql_scan.py +++ b/daft/sql/sql_scan.py @@ -216,7 +216,7 @@ def _get_partition_bounds(self, num_scan_tasks: int) -> list[Any]: except Exception as e: warnings.warn( - f"Failed to calculate partition bounds for read_sql using percentile strategy: {str(e)}. 
" + f"Failed to calculate partition bounds for read_sql using percentile strategy: {e!s}. " "Falling back to MIN_MAX strategy." ) self._partition_bound_strategy = PartitionBoundStrategy.MIN_MAX diff --git a/daft/table/micropartition.py b/daft/table/micropartition.py index ac8426ed73..b1c60a9c3f 100644 --- a/daft/table/micropartition.py +++ b/daft/table/micropartition.py @@ -155,7 +155,7 @@ def to_pandas( ### def cast_to_schema(self, schema: Schema) -> MicroPartition: - """Casts a MicroPartition into the provided schema""" + """Casts a MicroPartition into the provided schema.""" return MicroPartition._from_pymicropartition(self._micropartition.cast_to_schema(schema._schema)) def eval_expression_list(self, exprs: ExpressionsProjection) -> MicroPartition: @@ -250,7 +250,7 @@ def quantiles(self, num: int) -> MicroPartition: return MicroPartition._from_pymicropartition(self._micropartition.quantiles(num)) def explode(self, columns: ExpressionsProjection) -> MicroPartition: - """NOTE: Expressions here must be Explode expressions (Expression._explode())""" + """NOTE: Expressions here must be Explode expressions.""" to_explode_pyexprs = [e._expr for e in columns] return MicroPartition._from_pymicropartition(self._micropartition.explode(to_explode_pyexprs)) diff --git a/daft/table/partitioning.py b/daft/table/partitioning.py index 2333a198e5..f7b097f066 100644 --- a/daft/table/partitioning.py +++ b/daft/table/partitioning.py @@ -52,8 +52,7 @@ def _create_partitions(self): self._partitions, self._partition_values = self.table.partition_by_value(partition_keys=self.partition_keys) def partitions(self) -> List[MicroPartition]: - """ - Returns a list of MicroPartitions representing the table partitioned by the partition keys. + """Returns a list of MicroPartitions representing the table partitioned by the partition keys. If the table is not partitioned, returns the original table as the single element in the list. """ @@ -62,8 +61,7 @@ def partitions(self) -> List[MicroPartition]: return self._partitions # type: ignore def partition_values(self) -> Optional[MicroPartition]: - """ - Returns the partition values, with each row corresponding to the partition at the same index in PartitionedTable.partitions(). + """Returns the partition values, with each row corresponding to the partition at the same index in PartitionedTable.partitions(). If the table is not partitioned, returns None. @@ -73,8 +71,7 @@ def partition_values(self) -> Optional[MicroPartition]: return self._partition_values def partition_values_str(self) -> Optional[MicroPartition]: - """ - Returns the partition values converted to human-readable strings, keeping null values as null. + """Returns the partition values converted to human-readable strings, keeping null values as null. If the table is not partitioned, returns None. """ diff --git a/daft/table/schema_inference.py b/daft/table/schema_inference.py index 66b76bbbec..1d0a1bffe7 100644 --- a/daft/table/schema_inference.py +++ b/daft/table/schema_inference.py @@ -23,7 +23,8 @@ def from_csv( storage_config: StorageConfig | None = None, csv_options: TableParseCSVOptions = TableParseCSVOptions(), ) -> Schema: - """Infers a Schema from a CSV file + """Infers a Schema from a CSV file. + Args: file (str | IO): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") fs (fsspec.AbstractFileSystem): fsspec FileSystem to use for reading data. 
@@ -31,7 +32,7 @@ def from_csv( csv_options (vPartitionParseCSVOptions, optional): CSV-specific configs to apply when reading the file read_options (TableReadOptions, optional): Options for reading the file Returns: - Schema: Inferred Schema from the CSV + Schema: Inferred Schema from the CSV. """ # Have PyArrow generate the column names if user specifies that there are no headers pyarrow_autogenerate_column_names = csv_options.header_index is None @@ -77,7 +78,7 @@ def from_json( file: FileInput, storage_config: StorageConfig | None = None, ) -> Schema: - """Reads a Schema from a JSON file + """Reads a Schema from a JSON file. Args: file (FileInput): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") @@ -111,7 +112,7 @@ def from_parquet( file: FileInput, storage_config: StorageConfig | None = None, ) -> Schema: - """Infers a Schema from a Parquet file""" + """Infers a Schema from a Parquet file.""" io_config = None if storage_config is not None: config = storage_config.config diff --git a/daft/table/table.py b/daft/table/table.py index ba0868fdaf..d77fc2a3f8 100644 --- a/daft/table/table.py +++ b/daft/table/table.py @@ -157,7 +157,7 @@ def slice(self, start: int, end: int) -> Table: ### def to_table(self) -> Table: - """For compatibility with MicroPartition""" + """For compatibility with MicroPartition.""" return self def to_arrow(self) -> pa.Table: @@ -221,7 +221,7 @@ def to_pandas( ### def cast_to_schema(self, schema: Schema) -> Table: - """Casts a Table into the provided schema""" + """Casts a Table into the provided schema.""" return Table._from_pytable(self._table.cast_to_schema(schema._schema)) def eval_expression_list(self, exprs: ExpressionsProjection) -> Table: @@ -306,7 +306,7 @@ def quantiles(self, num: int) -> Table: return Table._from_pytable(self._table.quantiles(num)) def explode(self, columns: ExpressionsProjection) -> Table: - """NOTE: Expressions here must be Explode expressions (Expression._explode())""" + """NOTE: Expressions here must be Explode expressions.""" to_explode_pyexprs = [e._expr for e in columns] return Table._from_pytable(self._table.explode(to_explode_pyexprs)) diff --git a/daft/table/table_io.py b/daft/table/table_io.py index e28c069ab3..1a74e37659 100644 --- a/daft/table/table_io.py +++ b/daft/table/table_io.py @@ -70,8 +70,9 @@ def _open_stream( def _cast_table_to_schema(table: MicroPartition, read_options: TableReadOptions, schema: Schema) -> pa.Table: - """Performs a cast of a Daft MicroPartition to the requested Schema/Data. This is required because: + """Performs a cast of a Daft MicroPartition to the requested Schema/Data. + This is required because: 1. Data read from the datasource may have types that do not match the inferred global schema 2. Data read from the datasource may have columns that are out-of-order with the inferred schema 3. We may need only a subset of columns, or differently-ordered columns, in `read_options` @@ -95,7 +96,7 @@ def read_json( json_read_options: JsonReadOptions | None = None, read_options: TableReadOptions = TableReadOptions(), ) -> MicroPartition: - """Reads a MicroPartition from a JSON file + """Reads a MicroPartition from a JSON file. 
Args: file (str | IO): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") @@ -150,7 +151,7 @@ def read_parquet( read_options: TableReadOptions = TableReadOptions(), parquet_options: TableParseParquetOptions = TableParseParquetOptions(), ) -> MicroPartition: - """Reads a MicroPartition from a Parquet file + """Reads a MicroPartition from a Parquet file. Args: file (str | IO): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") @@ -234,7 +235,7 @@ def read_sql( read_options: TableReadOptions = TableReadOptions(), predicate: Expression | None = None, ) -> MicroPartition: - """Reads a MicroPartition from a SQL query + """Reads a MicroPartition from a SQL query. Args: sql (str): SQL query to execute @@ -245,7 +246,6 @@ def read_sql( Returns: MicroPartition: MicroPartition from SQL query """ - pa_table = conn.execute_sql_query(sql) mp = MicroPartition.from_arrow(pa_table) @@ -285,7 +285,7 @@ def read_csv( csv_options: TableParseCSVOptions = TableParseCSVOptions(), read_options: TableReadOptions = TableReadOptions(), ) -> MicroPartition: - """Reads a MicroPartition from a CSV file + """Reads a MicroPartition from a CSV file. Args: file (str | IO): either a file-like object or a string file path (potentially prefixed with a protocol such as "s3://") diff --git a/daft/udf.py b/daft/udf.py index beecbc7f45..ccd13ec808 100644 --- a/daft/udf.py +++ b/daft/udf.py @@ -83,7 +83,7 @@ def run_udf( py_return_dtype: PyDataType, batch_size: int | None, ) -> PySeries: - """API to call from Rust code that will call an UDF (initialized, in the case of actor pool UDFs) on the inputs""" + """API to call from Rust code that will call an UDF (initialized, in the case of actor pool UDFs) on the inputs.""" return_dtype = DataType._from_pydatatype(py_return_dtype) kwarg_keys = list(bound_args.bound_args.kwargs.keys()) arg_keys = bound_args.arg_keys() @@ -326,7 +326,6 @@ def with_concurrency(self, concurrency: int) -> UDF: """Override the concurrency of this UDF, which tells Daft how many instances of your UDF to run concurrently. Example: - >>> import daft >>> >>> @daft.udf(return_dtype=daft.DataType.string(), num_gpus=1) @@ -343,11 +342,9 @@ def with_concurrency(self, concurrency: int) -> UDF: return dataclasses.replace(self, concurrency=concurrency) def with_init_args(self, *args, **kwargs) -> UDF: - """Replace initialization arguments for a class UDF when calling `__init__` at runtime - on each instance of the UDF. + """Replace initialization arguments for a class UDF when calling `__init__` at runtime on each instance of the UDF. Example: - >>> import daft >>> >>> @daft.udf(return_dtype=daft.DataType.string()) @@ -403,7 +400,7 @@ def udf( memory_bytes: int | None = None, batch_size: int | None = None, ) -> Callable[[UserDefinedPyFuncLike], UDF]: - """`@udf` Decorator to convert a Python function/class into a `UDF` + """`@udf` Decorator to convert a Python function/class into a `UDF`. UDFs allow users to run arbitrary Python code on the outputs of Expressions. 
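For readers skimming the daft/udf.py hunks above, here is a minimal usage sketch pieced together from the doctest snippets in those docstrings; it is an assumption-laden illustration, not code from this repository, and the exact set of accepted UDF return types may differ:

```
import daft
from daft import col


@daft.udf(return_dtype=daft.DataType.int64())
def add_one(x):
    # x arrives as a daft.Series; returning a plain Python list of the same
    # length is assumed to be an accepted return type here.
    return [v + 1 for v in x.to_pylist()]


df = daft.from_pydict({"x": [1, 2, 3]})
df = df.with_column("x_plus_one", add_one(col("x")))
df.show()
```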
diff --git a/daft/udf_library/url_udfs.py b/daft/udf_library/url_udfs.py index aa97dc670f..37f4c485bc 100644 --- a/daft/udf_library/url_udfs.py +++ b/daft/udf_library/url_udfs.py @@ -17,7 +17,7 @@ def _worker_thread_initializer() -> None: - """Initializes per-thread local state""" + """Initializes per-thread local state.""" thread_local.filesystems_cache = {} @@ -47,8 +47,7 @@ def _download(path: str | None, on_error: Literal["raise", "null"]) -> bytes | N def _warmup_fsspec_registry(urls_pylist: list[str | None]) -> None: - """HACK: filesystem.get_filesystem calls fsspec.get_filesystem_class under the hood, which throws an error - if accessed concurrently for the first time. We "warm" it up in a single-threaded fashion here + """HACK: filesystem.get_filesystem calls fsspec.get_filesystem_class under the hood, which throws an error if accessed concurrently for the first time. We "warm" it up in a single-threaded fashion here. This should be fixed in the next release of FSSpec See: https://github.com/Eventual-Inc/Daft/issues/892 @@ -76,7 +75,6 @@ def download_udf( fs (fsspec.AbstractFileSystem): fsspec FileSystem to use for downloading data. By default, Daft will automatically construct a FileSystem instance internally. """ - urls_pylist = urls.to_arrow().to_pylist() _warmup_fsspec_registry(urls_pylist) diff --git a/daft/unity_catalog/unity_catalog.py b/daft/unity_catalog/unity_catalog.py index d0150fc087..f22e310193 100644 --- a/daft/unity_catalog/unity_catalog.py +++ b/daft/unity_catalog/unity_catalog.py @@ -17,7 +17,7 @@ class UnityCatalogTable: class UnityCatalog: - """Client to access the Unity Catalog + """Client to access the Unity Catalog. Unity Catalog is an open-sourced data catalog that can be self-hosted, or hosted by Databricks. diff --git a/daft/viz/html_viz_hooks.py b/daft/viz/html_viz_hooks.py index 17418d30e2..928bb831e6 100644 --- a/daft/viz/html_viz_hooks.py +++ b/daft/viz/html_viz_hooks.py @@ -14,8 +14,7 @@ def register_viz_hook(klass: type[HookClass], hook: Callable[[object], str]): - """Registers a visualization hook that returns the appropriate HTML for - visualizing a specific class in HTML""" + """Registers a visualization hook that returns the appropriate HTML for visualizing a specific class in HTML.""" _VIZ_HOOKS_REGISTRY[klass] = hook @@ -38,7 +37,7 @@ def _viz_pil_image(val: pil_image.Image) -> str: bio = io.BytesIO() img.save(bio, "JPEG") base64_img = base64.b64encode(bio.getvalue()) - return f'{str(val)}' + return f'{val!s}' register_viz_hook(pil_image.Image, _viz_pil_image) _PILLOW_REGISTERED = True diff --git a/docs/source/ext/sql_autosummary.py b/docs/source/ext/sql_autosummary.py index 5e37456cbe..14a6e04d13 100644 --- a/docs/source/ext/sql_autosummary.py +++ b/docs/source/ext/sql_autosummary.py @@ -42,7 +42,7 @@ def get_sql_func_names(): def generate_stub(name: str): - """Generates a stub string for a SQL function""" + """Generates a stub string for a SQL function.""" stub = name + "\n" stub += "=" * len(name) + "\n\n" stub += STUB_TEMPLATE.format(module_name=SQL_MODULE_NAME, name=name) diff --git a/tests/actor_pool/test_actor_cuda_devices.py b/tests/actor_pool/test_actor_cuda_devices.py index 74af05599c..2ec0648dce 100644 --- a/tests/actor_pool/test_actor_cuda_devices.py +++ b/tests/actor_pool/test_actor_cuda_devices.py @@ -19,7 +19,7 @@ @contextmanager def reset_runner_with_gpus(num_gpus, monkeypatch): - """If current runner does not have enough GPUs, create a new runner with mocked GPU resources""" + """If current runner does not have enough GPUs, 
create a new runner with mocked GPU resources.""" if len(cuda_visible_devices()) < num_gpus: if get_tests_daft_runner_name() == "ray": try: diff --git a/tests/actor_pool/test_pyactor_pool.py b/tests/actor_pool/test_pyactor_pool.py index 477fcca22f..f97b3bf8a8 100644 --- a/tests/actor_pool/test_pyactor_pool.py +++ b/tests/actor_pool/test_pyactor_pool.py @@ -39,7 +39,7 @@ def test_pyactor_pool(): final_metadata=[ppm], ) done, _ = wait([result], timeout=None) - result_data = list(done)[0].result()[0] + result_data = next(iter(done)).result()[0] assert result_data.partition().to_pydict() == {"x": [2, 2, 2]} result = pool.submit( @@ -48,7 +48,7 @@ def test_pyactor_pool(): final_metadata=[ppm], ) done, _ = wait([result], timeout=None) - result_data = list(done)[0].result()[0] + result_data = next(iter(done)).result()[0] assert result_data.partition().to_pydict() == {"x": [3, 3, 3]} result = pool.submit( @@ -57,7 +57,7 @@ def test_pyactor_pool(): final_metadata=[ppm], ) done, _ = wait([result], timeout=None) - result_data = list(done)[0].result()[0] + result_data = next(iter(done)).result()[0] assert result_data.partition().to_pydict() == {"x": [4, 4, 4]} diff --git a/tests/conftest.py b/tests/conftest.py index c72aefd3c3..74e8e1e771 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,7 +30,7 @@ def pytest_configure(config): def get_tests_daft_runner_name() -> Literal["ray"] | Literal["py"] | Literal["native"]: - """Test utility that checks the environment variable for the runner that is being used for the test""" + """Test utility that checks the environment variable for the runner that is being used for the test.""" name = os.getenv("DAFT_RUNNER") assert name is not None, "Tests must be run with $DAFT_RUNNER env var" name = name.lower() @@ -90,7 +90,7 @@ def join_strategy(request): @pytest.fixture(scope="function") def make_df(data_source, tmp_path) -> daft.Dataframe: - """Makes a dataframe when provided with data""" + """Makes a dataframe when provided with data.""" def _make_df( data: pa.Table | dict | list, diff --git a/tests/connect/conftest.py b/tests/connect/conftest.py index 7f6b05a27a..ab7e36777b 100644 --- a/tests/connect/conftest.py +++ b/tests/connect/conftest.py @@ -6,13 +6,11 @@ @pytest.fixture(scope="session") def spark_session(): - """ - Fixture to create and clean up a Spark session. + """Fixture to create and clean up a Spark session. This fixture is available to all test files and creates a single Spark session for the entire test suite run. 
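The tests/actor_pool/test_pyactor_pool.py edits above swap `list(done)[0]` for `next(iter(done))` when pulling the single ready result out of the set returned by `wait`. Both expressions pick out one element; `next(iter(...))` just stops after the first item instead of copying the whole container into a list. A small equivalence check, using a stand-in set:

```
done = {"only-result"}

# Equivalent for grabbing "an" element; next(iter(...)) avoids building a list.
assert list(done)[0] == next(iter(done)) == "only-result"
```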
""" - from daft.daft import connect_start # Start Daft Connect server diff --git a/tests/cookbook/conftest.py b/tests/cookbook/conftest.py index 265fe340a3..fb3c6b38d6 100644 --- a/tests/cookbook/conftest.py +++ b/tests/cookbook/conftest.py @@ -48,7 +48,5 @@ def service_requests_csv_pd_df(): params=[1, 2] if get_tests_daft_runner_name() != "native" else [1], ) def repartition_nparts(request): - """Adds a `n_repartitions` parameter to test cases which provides the number of - partitions that the test case should repartition its dataset into for testing - """ + """Adds a `n_repartitions` parameter to test cases which provides the number of partitions that the test case should repartition its dataset into for testing.""" return request.param diff --git a/tests/cookbook/test_aggregations.py b/tests/cookbook/test_aggregations.py index 4dcfbf0cc6..b5509e4e85 100644 --- a/tests/cookbook/test_aggregations.py +++ b/tests/cookbook/test_aggregations.py @@ -13,7 +13,7 @@ def test_sum(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Sums across an entire column for the entire table""" + """Sums across an entire column for the entire table.""" daft_df = daft_df.repartition(repartition_nparts).sum(col("Unique Key").alias("unique_key_sum")) service_requests_csv_pd_df = pd.DataFrame.from_records( [{"unique_key_sum": service_requests_csv_pd_df["Unique Key"].sum()}] @@ -23,7 +23,7 @@ def test_sum(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morse def test_approx_percentiles(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Computes approx percentile across an entire column for the entire table""" + """Computes approx percentile across an entire column for the entire table.""" daft_df = daft_df.repartition(repartition_nparts).agg( col("Unique Key").alias("unique_key_median").approx_percentiles([0.25, 0.5, 0.75]) ) @@ -38,7 +38,7 @@ def test_approx_percentiles(daft_df, service_requests_csv_pd_df, repartition_npa def test_mean(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Averages across a column for entire table""" + """Averages across a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).mean(col("Unique Key").alias("unique_key_mean")) service_requests_csv_pd_df = pd.DataFrame.from_records( [{"unique_key_mean": service_requests_csv_pd_df["Unique Key"].mean()}] @@ -48,7 +48,7 @@ def test_mean(daft_df, service_requests_csv_pd_df, repartition_nparts, with_mors def test_min(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """min across a column for entire table""" + """Min across a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).min(col("Unique Key").alias("unique_key_min")) service_requests_csv_pd_df = pd.DataFrame.from_records( [{"unique_key_min": service_requests_csv_pd_df["Unique Key"].min()}] @@ -58,7 +58,7 @@ def test_min(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morse def test_max(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """max across a column for entire table""" + """Max across a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).max(col("Unique Key").alias("unique_key_max")) service_requests_csv_pd_df = pd.DataFrame.from_records( [{"unique_key_max": service_requests_csv_pd_df["Unique Key"].max()}] @@ -68,7 +68,7 @@ def test_max(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morse def test_count(daft_df, 
service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """count a column for entire table""" + """Count a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).count(col("Unique Key").alias("unique_key_count")) service_requests_csv_pd_df = pd.DataFrame.from_records( [{"unique_key_count": service_requests_csv_pd_df["Unique Key"].count()}] @@ -79,7 +79,7 @@ def test_count(daft_df, service_requests_csv_pd_df, repartition_nparts, with_mor def test_list(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """list agg a column for entire table""" + """List agg a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).agg_list(col("Unique Key").alias("unique_key_list")).collect() unique_key_list = service_requests_csv_pd_df["Unique Key"].to_list() @@ -89,7 +89,7 @@ def test_list(daft_df, service_requests_csv_pd_df, repartition_nparts, with_mors def test_global_agg(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Averages across a column for entire table""" + """Averages across a column for entire table.""" daft_df = daft_df.repartition(repartition_nparts).agg( [ col("Unique Key").mean().alias("unique_key_mean"), @@ -113,7 +113,7 @@ def test_global_agg(daft_df, service_requests_csv_pd_df, repartition_nparts, wit def test_filtered_sum(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Sums across an entire column for the entire table filtered by a certain condition""" + """Sums across an entire column for the entire table filtered by a certain condition.""" daft_df = ( daft_df.repartition(repartition_nparts) .where(col("Borough") == "BROOKLYN") @@ -140,7 +140,7 @@ def test_filtered_sum(daft_df, service_requests_csv_pd_df, repartition_nparts, w ], ) def test_sum_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Sums across groups""" + """Sums across groups.""" daft_df = daft_df.repartition(repartition_nparts).groupby(*[col(k) for k in keys]).sum(col("Unique Key")) service_requests_csv_pd_df = service_requests_csv_pd_df.groupby(keys).sum("Unique Key").reset_index() daft_pd_df = daft_df.to_pandas() @@ -155,7 +155,7 @@ def test_sum_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, ke ], ) def test_approx_percentile_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Computes approx percentile across groups""" + """Computes approx percentile across groups.""" daft_df = ( daft_df.repartition(repartition_nparts) .groupby(*[col(k) for k in keys]) @@ -181,7 +181,7 @@ def test_approx_percentile_groupby(daft_df, service_requests_csv_pd_df, repartit ], ) def test_mean_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Sums across groups""" + """Sums across groups.""" daft_df = daft_df.repartition(repartition_nparts).groupby(*[col(k) for k in keys]).mean(col("Unique Key")) service_requests_csv_pd_df = service_requests_csv_pd_df.groupby(keys).mean("Unique Key").reset_index() daft_pd_df = daft_df.to_pandas() @@ -196,7 +196,7 @@ def test_mean_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, k ], ) def test_count_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """count across groups""" + """Count across groups.""" daft_df = daft_df.repartition(repartition_nparts).groupby(*[col(k) for k in keys]).count() service_requests_csv_pd_df = 
service_requests_csv_pd_df.groupby(keys).count().reset_index() for cname in service_requests_csv_pd_df: @@ -214,7 +214,7 @@ def test_count_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, ], ) def test_min_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """min across groups""" + """Min across groups.""" daft_df = ( daft_df.repartition(repartition_nparts) .groupby(*[col(k) for k in keys]) @@ -235,7 +235,7 @@ def test_min_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, ke ], ) def test_max_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """max across groups""" + """Max across groups.""" daft_df = ( daft_df.repartition(repartition_nparts) .groupby(*[col(k) for k in keys]) @@ -256,7 +256,7 @@ def test_max_groupby(daft_df, service_requests_csv_pd_df, repartition_nparts, ke ], ) def test_sum_groupby_sorted(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Test sorting after a groupby""" + """Test sorting after a groupby.""" daft_df = ( daft_df.repartition(repartition_nparts) .groupby(*[col(k) for k in keys]) @@ -278,7 +278,7 @@ def test_sum_groupby_sorted(daft_df, service_requests_csv_pd_df, repartition_npa ], ) def test_map_groups(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Test map_groups""" + """Test map_groups.""" @udf(return_dtype=DataType.float64()) def average_resolution_time(created_date, closed_date): diff --git a/tests/cookbook/test_computations.py b/tests/cookbook/test_computations.py index 6a676b5c9d..11999f016f 100644 --- a/tests/cookbook/test_computations.py +++ b/tests/cookbook/test_computations.py @@ -5,7 +5,7 @@ def test_add_one_to_column(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.repartition(repartition_nparts).with_column("unique_key_mod", col("Unique Key") + 1) service_requests_csv_pd_df["unique_key_mod"] = service_requests_csv_pd_df["Unique Key"] + 1 daft_pd_df = daft_df.to_pandas() @@ -13,7 +13,7 @@ def test_add_one_to_column(daft_df, service_requests_csv_pd_df, repartition_npar def test_add_one_to_column_name_override(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = ( daft_df.repartition(repartition_nparts) .with_column("Unique Key", col("Unique Key") + 1) @@ -25,7 +25,7 @@ def test_add_one_to_column_name_override(daft_df, service_requests_csv_pd_df, re def test_add_one_to_column_limit(daft_df, service_requests_csv_pd_df, with_morsel_size): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.with_column("unique_key_mod", col("Unique Key") + 1).limit(10) service_requests_csv_pd_df["unique_key_mod"] = service_requests_csv_pd_df["Unique Key"] + 1 service_requests_csv_pd_df = service_requests_csv_pd_df.head(10) @@ -34,7 +34,7 @@ def test_add_one_to_column_limit(daft_df, service_requests_csv_pd_df, with_morse def 
test_add_one_twice_to_column(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.repartition(repartition_nparts).with_column("unique_key_mod", col("Unique Key") + 1) daft_df = daft_df.with_column("unique_key_mod_second", col("unique_key_mod") + 1) service_requests_csv_pd_df["unique_key_mod"] = service_requests_csv_pd_df["Unique Key"] + 1 @@ -44,7 +44,7 @@ def test_add_one_twice_to_column(daft_df, service_requests_csv_pd_df, repartitio def test_difference_cols(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Creating a new column that is derived from 2 other columns and retrieving the top N results""" + """Creating a new column that is derived from 2 other columns and retrieving the top N results.""" daft_df = daft_df.repartition(repartition_nparts).with_column( "unique_key_mod", col("Unique Key") - col("Unique Key") ) diff --git a/tests/cookbook/test_count_rows.py b/tests/cookbook/test_count_rows.py index 2b0b36d257..4db1619f44 100644 --- a/tests/cookbook/test_count_rows.py +++ b/tests/cookbook/test_count_rows.py @@ -6,13 +6,13 @@ def test_count_rows(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Count rows for the entire table""" + """Count rows for the entire table.""" daft_df_row_count = daft_df.repartition(repartition_nparts).count_rows() assert daft_df_row_count == service_requests_csv_pd_df.shape[0] def test_dataframe_count_no_args(daft_df, service_requests_csv_pd_df): - """Counts rows using `df.count()` without any arguments""" + """Counts rows using `df.count()` without any arguments.""" results = daft_df.count().to_pydict() assert "count" in results assert len(results["count"]) == 1 @@ -20,7 +20,7 @@ def test_dataframe_count_no_args(daft_df, service_requests_csv_pd_df): def test_filtered_count_rows(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Count rows on a table filtered by a certain condition""" + """Count rows on a table filtered by a certain condition.""" daft_df_row_count = daft_df.repartition(repartition_nparts).where(col("Borough") == "BROOKLYN").count_rows() pd_df_row_count = len(service_requests_csv_pd_df[service_requests_csv_pd_df["Borough"] == "BROOKLYN"]) @@ -35,19 +35,19 @@ def test_filtered_count_rows(daft_df, service_requests_csv_pd_df, repartition_np ], ) def test_groupby_count_rows(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Count rows after group by""" + """Count rows after group by.""" daft_df = daft_df.repartition(repartition_nparts).groupby(*[col(k) for k in keys]).sum(col("Unique Key")) service_requests_csv_pd_df = service_requests_csv_pd_df.groupby(keys).sum("Unique Key").reset_index() assert daft_df.count_rows() == len(service_requests_csv_pd_df) def test_dataframe_length_after_collect(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Count rows after group by""" + """Count rows after group by.""" daft_df = daft_df.repartition(repartition_nparts).collect() assert len(daft_df) == len(service_requests_csv_pd_df) def test_dataframe_length_before_collect(daft_df): - """Count rows for the entire table""" + """Count rows for the entire table.""" with pytest.raises(RuntimeError): len(daft_df) diff --git a/tests/cookbook/test_dataloading.py 
b/tests/cookbook/test_dataloading.py index d75497b2fe..8e873fad23 100644 --- a/tests/cookbook/test_dataloading.py +++ b/tests/cookbook/test_dataloading.py @@ -11,7 +11,7 @@ def test_load(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Loading data from a CSV or Parquet works""" + """Loading data from a CSV or Parquet works.""" pd_slice = service_requests_csv_pd_df daft_slice = daft_df.repartition(repartition_nparts) daft_pd_df = daft_slice.to_pandas() @@ -19,7 +19,7 @@ def test_load(daft_df, service_requests_csv_pd_df, repartition_nparts, with_mors def test_load_csv_no_headers(tmp_path: pathlib.Path): - """Generate a default set of headers `f0, f1, ... f{n}` when loading a CSV that has no headers""" + """Generate a default set of headers `f0, f1, ... f{n}` when loading a CSV that has no headers.""" csv = tmp_path / "headerless_iris.csv" csv.write_text("\n".join(pathlib.Path(COOKBOOK_DATA_CSV).read_text().split("\n")[1:])) daft_df = daft.read_csv(str(csv), has_headers=False) @@ -30,7 +30,7 @@ def test_load_csv_no_headers(tmp_path: pathlib.Path): def test_load_csv_tab_delimited(tmp_path: pathlib.Path): - """Generate a default set of headers `col_0, col_1, ... col_{n}` when loading a CSV that has no headers""" + """Generate a default set of headers `col_0, col_1, ... col_{n}` when loading a CSV that has no headers.""" csv = tmp_path / "headerless_iris.csv" csv.write_text(pathlib.Path(COOKBOOK_DATA_CSV).read_text().replace(",", "\t")) daft_df = daft.read_csv(str(csv), delimiter="\t") @@ -40,7 +40,7 @@ def test_load_csv_tab_delimited(tmp_path: pathlib.Path): def test_load_json(tmp_path: pathlib.Path): - """Generate a default set of headers `col_0, col_1, ... col_{n}` when loading a JSON file""" + """Generate a default set of headers `col_0, col_1, ... 
col_{n}` when loading a JSON file.""" json_file = tmp_path / "iris.json" pd_df = pd.read_csv(COOKBOOK_DATA_CSV) pd_df.to_json(json_file, lines=True, orient="records") diff --git a/tests/cookbook/test_distinct.py b/tests/cookbook/test_distinct.py index 814c295b47..94ff5ccf9f 100644 --- a/tests/cookbook/test_distinct.py +++ b/tests/cookbook/test_distinct.py @@ -14,7 +14,7 @@ ], ) def test_distinct_all_columns(daft_df, service_requests_csv_pd_df, repartition_nparts, keys, with_morsel_size): - """Sums across groups""" + """Sums across groups.""" daft_df = daft_df.repartition(repartition_nparts).select(*[col(k) for k in keys]).distinct() service_requests_csv_pd_df = ( diff --git a/tests/cookbook/test_filter.py b/tests/cookbook/test_filter.py index 26320bc958..25291667ae 100644 --- a/tests/cookbook/test_filter.py +++ b/tests/cookbook/test_filter.py @@ -37,8 +37,7 @@ ], ) def test_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Filter the dataframe, retrieve the top N results and select a subset of columns""" - + """Filter the dataframe, retrieve the top N results and select a subset of columns.""" daft_noise_complaints = daft_df_ops(daft_df.repartition(repartition_nparts)) pd_noise_complaints = service_requests_csv_pd_df[ @@ -105,7 +104,7 @@ def test_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repartition_np ], ) def test_complex_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Filter the dataframe with a complex filter and select a subset of columns""" + """Filter the dataframe with a complex filter and select a subset of columns.""" daft_noise_complaints_brooklyn = daft_df_ops(daft_df.repartition(repartition_nparts)) pd_noise_complaints_brooklyn = service_requests_csv_pd_df[ @@ -164,7 +163,7 @@ def test_complex_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repart ], ) def test_chain_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Filter the dataframe with a chain of filters and select a subset of columns""" + """Filter the dataframe with a chain of filters and select a subset of columns.""" daft_noise_complaints_brooklyn = daft_df_ops(daft_df.repartition(repartition_nparts)) pd_noise_complaints_brooklyn = service_requests_csv_pd_df @@ -179,7 +178,7 @@ def test_chain_filter(daft_df_ops, daft_df, service_requests_csv_pd_df, repartit def test_filter_on_projection(): - """Filter the dataframe with on top of a projection""" + """Filter the dataframe with on top of a projection.""" df = daft.from_pydict({"x": [1, 1, 1, 1, 1]}) df = df.select(col("x") * 2) df = df.where(col("x") == 1) diff --git a/tests/cookbook/test_literals.py b/tests/cookbook/test_literals.py index f652136439..df1900461e 100644 --- a/tests/cookbook/test_literals.py +++ b/tests/cookbook/test_literals.py @@ -7,7 +7,7 @@ def test_literal_column(daft_df, service_requests_csv_pd_df): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.with_column("literal_col", lit(1)) daft_pd_df = daft_df.to_pandas() service_requests_csv_pd_df["literal_col"] = 1 @@ -16,7 +16,7 @@ def test_literal_column(daft_df, service_requests_csv_pd_df): def test_literal_column_computation(daft_df, service_requests_csv_pd_df): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N 
results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.with_column("literal_col", lit(1) + 1) daft_pd_df = daft_df.to_pandas() service_requests_csv_pd_df["literal_col"] = 1 + 1 @@ -25,7 +25,7 @@ def test_literal_column_computation(daft_df, service_requests_csv_pd_df): def test_literal_column_aggregation(daft_df, service_requests_csv_pd_df): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.repartition(2).groupby("Borough").agg(col("Unique Key").sum()) daft_df = daft_df.with_column("literal_col", lit(1) + 1) daft_pd_df = daft_df.to_pandas() @@ -37,7 +37,7 @@ def test_literal_column_aggregation(daft_df, service_requests_csv_pd_df): def test_pyobj_literal_column(daft_df, service_requests_csv_pd_df): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.with_column("literal_col", lit({"foo": "bar"})) daft_pd_df = daft_df.to_pandas() service_requests_csv_pd_df["literal_col"] = pd.Series( @@ -47,7 +47,7 @@ def test_pyobj_literal_column(daft_df, service_requests_csv_pd_df): def test_literal_column_computation_apply(daft_df, service_requests_csv_pd_df): - """Creating a new column that is derived from (1 + other_column) and retrieving the top N results""" + """Creating a new column that is derived from (1 + other_column) and retrieving the top N results.""" daft_df = daft_df.with_column( "literal_col", lit({"foo": "bar"}).apply(lambda d: d["foo"], return_dtype=DataType.string()) ) diff --git a/tests/cookbook/test_pandas_cookbook.py b/tests/cookbook/test_pandas_cookbook.py index 2e967ec4b1..9ee1613b71 100644 --- a/tests/cookbook/test_pandas_cookbook.py +++ b/tests/cookbook/test_pandas_cookbook.py @@ -1,4 +1,4 @@ -"""This module tests examples from https://pandas.pydata.org/docs/user_guide/cookbook.html""" +"""This module tests examples from https://pandas.pydata.org/docs/user_guide/cookbook.html.""" from __future__ import annotations diff --git a/tests/cookbook/test_sorting.py b/tests/cookbook/test_sorting.py index 83a3e6928b..65ea75cacb 100644 --- a/tests/cookbook/test_sorting.py +++ b/tests/cookbook/test_sorting.py @@ -8,7 +8,7 @@ def test_sorted_by_expr(daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size): - """Sort by a column that undergoes an expression""" + """Sort by a column that undergoes an expression.""" daft_df = daft_df.repartition(repartition_nparts) daft_sorted_df = daft_df.sort(((col("Unique Key") % 2) == 0).if_else(col("Unique Key"), col("Unique Key") * -1)) daft_sorted_pd_df = daft_sorted_df.to_pandas() @@ -37,7 +37,7 @@ def test_sorted_by_expr(daft_df, service_requests_csv_pd_df, repartition_nparts, ], ) def test_get_sorted(daft_df, service_requests_csv_pd_df, repartition_nparts, sort_keys, with_morsel_size): - """Sort by a column""" + """Sort by a column.""" daft_df = daft_df.repartition(repartition_nparts) daft_sorted_df = daft_df.sort([col(k) for k in sort_keys], desc=True) daft_sorted_pd_df = daft_sorted_df.to_pandas() @@ -56,7 +56,7 @@ def test_get_sorted(daft_df, service_requests_csv_pd_df, repartition_nparts, sor ], ) def test_get_sorted_top_n(daft_df, service_requests_csv_pd_df, repartition_nparts, sort_keys, with_morsel_size): - 
"""Sort by a column""" + """Sort by a column.""" daft_df = daft_df.repartition(repartition_nparts) daft_sorted_df = daft_df.sort([col(k) for k in sort_keys], desc=True).limit(100) daft_sorted_pd_df = daft_sorted_df.to_pandas() @@ -77,7 +77,7 @@ def test_get_sorted_top_n(daft_df, service_requests_csv_pd_df, repartition_npart def test_get_sorted_top_n_flipped_desc( daft_df, service_requests_csv_pd_df, repartition_nparts, sort_keys, with_morsel_size ): - """Sort by a column""" + """Sort by a column.""" daft_df = daft_df.repartition(repartition_nparts) desc_list = [True] for i in range(len(sort_keys) - 1): @@ -108,7 +108,7 @@ def test_get_sorted_top_n_flipped_desc( def test_get_sorted_top_n_projected( daft_df_ops, daft_df, service_requests_csv_pd_df, repartition_nparts, with_morsel_size ): - """Sort by a column and retrieve specific columns from the top N results""" + """Sort by a column and retrieve specific columns from the top N results.""" daft_df = daft_df.repartition(repartition_nparts) expected = service_requests_csv_pd_df.sort_values(by="Unique Key", ascending=False)[ ["Unique Key", "Complaint Type"] diff --git a/tests/dataframe/test_creation.py b/tests/dataframe/test_creation.py index 9265c405cd..4772919c14 100644 --- a/tests/dataframe/test_creation.py +++ b/tests/dataframe/test_creation.py @@ -51,7 +51,7 @@ def __eq__(self, other: MyObjWithValue): @pytest.mark.parametrize("read_method", ["read_csv", "read_json", "read_parquet"]) def test_load_missing(read_method): - """Loading data from a missing filepath""" + """Loading data from a missing filepath.""" with pytest.raises(FileNotFoundError): getattr(daft, read_method)(str(uuid.uuid4())) diff --git a/tests/dataframe/test_shuffles.py b/tests/dataframe/test_shuffles.py index 9051d2cb9f..4b7cae6cb9 100644 --- a/tests/dataframe/test_shuffles.py +++ b/tests/dataframe/test_shuffles.py @@ -33,9 +33,7 @@ def generator( @pytest.fixture(scope="function") def pre_shuffle_merge_ctx(): - """ - Fixture that provides a context manager for pre-shuffle merge testing. 
- """ + """Fixture that provides a context manager for pre-shuffle merge testing.""" def _ctx(threshold: int | None = None): return daft.execution_config_ctx(shuffle_algorithm="pre_shuffle_merge", pre_shuffle_merge_threshold=threshold) @@ -52,9 +50,7 @@ def _ctx(threshold: int | None = None): [(100, 100), (100, 1), (100, 50), (100, 200)], ) def test_pre_shuffle_merge_small_partitions(pre_shuffle_merge_ctx, input_partitions, output_partitions): - """ - Test that pre-shuffle merge is working for small partitions less than the memory threshold - """ + """Test that pre-shuffle merge is working for small partitions less than the memory threshold.""" def num_rows_fn(): return output_partitions @@ -90,9 +86,7 @@ def bytes_per_row_fn(): [(100, 100), (100, 1), (100, 50), (100, 200)], ) def test_pre_shuffle_merge_big_partitions(pre_shuffle_merge_ctx, input_partitions, output_partitions): - """ - Test that pre-shuffle merge is working for big partitions greater than the threshold - """ + """Test that pre-shuffle merge is working for big partitions greater than the threshold.""" def num_rows_fn(): return output_partitions @@ -128,9 +122,7 @@ def bytes_per_row_fn(): [(100, 100), (100, 1), (100, 50), (100, 200)], ) def test_pre_shuffle_merge_randomly_sized_partitions(pre_shuffle_merge_ctx, input_partitions, output_partitions): - """ - Test that pre-shuffle merge is working for randomly sized partitions - """ + """Test that pre-shuffle merge is working for randomly sized partitions.""" def num_rows_fn(): return output_partitions diff --git a/tests/expressions/test_udf.py b/tests/expressions/test_udf.py index 13b35d3bbf..0fad641939 100644 --- a/tests/expressions/test_udf.py +++ b/tests/expressions/test_udf.py @@ -342,7 +342,7 @@ def test_udf_arbitrary_number_of_kwargs(batch_size): @udf(return_dtype=DataType.string(), batch_size=batch_size) def repeat_kwargs(**kwargs): data = {k: v.to_pylist() for k, v in kwargs.items()} - length = len(data[list(data.keys())[0]]) + length = len(data[next(iter(data.keys()))]) return Series.from_pylist(["".join([key * data[key][i] for key in data]) for i in range(length)]) expr = repeat_kwargs(a=col("a"), b=col("b"), c=col("c")) diff --git a/tests/expressions/typing/conftest.py b/tests/expressions/typing/conftest.py index 5bc3b21ae4..ee5222b8a0 100644 --- a/tests/expressions/typing/conftest.py +++ b/tests/expressions/typing/conftest.py @@ -115,7 +115,7 @@ ids=[f"{dt1}-{dt2}-{dt3}" for (dt1, _), (dt2, _), (dt3, _) in ALL_DATATYPES_TERNARY_PAIRS], ) def ternary_data_fixture(request) -> tuple[Series, Series, Series]: - """Returns ternary permutation of Series' of all DataType pairs""" + """Returns ternary permutation of Series' of all DataType pairs.""" (dt1, data1), (dt2, data2), (dt3, data3) = request.param s1 = Series.from_arrow(data1, name="first") assert s1.datatype() == dt1 @@ -132,7 +132,7 @@ def ternary_data_fixture(request) -> tuple[Series, Series, Series]: ids=[f"{dt1}-{dt2}" for (dt1, _), (dt2, _) in ALL_DATATYPES_BINARY_PAIRS], ) def binary_data_fixture(request) -> tuple[Series, Series]: - """Returns binary permutation of Series' of all DataType pairs""" + """Returns binary permutation of Series' of all DataType pairs.""" (dt1, data1), (dt2, data2) = request.param s1 = Series.from_arrow(data1, name="lhs") assert s1.datatype() == dt1 @@ -147,7 +147,7 @@ def binary_data_fixture(request) -> tuple[Series, Series]: ids=[f"{dt}" for (dt, _) in ALL_DTYPES], ) def unary_data_fixture(request) -> Series: - """Returns unary permutation of Series' of all DataType pairs""" + 
"""Returns unary permutation of Series' of all DataType pairs.""" (dt, data) = request.param s = Series.from_arrow(data, name="arg") assert s.datatype() == dt @@ -160,7 +160,7 @@ def unary_data_fixture(request) -> Series: ids=[f"{dt}" for (dt, _) in DECIMAL_DTYPES], ) def decimal_unary_data_fixture(request) -> Series: - """Returns unary permutation of Series' of select decimal DataType pairs""" + """Returns unary permutation of Series' of select decimal DataType pairs.""" (dt, data) = request.param s = Series.from_arrow(data, name="arg") assert s.datatype() == dt @@ -173,7 +173,7 @@ def assert_typing_resolve_vs_runtime_behavior( run_kernel: Callable[[], Series], resolvable: bool, ): - """Asserts that typing behavior during schema resolution matches behavior during runtime on Series' + """Asserts that typing behavior during schema resolution matches behavior during runtime on Series'. Example Usage: @@ -211,12 +211,12 @@ def assert_typing_resolve_vs_runtime_behavior( def is_numeric_or_null(dt: DataType) -> bool: - """Checks if this type is a numeric or null type""" + """Checks if this type is a numeric or null type.""" return dt == DataType.null() or is_numeric(dt) def is_numeric(dt: DataType) -> bool: - """Checks if this type is a numeric type""" + """Checks if this type is a numeric type.""" return ( dt == DataType.int8() or dt == DataType.int16() @@ -232,7 +232,7 @@ def is_numeric(dt: DataType) -> bool: def is_integer(dt: DataType) -> bool: - """Checks if this type is an integer type""" + """Checks if this type is an integer type.""" return ( dt == DataType.int8() or dt == DataType.int16() @@ -246,12 +246,12 @@ def is_integer(dt: DataType) -> bool: def is_signed_integer(dt: DataType) -> bool: - """Checks if this type is a signed integer type""" + """Checks if this type is a signed integer type.""" return dt == DataType.int8() or dt == DataType.int16() or dt == DataType.int32() or dt == DataType.int64() def is_comparable(dt: DataType): - """Checks if this type is a comparable type""" + """Checks if this type is a comparable type.""" return ( is_numeric(dt) or dt == DataType.bool() @@ -264,7 +264,7 @@ def is_comparable(dt: DataType): def is_numeric_bitwidth_gte_32(dt: DataType): - """Checks if type is numeric and above a bitwidth of 32""" + """Checks if type is numeric and above a bitwidth of 32.""" return ( dt == DataType.int32() or dt == DataType.int64() @@ -276,7 +276,9 @@ def is_numeric_bitwidth_gte_32(dt: DataType): def has_supertype(dt1: DataType, dt2: DataType) -> bool: - """Checks if two DataTypes have supertypes - note that this is a simplified + """Checks if two DataTypes have supertypes. + + this is a simplified version of `supertype.rs`, since it only defines "reachability" within the supertype tree in a more human-readable way for testing purposes. """ diff --git a/tests/expressions/typing/test_aggs.py b/tests/expressions/typing/test_aggs.py index 2536c8bb44..9dc945dbf1 100644 --- a/tests/expressions/typing/test_aggs.py +++ b/tests/expressions/typing/test_aggs.py @@ -45,9 +45,10 @@ def test_numeric_aggs(unary_data_fixture, op): def test_decimal_sum(decimal_unary_data_fixture): - """a copy of the above but for decimal types that do not more widely support + """A copy of the above but for decimal types that do not more widely support numeric operations. When they do and can be added to ALL_DTYPES and resolve - is_numeric to True, this test can be removed.""" + is_numeric to True, this test can be removed. 
+ """ # noqa: D205 arg = decimal_unary_data_fixture def op(x): diff --git a/tests/expressions/typing/test_arithmetic.py b/tests/expressions/typing/test_arithmetic.py index c10ddb20fd..189d8b96d2 100644 --- a/tests/expressions/typing/test_arithmetic.py +++ b/tests/expressions/typing/test_arithmetic.py @@ -17,8 +17,7 @@ def plus_type_validation(lhs: DataType, rhs: DataType) -> bool: - """Checks whether these input types are resolvable for the + operation""" - + """Checks whether these input types are resolvable for the + operation.""" # Plus only works for certain types for arg in (lhs, rhs): if not (is_numeric(arg) or (arg == DataType.string()) or (arg == DataType.bool()) or (arg == DataType.null())): @@ -43,7 +42,7 @@ def test_plus(binary_data_fixture): def binary_numeric_arithmetic_type_validation(lhs: DataType, rhs: DataType, op: ops) -> bool: - """Checks whether these input types are resolvable for arithmetic operations""" + """Checks whether these input types are resolvable for arithmetic operations.""" # (temporal - temporal = duration) if lhs._is_temporal_type() and rhs._is_temporal_type() and lhs == rhs and op == ops.sub: return True diff --git a/tests/integration/iceberg/test_partition_pruning.py b/tests/integration/iceberg/test_partition_pruning.py index 05d4059f39..a4a70d3f2b 100644 --- a/tests/integration/iceberg/test_partition_pruning.py +++ b/tests/integration/iceberg/test_partition_pruning.py @@ -213,7 +213,7 @@ def test_daft_iceberg_table_predicate_pushdown_on_number(predicate, table, limit @pytest.mark.integration() def test_daft_iceberg_table_predicate_pushdown_empty_scan(local_iceberg_catalog): - catalog_name, pyiceberg_catalog = local_iceberg_catalog + _, pyiceberg_catalog = local_iceberg_catalog tab = pyiceberg_catalog.load_table("default.test_partitioned_by_months") df = daft.read_iceberg(tab) df = df.where(df["dt"] > date(2030, 1, 1)) diff --git a/tests/integration/io/conftest.py b/tests/integration/io/conftest.py index 3a9e577a9a..9dcfac36d2 100644 --- a/tests/integration/io/conftest.py +++ b/tests/integration/io/conftest.py @@ -71,7 +71,7 @@ def azure_storage_public_config() -> daft.io.IOConfig: @pytest.fixture(scope="session") def nginx_config() -> tuple[str, pathlib.Path]: - """Returns the (nginx_server_url, static_files_tmpdir) as a tuple""" + """Returns the (nginx_server_url, static_files_tmpdir) as a tuple.""" return ( "http://127.0.0.1:8080", pathlib.Path("/tmp/daft-integration-testing/nginx"), @@ -80,7 +80,7 @@ def nginx_config() -> tuple[str, pathlib.Path]: @pytest.fixture(scope="session", params=["standard", "adaptive"], ids=["standard", "adaptive"]) def retry_server_s3_config(request) -> daft.io.IOConfig: - """Returns the URL to the local retry_server fixture""" + """Returns the URL to the local retry_server fixture.""" retry_mode = request.param return daft.io.IOConfig( s3=daft.io.S3Config(endpoint_url="http://127.0.0.1:8001", anonymous=True, num_tries=10, retry_mode=retry_mode) @@ -96,7 +96,7 @@ def retry_server_s3_config(request) -> daft.io.IOConfig: def minio_create_bucket( minio_io_config: daft.io.IOConfig, bucket_name: str = "my-minio-bucket" ) -> YieldFixture[list[str]]: - """Creates a bucket in MinIO + """Creates a bucket in MinIO. Yields a s3fs FileSystem """ @@ -118,7 +118,7 @@ def minio_create_bucket( def mount_data_minio( minio_io_config: daft.io.IOConfig, folder: pathlib.Path, bucket_name: str = "my-minio-bucket" ) -> YieldFixture[list[str]]: - """Mounts data in `folder` into files in minio + """Mounts data in `folder` into files in minio. 
Yields a list of S3 URLs """ @@ -137,7 +137,7 @@ def mount_data_minio( @contextlib.contextmanager def mount_data_nginx(nginx_config: tuple[str, pathlib.Path], folder: pathlib.Path) -> YieldFixture[list[str]]: - """Mounts data in `folder` into servable static files in NGINX + """Mounts data in `folder` into servable static files in NGINX. Yields a list of HTTP URLs """ @@ -178,7 +178,7 @@ def mount_data_nginx(nginx_config: tuple[str, pathlib.Path], folder: pathlib.Pat @pytest.fixture(scope="session") def image_data() -> YieldFixture[bytes]: - """Bytes of a small image""" + """Bytes of a small image.""" bio = io.BytesIO() image = Image.fromarray(np.ones((3, 3)).astype(np.uint8)) image.save(bio, format="JPEG") @@ -187,7 +187,7 @@ def image_data() -> YieldFixture[bytes]: @pytest.fixture(scope="function") def image_data_folder(image_data, tmpdir) -> YieldFixture[str]: - """Dumps 10 small JPEG files into a tmpdir""" + """Dumps 10 small JPEG files into a tmpdir.""" tmpdir = pathlib.Path(tmpdir) for i in range(10): @@ -201,7 +201,7 @@ def image_data_folder(image_data, tmpdir) -> YieldFixture[str]: def mock_http_image_urls( nginx_config: tuple[str, pathlib.Path], image_data_folder: pathlib.Path ) -> YieldFixture[list[str]]: - """Uses the docker-compose Nginx server to serve HTTP image URLs + """Uses the docker-compose Nginx server to serve HTTP image URLs. This fixture yields: list[str]: URLs of files available on the HTTP server @@ -212,14 +212,14 @@ def mock_http_image_urls( @pytest.fixture(scope="function") def minio_image_data_fixture(minio_io_config, image_data_folder) -> YieldFixture[list[str]]: - """Populates the minio session with some fake data and yields (S3Config, paths)""" + """Populates the minio session with some fake data and yields (S3Config, paths).""" with mount_data_minio(minio_io_config, image_data_folder) as urls: yield urls @pytest.fixture(scope="session") def small_images_s3_paths() -> list[str]: - """Paths to small *.jpg files in a public S3 bucket""" + """Paths to small *.jpg files in a public S3 bucket.""" return [f"s3://daft-public-data/test_fixtures/small_images/rickroll{i}.jpg" for i in range(6)] + [ f"s3a://daft-public-data/test_fixtures/small_images/rickroll{i}.jpg" for i in range(6) ] diff --git a/tests/integration/io/docker-compose/retry_server/main.py b/tests/integration/io/docker-compose/retry_server/main.py index e66739fc4e..e382bc3f04 100644 --- a/tests/integration/io/docker-compose/retry_server/main.py +++ b/tests/integration/io/docker-compose/retry_server/main.py @@ -1,4 +1,4 @@ -"""This file defines a FastAPI server that emulates an S3 implementation that serves Parquet files +"""This file defines a FastAPI server that emulates an S3 implementation that serves Parquet files. 
This S3 implementation serves a small Parquet file at the location: `s3://{bucket-name}/{status_code}/{num_errors}/{item_id}` diff --git a/tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py b/tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py index c57cf30c4b..9c0a24a0b4 100644 --- a/tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py +++ b/tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py @@ -22,7 +22,7 @@ async def retryable_bucket_head( request: Request, status_code: int, status_code_str: str, num_errors: int, item_id: str ): - """Reading of Parquet starts with a head request, which potentially must be retried as well""" + """Reading of Parquet starts with a head request, which potentially must be retried as well.""" key = item_id if key not in ITEM_ID_TO_NUM_RETRIES: ITEM_ID_TO_NUM_RETRIES[key] = 1 diff --git a/tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py b/tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py index 1c34142cfb..1cfc3f8890 100644 --- a/tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py +++ b/tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py @@ -26,7 +26,7 @@ def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded) -> Res @app.get(OBJECT_KEY_URL) @limiter.shared_limit(limit_value="80/second", scope="my_shared_limit") async def rate_limited_bucket_get(request: Request, item_id: str): - """This endpoint will just echo the `item_id` and return that as the response body""" + """This endpoint will just echo the `item_id` and return that as the response body.""" result = item_id.encode("utf-8") return Response( status_code=200, diff --git a/tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py b/tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py index e93a9b43c3..2c4744811e 100644 --- a/tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py +++ b/tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py @@ -7,7 +7,7 @@ def generate_parquet_file(): - """Generate a small Parquet file and return the path to the file""" + """Generate a small Parquet file and return the path to the file.""" tmpfile = tempfile.NamedTemporaryFile() tbl = pa.Table.from_pydict({"foo": [1, 2, 3]}) papq.write_table(tbl, tmpfile.name) diff --git a/tests/integration/io/test_url_download_private_aws_s3.py b/tests/integration/io/test_url_download_private_aws_s3.py index ccf91edd1e..8d0f795c74 100644 --- a/tests/integration/io/test_url_download_private_aws_s3.py +++ b/tests/integration/io/test_url_download_private_aws_s3.py @@ -9,7 +9,7 @@ @pytest.fixture(scope="session") def io_config(pytestconfig) -> IOConfig: - """Create IOConfig with botocore's current session""" + """Create IOConfig with botocore's current session.""" if pytestconfig.getoption("--credentials") is not True: pytest.skip("Test can only run in a credentialled environment, and when run with the `--credentials` flag") diff --git a/tests/io/delta_lake/test_table_write.py b/tests/io/delta_lake/test_table_write.py index c3076a92c6..098caac34a 100644 --- a/tests/io/delta_lake/test_table_write.py +++ b/tests/io/delta_lake/test_table_write.py @@ -60,7 +60,7 @@ def test_deltalake_multi_write_basic(tmp_path, base_table): def 
test_deltalake_write_cloud(base_table, cloud_paths): deltalake = pytest.importorskip("deltalake") - path, io_config, catalog_table = cloud_paths + path, io_config, _ = cloud_paths df = daft.from_arrow(base_table) result = df.write_deltalake(str(path), io_config=io_config) result = result.to_pydict() @@ -93,7 +93,7 @@ def test_deltalake_write_overwrite_basic(tmp_path): def test_deltalake_write_overwrite_cloud(cloud_paths): deltalake = pytest.importorskip("deltalake") - path, io_config, catalog_table = cloud_paths + path, io_config, _ = cloud_paths df1 = daft.from_pydict({"a": [1, 2]}) df1.write_deltalake(str(path), io_config=io_config) diff --git a/tests/io/iceberg/test_iceberg_writes.py b/tests/io/iceberg/test_iceberg_writes.py index c3956bd388..8282ed1626 100644 --- a/tests/io/iceberg/test_iceberg_writes.py +++ b/tests/io/iceberg/test_iceberg_writes.py @@ -161,7 +161,7 @@ def test_read_and_overwrite(simple_local_table): def test_missing_columns_write(simple_local_table): - table, num_partitions = simple_local_table + table, _ = simple_local_table df = daft.from_pydict({"x": [1, 2, 3, 4, 5]}) diff --git a/tests/io/test_merge_scan_tasks.py b/tests/io/test_merge_scan_tasks.py index 2f5c1ac144..dd7696d8c4 100644 --- a/tests/io/test_merge_scan_tasks.py +++ b/tests/io/test_merge_scan_tasks.py @@ -7,8 +7,7 @@ @pytest.fixture(scope="function") def csv_files(tmpdir): - """Writes 3 CSV files, each of 10 bytes in size, to tmpdir and yield tmpdir""" - + """Writes 3 CSV files, each of 10 bytes in size, to tmpdir and yield tmpdir.""" for i in range(3): path = tmpdir / f"file.{i}.csv" path.write_text("a,b,c\n1,2,", "utf8") # 10 bytes diff --git a/tests/io/test_parquet.py b/tests/io/test_parquet.py index be2be34fca..8249e291d1 100644 --- a/tests/io/test_parquet.py +++ b/tests/io/test_parquet.py @@ -30,7 +30,7 @@ @contextlib.contextmanager -def _parquet_write_helper(data: pa.Table, row_group_size: int = None, papq_write_table_kwargs: dict = {}): +def _parquet_write_helper(data: pa.Table, row_group_size: int | None = None, papq_write_table_kwargs: dict = {}): with tempfile.TemporaryDirectory() as directory_name: file = os.path.join(directory_name, "tempfile") papq.write_table(data, file, row_group_size=row_group_size, **papq_write_table_kwargs) @@ -260,7 +260,7 @@ def compare_before_and_after(before, after): def test_parquet_helper(data_and_type, use_daft_writer): data, data_type = data_and_type index_data = [x for x in range(0, len(data))] - file_path = f"{tmpdir}/{str(uuid.uuid4())}.parquet" + file_path = f"{tmpdir}/{uuid.uuid4()!s}.parquet" # Test Daft roundtrip. Daft does not support the dictionary logical type, hence we skip # writing with Daft for this type. @@ -281,7 +281,7 @@ def test_parquet_helper(data_and_type, use_daft_writer): compare_before_and_after(before, after) # Test Arrow write with Daft read. 
- file_path = f"{tmpdir}/{str(uuid.uuid4())}.parquet" + file_path = f"{tmpdir}/{uuid.uuid4()!s}.parquet" before = pa.Table.from_arrays( [pa.array(data, type=data_type), pa.array(index_data, type=pa.int64())], names=["nested_col", "_index"] ) @@ -354,7 +354,7 @@ def test_parquet_limits_across_row_groups(tmpdir, minio_io_config): default_row_group_size = daft_execution_config.parquet_target_row_group_size int_array = np.full(shape=4096, fill_value=3, dtype=np.int32) before = daft.from_pydict({"col": pa.array(int_array, type=pa.int32())}) - file_path = f"{tmpdir}/{str(uuid.uuid4())}.parquet" + file_path = f"{tmpdir}/{uuid.uuid4()!s}.parquet" # Decrease the target row group size before writing the parquet file. daft.set_execution_config(parquet_target_row_group_size=test_row_group_size) before.write_parquet(file_path) diff --git a/tests/io/test_split_scan_tasks.py b/tests/io/test_split_scan_tasks.py index 7b92655668..96ca4ba83f 100644 --- a/tests/io/test_split_scan_tasks.py +++ b/tests/io/test_split_scan_tasks.py @@ -9,7 +9,7 @@ @pytest.fixture(scope="function") def parquet_files(tmpdir): - """Writes 1 Parquet file with 10 rowgroups, each of 100 bytes in size""" + """Writes 1 Parquet file with 10 rowgroups, each of 100 bytes in size.""" tbl = pa.table({"data": ["aaa"] * 100}) path = tmpdir / "file.pq" papq.write_table(tbl, str(path), row_group_size=10, use_dictionary=False) diff --git a/tests/io/test_url_download_local.py b/tests/io/test_url_download_local.py index 4cce1c01ab..57dcbf6d9e 100644 --- a/tests/io/test_url_download_local.py +++ b/tests/io/test_url_download_local.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="function") def local_image_data_fixture(tmpdir, image_data) -> YieldFixture[list[str]]: - """Populates the local tmpdir with some fake data and returns filepaths""" + """Populates the local tmpdir with some fake data and returns filepaths.""" # Dump some images into the tmpdir tmpdir = pathlib.Path(tmpdir) urls = [] diff --git a/tests/io/test_write_modes.py b/tests/io/test_write_modes.py index 4d6ff0bbea..40a224f2fa 100644 --- a/tests/io/test_write_modes.py +++ b/tests/io/test_write_modes.py @@ -149,7 +149,7 @@ def test_write_modes_s3_minio( num_partitions, partition_cols, ): - path = f"s3://{bucket}/{str(uuid.uuid4())}" + path = f"s3://{bucket}/{uuid.uuid4()!s}" existing_data = {"a": ["a", "a", "b", "b"], "b": [1, 2, 3, 4]} new_data = { "a": ["a", "a", "b", "b"], @@ -186,7 +186,7 @@ def test_write_modes_s3_minio_empty_data( write_mode, format, ): - path = f"s3://{bucket}/{str(uuid.uuid4())}" + path = f"s3://{bucket}/{uuid.uuid4()!s}" existing_data = {"a": ["a", "a", "b", "b"], "b": ["c", "d", "e", "f"]} new_data = { "a": ["a", "a", "b", "b"], diff --git a/tests/microbenchmarks/test_df_arithmetic.py b/tests/microbenchmarks/test_df_arithmetic.py index 1a20c748c2..9b2a115a56 100644 --- a/tests/microbenchmarks/test_df_arithmetic.py +++ b/tests/microbenchmarks/test_df_arithmetic.py @@ -19,7 +19,7 @@ def gen_aranged_df(num_samples=1_000_000) -> DataFrame: @pytest.mark.benchmark(group="arithmetic") def test_integer_multiplications(gen_aranged_df, benchmark) -> None: - """Integer multiplications between 1_000_000 values + """Integer multiplications between 1_000_000 values. 
Adapted from: https://github.com/duckdb/duckdb/blob/master/benchmark/micro/arithmetic/multiplications.benchmark """ diff --git a/tests/microbenchmarks/test_file_read.py b/tests/microbenchmarks/test_file_read.py index 9744bedf73..078b182387 100644 --- a/tests/microbenchmarks/test_file_read.py +++ b/tests/microbenchmarks/test_file_read.py @@ -14,7 +14,6 @@ @pytest.fixture(scope="module", params=[(1, 64), (8, 8), (64, 1)], ids=["1x64mib", "8x8mib", "64x1mib"]) def gen_simple_csvs(request) -> str: """Creates some CSVs in a directory. Returns the name of the directory.""" - num_files, mibs_per_file = request.param _8bytes = b"aaa,bbb\n" diff --git a/tests/microbenchmarks/test_filter.py b/tests/microbenchmarks/test_filter.py index 57963d4fcd..6415331ccf 100644 --- a/tests/microbenchmarks/test_filter.py +++ b/tests/microbenchmarks/test_filter.py @@ -123,7 +123,7 @@ def generate_list_int64_keep_none() -> tuple[dict, daft.Expression, list]: ], ) def test_filter(test_data_generator, benchmark) -> None: - """If_else between NUM_ROWS values""" + """If_else between NUM_ROWS values.""" data, expected = test_data_generator() table = MicroPartition.from_pydict(data) diff --git a/tests/microbenchmarks/test_groups_and_aggs.py b/tests/microbenchmarks/test_groups_and_aggs.py index b8fdfc8785..7965868392 100644 --- a/tests/microbenchmarks/test_groups_and_aggs.py +++ b/tests/microbenchmarks/test_groups_and_aggs.py @@ -52,7 +52,6 @@ def bench() -> DataFrame: @pytest.mark.parametrize("num_samples", NUM_SAMPLES) def test_comparable_agg(benchmark, num_samples) -> None: """Test aggregation performance for comparisons against string types.""" - data = [str(uuid4()) for _ in range(num_samples)] + ["ffffffff-ffff-ffff-ffff-ffffffffffff"] random.shuffle(data) @@ -72,7 +71,6 @@ def bench() -> DataFrame: @pytest.mark.parametrize("num_samples", NUM_SAMPLES) def test_numeric_agg(benchmark, num_samples) -> None: """Test aggregation performance for numeric aggregation ops.""" - df = daft.from_pydict({"mycol": np.arange(num_samples)}).collect() # Run the benchmark. @@ -90,7 +88,6 @@ def bench() -> DataFrame: @pytest.mark.parametrize("num_samples", NUM_SAMPLES) def test_groupby(benchmark, num_samples, num_groups) -> None: """Test performance of grouping to one group vs many.""" - keys = np.arange(num_samples) if num_groups is not None: keys = keys % num_groups @@ -124,7 +121,6 @@ def bench() -> DataFrame: @pytest.mark.parametrize("num_samples", NUM_SAMPLES) def test_groupby_string(benchmark, num_samples, num_groups) -> None: """Test performance of grouping to one group vs many.""" - keys = np.arange(num_samples) if num_groups is not None: keys = keys % num_groups @@ -161,7 +157,6 @@ def test_multicolumn_groupby(benchmark, num_columns, num_samples) -> None: The group cardinality is the same in both cases; a redundant column is used for the multicolumn group. 
""" - num_groups = 100 keys = np.arange(num_samples) % num_groups diff --git a/tests/microbenchmarks/test_if_else.py b/tests/microbenchmarks/test_if_else.py index 0de1a7d75b..f1e2c79151 100644 --- a/tests/microbenchmarks/test_if_else.py +++ b/tests/microbenchmarks/test_if_else.py @@ -110,7 +110,7 @@ def generate_list_params() -> tuple[dict, daft.Expression, list]: ], ) def test_if_else(test_data_generator, benchmark) -> None: - """If_else between NUM_ROWS values""" + """If_else between NUM_ROWS values.""" data, expr, expected = test_data_generator() table = MicroPartition.from_pydict(data) diff --git a/tests/microbenchmarks/test_join.py b/tests/microbenchmarks/test_join.py index 1ace4ad14d..0995bab0ce 100644 --- a/tests/microbenchmarks/test_join.py +++ b/tests/microbenchmarks/test_join.py @@ -26,7 +26,6 @@ def test_join_simple(benchmark, num_samples, num_partitions, join_type) -> None: Keys are consecutive integers; no data payload; one-to-one matches. """ - left_arr = np.arange(num_samples) np.random.shuffle(left_arr) right_arr = np.arange(num_samples) @@ -71,7 +70,6 @@ def bench_join() -> DataFrame: @pytest.mark.parametrize("join_type", JOIN_TYPES) def test_join_largekey(benchmark, num_samples, num_partitions, join_type) -> None: """Test the impact of string keys vs integer keys.""" - keys = [str(uuid4()) for _ in range(num_samples)] left_keys = keys.copy() @@ -120,7 +118,6 @@ def bench_join() -> DataFrame: @pytest.mark.parametrize("join_type", JOIN_TYPES) def test_join_withdata(benchmark, num_samples, num_partitions, join_type) -> None: """Test the impact of data payloads.""" - left_arr = np.arange(num_samples) np.random.shuffle(left_arr) right_arr = np.arange(num_samples) @@ -179,7 +176,6 @@ def test_broadcast_join(benchmark, left_bigger, num_partitions, join_type) -> No The cardinality is one-to-many. """ - small_length = 1_000 big_factor = 10 @@ -234,7 +230,6 @@ def test_multicolumn_joins(benchmark, num_columns, num_samples, num_partitions, The join cardinality is the same for all cases; redundant columns are used for the multicolumn joins. """ - left_arr = np.arange(num_samples) np.random.shuffle(left_arr) right_arr = np.arange(num_samples) diff --git a/tests/microbenchmarks/test_sort.py b/tests/microbenchmarks/test_sort.py index 6dd4bac6d3..b6dad3f44a 100644 --- a/tests/microbenchmarks/test_sort.py +++ b/tests/microbenchmarks/test_sort.py @@ -21,7 +21,6 @@ def test_sort_simple(benchmark, num_samples, num_partitions) -> None: Keys are consecutive integers; no data payload. """ - arr = np.arange(num_samples) np.random.shuffle(arr) @@ -53,7 +52,6 @@ def bench() -> DataFrame: ) def test_sort_strings(benchmark, num_samples, num_partitions) -> None: """Test the impact of string keys vs integer keys.""" - keys = [str(uuid4()) for _ in range(num_samples)] random.shuffle(keys) @@ -86,7 +84,6 @@ def bench() -> DataFrame: ) def test_sort_withdata(benchmark, num_samples, num_partitions) -> None: """Test the impact of data payloads.""" - arr = np.arange(num_samples) np.random.shuffle(arr) @@ -132,7 +129,6 @@ def test_multicolumn_sort(benchmark, num_columns, num_samples, num_partitions) - Using all columns produces a unique sort key. 
""" - arr = np.arange(num_samples) np.random.shuffle(arr) diff --git a/tests/microbenchmarks/test_take.py b/tests/microbenchmarks/test_take.py index 0a31520865..bb7b5f46dc 100644 --- a/tests/microbenchmarks/test_take.py +++ b/tests/microbenchmarks/test_take.py @@ -97,7 +97,7 @@ def generate_list_int64_take_reversed() -> tuple[dict, daft.Expression, list]: ], ) def test_take(test_data_generator, benchmark) -> None: - """If_else between NUM_ROWS values""" + """If_else between NUM_ROWS values.""" data, idx, expected = test_data_generator() table = MicroPartition.from_pydict(data) diff --git a/tests/property_based_testing/strategies.py b/tests/property_based_testing/strategies.py index 58da7634e3..5a680ef28b 100644 --- a/tests/property_based_testing/strategies.py +++ b/tests/property_based_testing/strategies.py @@ -46,7 +46,7 @@ @composite def generate_data(draw, daft_type: DataType, strategies: dict[DataType, SearchStrategy] = _default_strategies): - """Helper to generate data when given a daft_type""" + """Helper to generate data when given a daft_type.""" if daft_type not in strategies: raise NotImplementedError(f"Strategy for type {daft_type} not implemented") return draw(strategies[daft_type], label=f"Generated data for type {daft_type}") @@ -90,7 +90,7 @@ def series( dtypes: SearchStrategy[DataType] = all_dtypes, strategies: dict[DataType, SearchStrategy] = _default_strategies, ) -> Series: - """Generate a column of data + """Generate a column of data. Args: draw: Hypothesis draw function diff --git a/tests/property_based_testing/test_sort.py b/tests/property_based_testing/test_sort.py index d6cc75b043..dd22f50ed2 100644 --- a/tests/property_based_testing/test_sort.py +++ b/tests/property_based_testing/test_sort.py @@ -30,13 +30,13 @@ def _is_nan(obj: Any) -> bool: - """Checks if an object is a float NaN""" + """Checks if an object is a float NaN.""" return isinstance(obj, float) and math.isnan(obj) @settings(max_examples=int(os.getenv("HYPOTHESIS_MAX_EXAMPLES", 100)), stateful_step_count=8, deadline=None) class DataframeSortStateMachine(RuleBasedStateMachine): - """Tests sorts in the face of various other operations such as filters, projections etc + """Tests sorts in the face of various other operations such as filters, projections etc. Creates N number of sort key columns named "sort_key_{i}", and one "row_num" column which enumerates the original row number. 
@@ -58,7 +58,7 @@ def __init__(self): @rule(data=data(), num_sort_cols=integers(min_value=1, max_value=MAX_NUM_SORT_COLS)) @precondition(lambda self: self.df is None) def newdataframe(self, data, num_sort_cols): - """Start node of the state machine, creates an initial dataframe""" + """Start node of the state machine, creates an initial dataframe.""" self.sort_keys = [f"sort_key_{i}" for i in range(num_sort_cols)] columns_dict_data = data.draw( @@ -75,7 +75,7 @@ def newdataframe(self, data, num_sort_cols): @rule(data=data()) @precondition(lambda self: self.df is not None) def run_sort(self, data): - """Run a sort on the accumulated dataframe plan""" + """Run a sort on the accumulated dataframe plan.""" sort_on = data.draw(permutations(self.sort_keys)) descending = data.draw(lists(min_size=len(self.sort_keys), max_size=len(self.sort_keys), elements=booleans())) self.df = self.df.sort(sort_on, desc=descending) @@ -84,7 +84,7 @@ def run_sort(self, data): @rule() @precondition(lambda self: self.sorted_on is not None) def collect_after_sort(self): - """Optionally runs after any sort step to check that sort is maintained""" + """Optionally runs after any sort step to check that sort is maintained.""" sorted_data = self.df.to_pydict() sorted_on_cols = [c for c, _ in self.sorted_on] sorted_on_desc = [d for _, d in self.sorted_on] @@ -152,7 +152,7 @@ def collect_after_sort(self): @rule(data=data()) @precondition(lambda self: self.df is not None) def repartition_df(self, data): - """Runs a repartitioning step""" + """Runs a repartitioning step.""" num_partitions = data.draw( self.repartition_num_partitions_strategy, label="Number of partitions for repartitioning" ) @@ -164,7 +164,7 @@ def repartition_df(self, data): @rule(data=data()) @precondition(lambda self: self.df is not None) def filter_df(self, data): - """Runs a filter on a simple equality predicate on a random column""" + """Runs a filter on a simple equality predicate on a random column.""" assert self.df is not None col_name_to_filter = data.draw(sampled_from(self.df.schema().column_names()), label="Column to filter on") col_daft_type = self.df.schema()[col_name_to_filter].dtype @@ -185,7 +185,7 @@ def filter_df(self, data): @rule(data=data()) @precondition(lambda self: self.df is not None) def project_df(self, data): - """Runs a projection on a random column, replacing it""" + """Runs a projection on a random column, replacing it.""" assert self.df is not None column_name = data.draw(sampled_from(self.df.schema().column_names()), label="Column to filter on") column_daft_type = self.df.schema()[column_name].dtype diff --git a/tests/series/test_size_bytes.py b/tests/series/test_size_bytes.py index d4a7b6cb2e..9c30c58fdf 100644 --- a/tests/series/test_size_bytes.py +++ b/tests/series/test_size_bytes.py @@ -18,7 +18,7 @@ def get_total_buffer_size(arr: pa.Array) -> int: - """Helper to get total buffer size because older versions of Arrow don't have the Array.get_total_buffer_size() method""" + """Helper to get total buffer size because older versions of Arrow don't have this method.""" return sum([buf.size if buf is not None else 0 for buf in arr.buffers()]) diff --git a/tests/sql/test_sql.py b/tests/sql/test_sql.py index f80aec4de7..f06c4f64f7 100644 --- a/tests/sql/test_sql.py +++ b/tests/sql/test_sql.py @@ -11,7 +11,7 @@ def load_tpch_queries(): - """Load all TPCH queries into a list of (name,sql) tuples""" + """Load all TPCH queries into a list of (name,sql) tuples.""" queries = [] for filename in os.listdir(TPCH_QUERIES): filepath = 
os.path.join(TPCH_QUERIES, filename) @@ -24,7 +24,7 @@ def load_tpch_queries(): def load_tpch_query(filename): - """Load a single TPCH query from a file""" + """Load a single TPCH query from a file.""" filepath = os.path.join(TPCH_QUERIES, filename) if os.path.isfile(filepath) and filepath.endswith(".sql"): with open(filepath) as f: diff --git a/tests/table/table_io/test_json.py b/tests/table/table_io/test_json.py index b6bdf88608..202f17ba29 100644 --- a/tests/table/table_io/test_json.py +++ b/tests/table/table_io/test_json.py @@ -51,7 +51,7 @@ def test_read_input(tmpdir, use_native_downloader): @contextlib.contextmanager def _json_write_helper(data: dict[str, list[Any]]): with tempfile.TemporaryDirectory() as directory_name: - first_key = list(data.keys())[0] + first_key = next(iter(data.keys())) data_len = len(data[first_key]) for k in data: assert len(data[k]) == data_len diff --git a/tests/table/table_io/test_parquet.py b/tests/table/table_io/test_parquet.py index 3a17edd4ec..d350d016ca 100644 --- a/tests/table/table_io/test_parquet.py +++ b/tests/table/table_io/test_parquet.py @@ -53,7 +53,7 @@ def test_read_input(tmpdir): @contextlib.contextmanager -def _parquet_write_helper(data: pa.Table, row_group_size: int = None, papq_write_table_kwargs: dict = {}): +def _parquet_write_helper(data: pa.Table, row_group_size: int | None = None, papq_write_table_kwargs: dict = {}): with tempfile.TemporaryDirectory() as directory_name: file = os.path.join(directory_name, "tempfile") papq.write_table(data, file, row_group_size=row_group_size, **papq_write_table_kwargs) diff --git a/tests/table/test_joins.py b/tests/table/test_joins.py index 6c58bbce6e..81c305026a 100644 --- a/tests/table/test_joins.py +++ b/tests/table/test_joins.py @@ -220,7 +220,6 @@ def test_table_join_multicolumn_nocross(join_impl, null_safe_equal) -> None: ) def test_table_join_multicolumn_cross(join_impl, null_safe_equal) -> None: """A multicol join that should produce a cross product and a non-cross product.""" - left_table = MicroPartition.from_pydict( { "a": ["apple", "apple", "banana", "banana", "banana", None], diff --git a/tests/test_context.py b/tests/test_context.py index ba7d7dadf3..97b373f0c8 100644 --- a/tests/test_context.py +++ b/tests/test_context.py @@ -19,8 +19,7 @@ def with_null_env(): def test_explicit_set_runner_py(): - """Test that a freshly imported context doesn't have a runner config set and can be set explicitly to Python""" - + """Test that a freshly imported context doesn't have a runner config set and can be set explicitly to Python.""" explicit_set_runner_script = """ import daft print(daft.context.get_context()._runner) @@ -34,8 +33,7 @@ def test_explicit_set_runner_py(): def test_implicit_set_runner_py(): - """Test that a freshly imported context doesn't have a runner config set and can be set implicitly to Python""" - + """Test that a freshly imported context doesn't have a runner config set and can be set implicitly to Python.""" implicit_set_runner_script = """ import daft print(daft.context.get_context()._runner) @@ -49,8 +47,7 @@ def test_implicit_set_runner_py(): def test_explicit_set_runner_ray(): - """Test that a freshly imported context doesn't have a runner config set and can be set explicitly to Ray""" - + """Test that a freshly imported context doesn't have a runner config set and can be set explicitly to Ray.""" explicit_set_runner_script_ray = """ import daft print(daft.context.get_context()._runner) @@ -64,8 +61,7 @@ def test_explicit_set_runner_ray(): def 
test_implicit_set_runner_ray(): - """Test that a freshly imported context doesn't have a runner config set and can be set implicitly to Ray""" - + """Test that a freshly imported context doesn't have a runner config set and can be set implicitly to Ray.""" implicit_set_runner_script_ray = """ import daft import ray @@ -81,8 +77,7 @@ def test_implicit_set_runner_ray(): def test_switch_local_runners(): - """Test that a runner can be switched from Python to Native""" - + """Test that a runner can be switched from Python to Native.""" switch_local_runners_script = """ import daft print(daft.context.get_context()._runner) @@ -107,8 +102,7 @@ def test_switch_local_runners(): ], ) def test_cannot_switch_local_to_ray(set_local_command): - """Test that a runner cannot be switched from local to Ray""" - + """Test that a runner cannot be switched from local to Ray.""" script = f""" import daft {set_local_command} @@ -128,8 +122,7 @@ def test_cannot_switch_local_to_ray(set_local_command): ], ) def test_cannot_switch_from_ray(set_new_runner_command): - """Test that a runner cannot be switched from Ray""" - + """Test that a runner cannot be switched from Ray.""" script = f""" import daft daft.context.set_runner_ray() @@ -142,8 +135,7 @@ def test_cannot_switch_from_ray(set_new_runner_command): @pytest.mark.parametrize("daft_runner_envvar", ["py", "ray", "native"]) def test_env_var(daft_runner_envvar): - """Test that environment variables are correctly picked up""" - + """Test that environment variables are correctly picked up.""" autodetect_script = """ import daft df = daft.from_pydict({"foo": [1, 2, 3]}) @@ -158,8 +150,7 @@ def test_env_var(daft_runner_envvar): def test_in_ray_job(): - """Test that Ray job ID environment variable is being picked up""" - + """Test that Ray job ID environment variable is being picked up.""" autodetect_script = """ import daft df = daft.from_pydict({"foo": [1, 2, 3]}) diff --git a/tests/test_size_estimations.py b/tests/test_size_estimations.py index 343b8377ed..09833a4454 100644 --- a/tests/test_size_estimations.py +++ b/tests/test_size_estimations.py @@ -9,7 +9,7 @@ def get_scantask_estimated_size(pq_path: str, size_on_disk: int, columns: list[str] | None = None) -> int: - """Retrieve the estimated size for reading a given Parquet file""" + """Retrieve the estimated size for reading a given Parquet file.""" return native_testing_utils.estimate_in_memory_size_bytes(str(pq_path), size_on_disk, columns=columns) diff --git a/tests/utils.py b/tests/utils.py index 11863e2567..0d8ffebad0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -14,7 +14,7 @@ def sort_arrow_table(tbl: pa.Table, sort_by: str): - """In arrow versions < 7, pa.Table does not support sorting yet so we add a helper method here""" + """In arrow versions < 7, pa.Table does not support sorting yet so we add a helper method here.""" sort_indices = pac.sort_indices(tbl.column(sort_by)) return pac.take(tbl, sort_indices) diff --git a/tools/attach_debugger.py b/tools/attach_debugger.py index 1023ad9119..f803c25edb 100644 --- a/tools/attach_debugger.py +++ b/tools/attach_debugger.py @@ -1,12 +1,9 @@ -""" -This file was copied from the Polars project (https://github.com/pola-rs/polars/blob/main/py-polars/debug/launch.py) -under the license provided by Ritchie Vink and NVIDIA Corporation & Affiliates. +# This file was copied from the Polars project (https://github.com/pola-rs/polars/blob/main/py-polars/debug/launch.py) +# under the license provided by Ritchie Vink and NVIDIA Corporation & Affiliates. 
The following parameter determines the sleep time of the Python process after a signal +# is sent that attaches the Rust LLDB debugger. If the Rust LLDB debugger attaches to the +# current session too late, it might miss any set breakpoints. If this happens +# consistently, it is recommended to increase this value. -The following parameter determines the sleep time of the Python process after a signal -is sent that attaches the Rust LLDB debugger. If the Rust LLDB debugger attaches to the -current session too late, it might miss any set breakpoints. If this happens -consistently, it is recommended to increase this value. -""" import os import re @@ -18,8 +15,7 @@ def launch_debugging() -> None: - """ - Debug Rust files via Python. + """Debug Rust files via Python. Determine the pID for the current debugging session, attach the Rust LLDB launcher, and execute the originally-requested script. diff --git a/tutorials/image_querying/top_n_red_color.ipynb b/tutorials/image_querying/top_n_red_color.ipynb index a2fe0bab28..036657a060 100644 --- a/tutorials/image_querying/top_n_red_color.ipynb +++ b/tutorials/image_querying/top_n_red_color.ipynb @@ -400,7 +400,7 @@ "\n", "\n", "def magic_red_detector(img: np.ndarray) -> PIL.Image.Image:\n", - " \"\"\"Gets a new image which is a mask covering all 'red' areas in the image\"\"\"\n", + " \"\"\"Gets a new image which is a mask covering all 'red' areas in the image.\"\"\"\n", " img = PIL.Image.fromarray(img)\n", " lower = np.array([245, 100, 100])\n", " upper = np.array([10, 255, 255])\n", diff --git a/tutorials/talks_and_demos/data-ai-summit-2024.ipynb b/tutorials/talks_and_demos/data-ai-summit-2024.ipynb index 4ba9d9555c..3a7f59820a 100644 --- a/tutorials/talks_and_demos/data-ai-summit-2024.ipynb +++ b/tutorials/talks_and_demos/data-ai-summit-2024.ipynb @@ -405,9 +405,7 @@ "\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "def generate_presigned_url(s3_urls, expires_in=3600):\n", - " \"\"\"\n", - " Generate a presigned Amazon S3 URLs\n", - " \"\"\"\n", + " \"\"\"Generate a presigned Amazon S3 URLs.\"\"\"\n", " s3_client = boto3.client(\"s3\")\n", " presigned_urls = []\n", " for s3_url in s3_urls.to_pylist():\n", @@ -421,7 +419,7 @@ "\n", "@daft.udf(return_dtype=daft.DataType.string())\n", "def run_gpt4o_on_urls(images_urls, prompt=DEFAULT_PROMPT):\n", - " \"\"\"Run the gpt-4o LLM by making an API call to OpenAI\"\"\"\n", + " \"\"\"Run the gpt-4o LLM by making an API call to OpenAI.\"\"\"\n", " results = []\n", " for url in images_urls.to_pylist():\n", " payload = {\n",
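The hunks above apply the same handful of lint patterns over and over: docstrings become capitalized one-line summaries ending in a period, f-strings switch to the explicit `!s` conversion flag (e.g. `{uuid.uuid4()!s}` instead of `{str(uuid.uuid4())}`), parameters defaulting to `None` gain an explicit optional annotation (`row_group_size: int | None = None`), and single-key lookups use `next(iter(...))` rather than indexing `list(...)[0]`. The short sketch below pulls those patterns together in one place as a minimal illustration; the function names and the `example.txt` path are invented for the example and do not come from the diff.

from __future__ import annotations


def first_column_length(data: dict[str, list[int]]) -> int:
    """Return the length of the first column in a dict of lists."""
    # Take the first key lazily instead of materializing list(data.keys())[0].
    return len(data[next(iter(data.keys()))])


def read_text(path: str | None = None) -> str:
    """Read a text file, falling back to a placeholder path."""
    # A default of None means the annotation must be explicitly optional.
    try:
        with open(path or "example.txt") as f:
            return f.read()
    except OSError as e:
        # Use the explicit conversion flag rather than wrapping the exception in str(e).
        raise RuntimeError(f"Error reading file: {e!s}") from e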