Eventual-Inc · samster25 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/.github/ci-scripts/format_env_vars.py b/.github/ci-scripts/format_env_vars.py
@@ -1,5 +1,4 @@
-"""
-Given a comma-separated string of environment variables, parse them into a dictionary.
+"""Given a comma-separated string of environment variables, parse them into a dictionary.
 
 Example:
     env_str = "a=1,b=2"

diff --git a/.github/ci-scripts/get_wheel_name_from_s3.py b/.github/ci-scripts/get_wheel_name_from_s3.py
@@ -1,5 +1,4 @@
-"""
-Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout.
+"""Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout.
 
 # Example
 

diff --git a/.github/ci-scripts/read_inline_metadata.py b/.github/ci-scripts/read_inline_metadata.py
@@ -3,10 +3,7 @@
 # dependencies = []
 # ///
 
-"""
-The `read` function below is sourced from:
-https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata
-"""
+"""The `read` function below is sourced from: https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata."""
 
 import re
 

diff --git a/.github/ci-scripts/wheellib.py b/.github/ci-scripts/wheellib.py
@@ -2,6 +2,6 @@
 
 
 def get_platform_tag(wheelname: str) -> str:
-    distribution, version, build_tag, tags = parse_wheel_filename(wheelname)
+    _, _, _, tags = parse_wheel_filename(wheelname)
     assert len(tags) == 1, "Multiple tags found"
     return next(iter(tags)).platform
diff --git a/.github/working-dir/shuffle_testing.py b/.github/working-dir/shuffle_testing.py
@@ -6,7 +6,7 @@
 import random
 import time
 from functools import partial
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import numpy as np
 import pyarrow as pa
@@ -32,8 +32,8 @@ def parse_size(size_str: str) -> int:
 
 
 def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarray:
-    """
-    Generate a skewed distribution using a power law.
+    """Generate a skewed distribution using a power law.
+
     Higher skew_factor means more skewed distribution.
     """
     if skew_factor <= 0:
@@ -46,8 +46,7 @@ def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarr
 
 
 def get_partition_size(base_size: int, size_variation: float, partition_idx: int) -> int:
-    """
-    Calculate size for a specific partition with variation.
+    """Calculate size for a specific partition with variation.
 
     Args:
         base_size: The base partition size in bytes
@@ -80,7 +79,6 @@ def generate(
     partition_idx: int,
 ):
     """Generate data for a single partition with optional skew, timing and size variations."""
-
     # Calculate actual partition size with variation
     actual_partition_size = get_partition_size(base_partition_size, size_variation, partition_idx)
     num_rows = actual_partition_size // ROW_SIZE
@@ -135,7 +133,7 @@ def generator(
         )
 
 
-def setup_daft(shuffle_algorithm: str = None):
+def setup_daft(shuffle_algorithm: Optional[str] = None):
     """Configure Daft execution settings."""
     daft.context.set_runner_ray()
     daft.context.set_execution_config(shuffle_algorithm=shuffle_algorithm, pre_shuffle_merge_threshold=8 * GB)
@@ -152,7 +150,7 @@ def run_benchmark(
     skew_factor: float,
     timing_variation: float,
     size_variation: float,
-    shuffle_algorithm: str = None,
+    shuffle_algorithm: Optional[str] = None,
 ) -> Dict[str, Any]:
     """Run the memory benchmark and return statistics."""
     setup_daft(shuffle_algorithm)
@@ -249,7 +247,7 @@ def main():
         print(f"Total time: {timing:.2f}s")
 
     except Exception as e:
-        print(f"Error running benchmark: {str(e)}")
+        print(f"Error running benchmark: {e!s}")
         raise
 
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -57,7 +57,7 @@ repos:
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
       # Ruff version.
-  rev: v0.6.2
+  rev: v0.8.2
   hooks:
       # Run the linter.
   - id: ruff

diff --git a/.ruff.toml b/.ruff.toml
@@ -4,6 +4,7 @@ line-length = 120
 target-version = "py38"
 
 [format]
+docstring-code-format = true
 # Like Black, indent with spaces, rather than tabs.
 indent-style = "space"
 # Like Black, automatically detect the appropriate line ending.
@@ -20,15 +21,37 @@ extend-select = [
   "LOG",  # flake8-logging
   "G",  # flake8-logging-format
   "I",  # isort
+  "RUF010",  # Use explicit conversion flag
+  "RUF013",  # PEP 484 prohibits implicit Optional
+  "RUF015",  # Prefer next({iterable}) over single element slice
+  "RUF017",  # Avoid quadratic list summation
+  "RUF022",  # __all__ is not sorted
+  "RUF032",  # Decimal() called with float literal argument
+  "RUF034",  # Useless if-else condition
+  "RUF041",  # Unnecessary nested Literal
   "RUF100",  # unused-noqa"
-  "T10"  # flake8-debugger
+  "T10",  # flake8-debugger
+  "D"  # pydocstyle rules
 ]
 ignore = [
-  "E402"  # Module level import not at top of file [TODO(sammy): We want to fix this]
+  "E402",  # Module level import not at top of file [TODO(sammy): We want to fix this]
+  "D417",  # requires documentation for every function parameter.
+  "D100",  # Missing docstring in public module
+  "D101",  # Missing docstring in public class
+  "D102",  # Missing docstring in public method
+  "D103",  # Missing docstring in public function
+  "D104",  # Missing docstring in public package
+  "D105",  # Missing docstring in magic method
+  "D106",  # Missing docstring in public nested class
+  "D107"  # Missing docstring in __init__
 ]
+preview = true
 
 [lint.per-file-ignores]
 # Do not enforce usage and import order rules in init files
 "__init__.py" = ["E402", "F401", "I"]
 # Allow wild imports in conftest
 "tests/conftest.py" = ["F405", "E402", "F403"]
+
+[lint.pydocstyle]
+convention = "google"
diff --git a/benchmarking/parquet/conftest.py b/benchmarking/parquet/conftest.py
@@ -73,7 +73,7 @@ def daft_dataframe_read(path: str, columns: list[str] | None = None) -> pa.Table
     ],
 )
 def read_fn(request):
-    """Fixture which returns the function to read a PyArrow table from a path"""
+    """Fixture which returns the function to read a PyArrow table from a path."""
     return request.param
 
 
@@ -116,5 +116,5 @@ def boto_bulk_read(paths: list[str], columns: list[str] | None = None) -> list[p
     ],
 )
 def bulk_read_fn(request):
-    """Fixture which returns the function to read a PyArrow table from a path"""
+    """Fixture which returns the function to read a PyArrow table from a path."""
     return request.param
diff --git a/benchmarking/tpch/__main__.py b/benchmarking/tpch/__main__.py
@@ -175,7 +175,7 @@ def run_all_benchmarks(
 
 
 def generate_parquet_data(tpch_gen_folder: str, scale_factor: float, num_parts: int) -> str:
-    """Generates Parquet data and returns the path to the folder
+    """Generates Parquet data and returns the path to the folder.
 
     Args:
         tpch_gen_folder (str): Path to the folder containing the TPCH dbgen tool and generated data
@@ -193,7 +193,7 @@ def get_daft_version() -> str:
 
 
 def get_daft_benchmark_runner_name() -> Literal["ray"] | Literal["py"] | Literal["native"]:
-    """Test utility that checks the environment variable for the runner that is being used for the benchmarking"""
+    """Test utility that checks the environment variable for the runner that is being used for the benchmarking."""
     name = os.getenv("DAFT_RUNNER")
     assert name is not None, "Tests must be run with $DAFT_RUNNER env var"
     name = name.lower()
@@ -217,7 +217,7 @@ def get_ray_runtime_env(requirements: str | None) -> dict:
 
 
 def warmup_environment(requirements: str | None, parquet_folder: str):
-    """Performs necessary setup of Daft on the current benchmarking environment"""
+    """Performs necessary setup of Daft on the current benchmarking environment."""
     if get_daft_benchmark_runner_name() == "ray":
         runtime_env = get_ray_runtime_env(requirements)
 

diff --git a/benchmarking/tpch/data_generation.py b/benchmarking/tpch/data_generation.py
@@ -202,7 +202,7 @@
 
 
 def gen_sqlite_db(csv_filepath: str, num_parts: int) -> str:
-    """Generates a SQLite DB from a folder filled with generated CSVs
+    """Generates a SQLite DB from a folder filled with generated CSVs.
 
     Args:
         csv_filepath (str): path to folder with generated CSVs
@@ -243,7 +243,7 @@ def import_table(table, table_path):
 
 
 def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
-    """Generates CSV files
+    """Generates CSV files.
 
     Args:
         basedir (str): path to generate files into
@@ -302,7 +302,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
 
 
 def gen_parquet(csv_files_location: str) -> str:
-    """Generates Parquet from generated CSV files
+    """Generates Parquet from generated CSV files.
 
     Args:
         csv_files_location (str): path to folder with generated CSV files

diff --git a/benchmarking/tpch/pipelined_data_generation.py b/benchmarking/tpch/pipelined_data_generation.py
@@ -1,4 +1,4 @@
-"""This script provides a pipelined data generation implementation of data_generation.py
+"""This script provides a pipelined data generation implementation of data_generation.py.
 
 Note that after running this script, data will no longer be coherent/exist locally. This is used for generating large amounts
 of benchmarking data that lands directly in AWS S3, but for local benchmarking/testing use-cases use data_generation.py instead.
@@ -52,7 +52,7 @@ def pipelined_data_generation(
 
     if not cachedir.exists():
         logger.info("Cloning tpch dbgen repo")
-        subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {str(cachedir)}"))
+        subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {cachedir!s}"))
         subprocess.check_output("make", cwd=str(cachedir))
 
     for i, part_indices in enumerate(batch(range(1, num_parts + 1), n=parallelism)):

diff --git a/benchmarking/tpch/ray_job_runner.py b/benchmarking/tpch/ray_job_runner.py
@@ -23,8 +23,7 @@ async def wait_on_job(logs, timeout_s):
 
 
 def run_on_ray(ray_address: str, job_params: dict, timeout_s: int = 1500):
-    """Submits a job to run in the Ray cluster"""
-
+    """Submits a job to run in the Ray cluster."""
     print("Submitting benchmarking job to Ray cluster...")
     print("Parameters:")
     print(job_params)
@@ -57,7 +56,7 @@ def ray_job_params(
 ) -> dict:
     return dict(
         submission_id=f"tpch-q{tpch_qnum}-{str(uuid.uuid4())[:4]}",
-        entrypoint=f"python3 {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
+        entrypoint=f"python3 {entrypoint.relative_to(working_dir)!s} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
         runtime_env={
             "working_dir": str(working_dir),
             **runtime_env,

diff --git a/benchmarking/tpch/subprefix_s3_files.py b/benchmarking/tpch/subprefix_s3_files.py
@@ -1,7 +1,7 @@
-"""
-Introduces more prefixes into TPCH data files hosted on S3.
+"""Introduces more prefixes into TPCH data files hosted on S3.
+
 This improves S3 read performance. For more details, see:
-https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html
+https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html.
 
 Does this by copying existing files into subfolders, based on file prefix.
 e.g. copies

diff --git a/daft/__init__.py b/daft/__init__.py
@@ -37,7 +37,7 @@ def get_build_type() -> str:
 
 
 def refresh_logger() -> None:
-    """Refreshes Daft's internal rust logging to the current python log level"""
+    """Refreshes Daft's internal rust logging to the current python log level."""
     _refresh_logger()
 
 
@@ -94,46 +94,46 @@ def refresh_logger() -> None:
 to_struct = Expression.to_struct
 
 __all__ = [
-    "from_pylist",
-    "from_pydict",
-    "from_arrow",
-    "from_pandas",
-    "from_ray_dataset",
-    "from_dask_dataframe",
-    "from_glob_path",
-    "read_csv",
-    "read_json",
-    "read_parquet",
-    "read_hudi",
-    "read_iceberg",
-    "read_deltalake",
-    "read_sql",
-    "read_lance",
-    "DataCatalogType",
     "DataCatalogTable",
+    "DataCatalogType",
     "DataFrame",
-    "Expression",
-    "col",
-    "interval",
     "DataType",
-    "ImageMode",
+    "Expression",
     "ImageFormat",
-    "lit",
-    "Series",
-    "TimeUnit",
-    "register_viz_hook",
-    "refresh_logger",
-    "udf",
+    "ImageMode",
     "ResourceRequest",
     "Schema",
-    "set_planning_config",
-    "set_execution_config",
-    "planning_config_ctx",
+    "Series",
+    "TimeUnit",
+    "coalesce",
+    "col",
     "execution_config_ctx",
+    "from_arrow",
+    "from_dask_dataframe",
+    "from_glob_path",
+    "from_pandas",
+    "from_pydict",
+    "from_pylist",
+    "from_ray_dataset",
+    "interval",
+    "lit",
+    "planning_config_ctx",
+    "read_csv",
+    "read_deltalake",
+    "read_hudi",
+    "read_iceberg",
+    "read_json",
+    "read_lance",
+    "read_parquet",
+    "read_sql",
     "read_table",
+    "refresh_logger",
     "register_table",
+    "register_viz_hook",
+    "set_execution_config",
+    "set_planning_config",
     "sql",
     "sql_expr",
     "to_struct",
-    "coalesce",
+    "udf",
 ]