Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: Enable Linting of docstrings #3506

Merged
merged 10 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/ci-scripts/format_env_vars.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""
Given a comma-separated string of environment variables, parse them into a dictionary.
"""Given a comma-separated string of environment variables, parse them into a dictionary.

Example:
env_str = "a=1,b=2"
Expand Down
3 changes: 1 addition & 2 deletions .github/ci-scripts/get_wheel_name_from_s3.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""
Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout.
"""Given a commit hash and a "platform substring", prints the wheelname of the wheel (if one exists) to stdout.

# Example

Expand Down
5 changes: 1 addition & 4 deletions .github/ci-scripts/read_inline_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@
# dependencies = []
# ///

"""
The `read` function below is sourced from:
https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata
"""
"""The `read` function below is sourced from: https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata."""

import re

Expand Down
2 changes: 1 addition & 1 deletion .github/ci-scripts/wheellib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@


def get_platform_tag(wheelname: str) -> str:
distribution, version, build_tag, tags = parse_wheel_filename(wheelname)
_, _, _, tags = parse_wheel_filename(wheelname)
assert len(tags) == 1, "Multiple tags found"
return next(iter(tags)).platform
16 changes: 7 additions & 9 deletions .github/working-dir/shuffle_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import random
import time
from functools import partial
from typing import Any, Dict
from typing import Any, Dict, Optional

import numpy as np
import pyarrow as pa
Expand All @@ -32,8 +32,8 @@ def parse_size(size_str: str) -> int:


def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarray:
"""
Generate a skewed distribution using a power law.
"""Generate a skewed distribution using a power law.

Higher skew_factor means more skewed distribution.
"""
if skew_factor <= 0:
Expand All @@ -46,8 +46,7 @@ def get_skewed_distribution(num_partitions: int, skew_factor: float) -> np.ndarr


def get_partition_size(base_size: int, size_variation: float, partition_idx: int) -> int:
"""
Calculate size for a specific partition with variation.
"""Calculate size for a specific partition with variation.

Args:
base_size: The base partition size in bytes
Expand Down Expand Up @@ -80,7 +79,6 @@ def generate(
partition_idx: int,
):
"""Generate data for a single partition with optional skew, timing and size variations."""

# Calculate actual partition size with variation
actual_partition_size = get_partition_size(base_partition_size, size_variation, partition_idx)
num_rows = actual_partition_size // ROW_SIZE
Expand Down Expand Up @@ -135,7 +133,7 @@ def generator(
)


def setup_daft(shuffle_algorithm: str = None):
def setup_daft(shuffle_algorithm: Optional[str] = None):
"""Configure Daft execution settings."""
daft.context.set_runner_ray()
daft.context.set_execution_config(shuffle_algorithm=shuffle_algorithm, pre_shuffle_merge_threshold=8 * GB)
Expand All @@ -152,7 +150,7 @@ def run_benchmark(
skew_factor: float,
timing_variation: float,
size_variation: float,
shuffle_algorithm: str = None,
shuffle_algorithm: Optional[str] = None,
) -> Dict[str, Any]:
"""Run the memory benchmark and return statistics."""
setup_daft(shuffle_algorithm)
Expand Down Expand Up @@ -249,7 +247,7 @@ def main():
print(f"Total time: {timing:.2f}s")

except Exception as e:
print(f"Error running benchmark: {str(e)}")
print(f"Error running benchmark: {e!s}")
raise


Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.6.2
rev: v0.8.2
hooks:
# Run the linter.
- id: ruff
Expand Down
27 changes: 25 additions & 2 deletions .ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ line-length = 120
target-version = "py38"

[format]
docstring-code-format = true
# Like Black, indent with spaces, rather than tabs.
indent-style = "space"
# Like Black, automatically detect the appropriate line ending.
Expand All @@ -20,15 +21,37 @@ extend-select = [
"LOG", # flake8-logging
"G", # flake8-logging-format
"I", # isort
"RUF010", # Use explicit conversion flag
"RUF013", # PEP 484 prohibits implicit Optional
"RUF015", # Prefer next({iterable}) over single element slice
"RUF017", # Avoid quadratic list summation
"RUF022", # __all__ is not sorted
"RUF032", # Decimal() called with float literal argument
"RUF034", # Useless if-else condition
"RUF041", # Unnecessary nested Literal
"RUF100", # unused-noqa"
"T10" # flake8-debugger
"T10", # flake8-debugger
"D" # pydocstyle rules
]
ignore = [
"E402" # Module level import not at top of file [TODO(sammy): We want to fix this]
"E402", # Module level import not at top of file [TODO(sammy): We want to fix this]
"D417", # requires documentation for every function parameter.
"D100", # Missing docstring in public module
"D101", # Missing docstring in public class
"D102", # Missing docstring in public method
"D103", # Missing docstring in public function
"D104", # Missing docstring in public package
"D105", # Missing docstring in magic method
"D106", # Missing docstring in public nested class
"D107" # Missing docstring in __init__
]
preview = true

[lint.per-file-ignores]
# Do not enforce usage and import order rules in init files
"__init__.py" = ["E402", "F401", "I"]
# Allow wild imports in conftest
"tests/conftest.py" = ["F405", "E402", "F403"]

[lint.pydocstyle]
convention = "google"
4 changes: 2 additions & 2 deletions benchmarking/parquet/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def daft_dataframe_read(path: str, columns: list[str] | None = None) -> pa.Table
],
)
def read_fn(request):
"""Fixture which returns the function to read a PyArrow table from a path"""
"""Fixture which returns the function to read a PyArrow table from a path."""
return request.param


Expand Down Expand Up @@ -116,5 +116,5 @@ def boto_bulk_read(paths: list[str], columns: list[str] | None = None) -> list[p
],
)
def bulk_read_fn(request):
"""Fixture which returns the function to read a PyArrow table from a path"""
"""Fixture which returns the function to read a PyArrow table from a path."""
return request.param
6 changes: 3 additions & 3 deletions benchmarking/tpch/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def run_all_benchmarks(


def generate_parquet_data(tpch_gen_folder: str, scale_factor: float, num_parts: int) -> str:
"""Generates Parquet data and returns the path to the folder
"""Generates Parquet data and returns the path to the folder.

Args:
tpch_gen_folder (str): Path to the folder containing the TPCH dbgen tool and generated data
Expand All @@ -193,7 +193,7 @@ def get_daft_version() -> str:


def get_daft_benchmark_runner_name() -> Literal["ray"] | Literal["py"] | Literal["native"]:
"""Test utility that checks the environment variable for the runner that is being used for the benchmarking"""
"""Test utility that checks the environment variable for the runner that is being used for the benchmarking."""
name = os.getenv("DAFT_RUNNER")
assert name is not None, "Tests must be run with $DAFT_RUNNER env var"
name = name.lower()
Expand All @@ -217,7 +217,7 @@ def get_ray_runtime_env(requirements: str | None) -> dict:


def warmup_environment(requirements: str | None, parquet_folder: str):
"""Performs necessary setup of Daft on the current benchmarking environment"""
"""Performs necessary setup of Daft on the current benchmarking environment."""
if get_daft_benchmark_runner_name() == "ray":
runtime_env = get_ray_runtime_env(requirements)

Expand Down
6 changes: 3 additions & 3 deletions benchmarking/tpch/data_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@


def gen_sqlite_db(csv_filepath: str, num_parts: int) -> str:
"""Generates a SQLite DB from a folder filled with generated CSVs
"""Generates a SQLite DB from a folder filled with generated CSVs.

Args:
csv_filepath (str): path to folder with generated CSVs
Expand Down Expand Up @@ -243,7 +243,7 @@ def import_table(table, table_path):


def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:
"""Generates CSV files
"""Generates CSV files.

Args:
basedir (str): path to generate files into
Expand Down Expand Up @@ -302,7 +302,7 @@ def gen_csv_files(basedir: str, num_parts: int, scale_factor: float) -> str:


def gen_parquet(csv_files_location: str) -> str:
"""Generates Parquet from generated CSV files
"""Generates Parquet from generated CSV files.

Args:
csv_files_location (str): path to folder with generated CSV files
Expand Down
4 changes: 2 additions & 2 deletions benchmarking/tpch/pipelined_data_generation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""This script provides a pipelined data generation implementation of data_generation.py
"""This script provides a pipelined data generation implementation of data_generation.py.

Note that after running this script, data will no longer be coherent/exist locally. This is used for generating large amounts
of benchmarking data that lands directly in AWS S3, but for local benchmarking/testing use-cases use data_generation.py instead.
Expand Down Expand Up @@ -52,7 +52,7 @@ def pipelined_data_generation(

if not cachedir.exists():
logger.info("Cloning tpch dbgen repo")
subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {str(cachedir)}"))
subprocess.check_output(shlex.split(f"git clone https://github.com/electrum/tpch-dbgen {cachedir!s}"))
subprocess.check_output("make", cwd=str(cachedir))

for i, part_indices in enumerate(batch(range(1, num_parts + 1), n=parallelism)):
Expand Down
5 changes: 2 additions & 3 deletions benchmarking/tpch/ray_job_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ async def wait_on_job(logs, timeout_s):


def run_on_ray(ray_address: str, job_params: dict, timeout_s: int = 1500):
"""Submits a job to run in the Ray cluster"""

"""Submits a job to run in the Ray cluster."""
print("Submitting benchmarking job to Ray cluster...")
print("Parameters:")
print(job_params)
Expand Down Expand Up @@ -57,7 +56,7 @@ def ray_job_params(
) -> dict:
return dict(
submission_id=f"tpch-q{tpch_qnum}-{str(uuid.uuid4())[:4]}",
entrypoint=f"python3 {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
entrypoint=f"python3 {entrypoint.relative_to(working_dir)!s} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
runtime_env={
"working_dir": str(working_dir),
**runtime_env,
Expand Down
6 changes: 3 additions & 3 deletions benchmarking/tpch/subprefix_s3_files.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Introduces more prefixes into TPCH data files hosted on S3.
"""Introduces more prefixes into TPCH data files hosted on S3.

This improves S3 read performance. For more details, see:
https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html
https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html.

Does this by copying existing files into subfolders, based on file prefix.
e.g. copies
Expand Down
62 changes: 31 additions & 31 deletions daft/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_build_type() -> str:


def refresh_logger() -> None:
"""Refreshes Daft's internal rust logging to the current python log level"""
"""Refreshes Daft's internal rust logging to the current python log level."""
_refresh_logger()


Expand Down Expand Up @@ -94,46 +94,46 @@ def refresh_logger() -> None:
to_struct = Expression.to_struct

__all__ = [
"from_pylist",
"from_pydict",
"from_arrow",
"from_pandas",
"from_ray_dataset",
"from_dask_dataframe",
"from_glob_path",
"read_csv",
"read_json",
"read_parquet",
"read_hudi",
"read_iceberg",
"read_deltalake",
"read_sql",
"read_lance",
"DataCatalogType",
"DataCatalogTable",
"DataCatalogType",
"DataFrame",
"Expression",
"col",
"interval",
"DataType",
"ImageMode",
"Expression",
"ImageFormat",
"lit",
"Series",
"TimeUnit",
"register_viz_hook",
"refresh_logger",
"udf",
"ImageMode",
"ResourceRequest",
"Schema",
"set_planning_config",
"set_execution_config",
"planning_config_ctx",
"Series",
"TimeUnit",
"coalesce",
"col",
"execution_config_ctx",
"from_arrow",
"from_dask_dataframe",
"from_glob_path",
"from_pandas",
"from_pydict",
"from_pylist",
"from_ray_dataset",
"interval",
"lit",
"planning_config_ctx",
"read_csv",
"read_deltalake",
"read_hudi",
"read_iceberg",
"read_json",
"read_lance",
"read_parquet",
"read_sql",
"read_table",
"refresh_logger",
"register_table",
"register_viz_hook",
"set_execution_config",
"set_planning_config",
"sql",
"sql_expr",
"to_struct",
"coalesce",
"udf",
]
Loading
Loading