From 06ae3f678651bfbb3ca7dd3274ee2f38e0e0237e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?=
Date: Tue, 14 Feb 2023 17:18:37 +0100
Subject: [PATCH] Format code with `ruff` (#5519)

* Update config files

* Format code

* Some manual fixes

* Fix
---
 .github/workflows/ci.yml                    |  3 +--
 .gitignore                                  |  5 ++++-
 CONTRIBUTING.md                             |  2 +-
 Makefile                                    |  9 +++++----
 benchmarks/format.py                        |  1 -
 metrics/bleurt/bleurt.py                    |  2 --
 metrics/code_eval/execute.py                |  2 --
 metrics/comet/comet.py                      |  1 -
 metrics/coval/coval.py                      |  3 +--
 metrics/exact_match/exact_match.py          |  1 -
 metrics/indic_glue/indic_glue.py            |  1 -
 metrics/mae/mae.py                          |  1 -
 metrics/mahalanobis/mahalanobis.py          |  1 -
 metrics/mauve/mauve.py                      | 10 +++++-----
 metrics/mse/mse.py                          |  1 -
 metrics/perplexity/perplexity.py            |  1 -
 metrics/rouge/rouge.py                      |  8 ++++----
 metrics/sari/sari.py                        |  2 --
 metrics/super_glue/super_glue.py            |  2 +-
 metrics/wiki_split/wiki_split.py            |  2 --
 metrics/xtreme_s/xtreme_s.py                |  1 -
 pyproject.toml                              | 12 ++++++++++++
 setup.cfg                                   | 18 ------------------
 setup.py                                    |  2 +-
 src/datasets/commands/dummy_data.py         |  1 -
 src/datasets/features/features.py           |  4 +---
 src/datasets/fingerprint.py                 |  2 --
 src/datasets/formatting/np_formatter.py     |  1 -
 src/datasets/io/csv.py                      |  1 -
 src/datasets/io/sql.py                      |  1 -
 src/datasets/naming.py                      |  1 -
 .../folder_based_builder.py                 |  2 +-
 src/datasets/packaged_modules/json/json.py  |  1 -
 src/datasets/utils/file_utils.py            |  2 --
 src/datasets/utils/filelock.py              |  2 +-
 src/datasets/utils/logging.py               | 18 ++++++++++--------
 src/datasets/utils/metadata.py              |  1 -
 src/datasets/utils/py_utils.py              |  5 +----
 src/datasets/utils/readme.py                |  1 -
 tests/features/test_audio.py                |  1 -
 tests/features/test_features.py             |  3 +--
 tests/test_arrow_dataset.py                 |  4 ----
 tests/test_builder.py                       |  1 -
 tests/test_filesystem.py                    |  2 --
 tests/test_fingerprint.py                   |  3 ---
 tests/test_hf_gcp.py                        |  1 -
 tests/test_iterable_dataset.py              |  1 -
 tests/test_metadata_util.py                 |  1 -
 tests/test_patching.py                      |  1 -
 tests/test_py_utils.py                      |  1 -
 tests/utils.py                              |  4 ++--
 51 files changed, 52 insertions(+), 105 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index abb6966504a..9cec7131cd8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,8 +28,7 @@ jobs:
       - name: Check quality
         run: |
           black --check tests src benchmarks metrics
-          isort --check-only tests src benchmarks metrics
-          flake8 tests src benchmarks metrics
+          ruff tests src benchmarks metrics
 
   test:
     needs: check_code_quality
diff --git a/.gitignore b/.gitignore
index b4b4d526dd9..02f80e2f0cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,4 +61,7 @@ docs/source/_build/
 
 # Benchmark results
 report.json
-report.md
\ No newline at end of file
+report.md
+
+# Ruff
+.ruff_cache
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1115c5f2c2f..0d61bb0f697 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -57,7 +57,7 @@ If you want to add a dataset see specific instructions in the section [*How to a
 
 5. Develop the features on your branch.
 
-6. Format your code. Run black and isort so that your newly added files look nice with the following command:
+6. Format your code. Run black and ruff so that your newly added files look nice with the following command:
 
     ```bash
     make style
diff --git a/Makefile b/Makefile
index bfa3dbbbc36..179f23f5d45 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,18 @@
 .PHONY: quality style test
 
+check_dirs := tests src benchmarks metrics
+
 # Check that source code meets quality standards
 
 quality:
-	black --check tests src benchmarks metrics
-	isort --check-only tests src benchmarks metrics
-	flake8 tests src benchmarks metrics
+	black --check $(check_dirs)
+	ruff $(check_dirs)
 
 # Format source code automatically
 
 style:
 	black tests src benchmarks metrics
-	isort tests src benchmarks metrics
+	ruff $(check_dirs) --fix
 
 # Run tests for the library
 
diff --git a/benchmarks/format.py b/benchmarks/format.py
index 09096551113..f37043bfb73 100644
--- a/benchmarks/format.py
+++ b/benchmarks/format.py
@@ -9,7 +9,6 @@ def format_json_to_md(input_json_file, output_md_file):
     output_md = ["<details>", "<summary>Show updated benchmarks!</summary>", " "]
", "Show updated benchmarks!", " "] for benchmark_name in sorted(results): - benchmark_res = results[benchmark_name] benchmark_file_name = benchmark_name.split("/")[-1] diff --git a/metrics/bleurt/bleurt.py b/metrics/bleurt/bleurt.py index 3a791a5cf1d..7df26379daa 100644 --- a/metrics/bleurt/bleurt.py +++ b/metrics/bleurt/bleurt.py @@ -78,7 +78,6 @@ @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class BLEURT(datasets.Metric): def _info(self): - return datasets.MetricInfo( description=_DESCRIPTION, citation=_CITATION, @@ -95,7 +94,6 @@ def _info(self): ) def _download_and_prepare(self, dl_manager): - # check that config name specifies a valid BLEURT model if self.config_name == "default": logger.warning( diff --git a/metrics/code_eval/execute.py b/metrics/code_eval/execute.py index 53517a805cf..99acdff7d68 100644 --- a/metrics/code_eval/execute.py +++ b/metrics/code_eval/execute.py @@ -54,9 +54,7 @@ def check_correctness(check_program, timeout, task_id, completion_id): def unsafe_execute(check_program, result, timeout): - with create_tempdir(): - # These system calls are needed when cleaning up tempdir. import os import shutil diff --git a/metrics/comet/comet.py b/metrics/comet/comet.py index 585bb309102..7be09cae87d 100644 --- a/metrics/comet/comet.py +++ b/metrics/comet/comet.py @@ -108,7 +108,6 @@ @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class COMET(datasets.Metric): def _info(self): - return datasets.MetricInfo( description=_DESCRIPTION, citation=_CITATION, diff --git a/metrics/coval/coval.py b/metrics/coval/coval.py index 9db06daea2c..297be086ff5 100644 --- a/metrics/coval/coval.py +++ b/metrics/coval/coval.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ CoVal metric. 
""" -import coval # From: git+https://github.com/ns-moosavi/coval.git noqa: F401 +import coval # From: git+https://github.com/ns-moosavi/coval.git # noqa: F401 from coval.conll import reader, util from coval.eval import evaluator @@ -167,7 +167,6 @@ def get_coref_infos( key_lines, sys_lines, NP_only=False, remove_nested=False, keep_singletons=True, min_span=False, doc="dummy_doc" ): - key_doc_lines = {doc: key_lines} sys_doc_lines = {doc: sys_lines} diff --git a/metrics/exact_match/exact_match.py b/metrics/exact_match/exact_match.py index 00f8619d141..5c770bc7f0d 100644 --- a/metrics/exact_match/exact_match.py +++ b/metrics/exact_match/exact_match.py @@ -108,7 +108,6 @@ def _compute( ignore_punctuation=False, ignore_numbers=False, ): - if regexes_to_ignore is not None: for s in regexes_to_ignore: predictions = np.array([re.sub(s, "", x) for x in predictions]) diff --git a/metrics/indic_glue/indic_glue.py b/metrics/indic_glue/indic_glue.py index 0cbf3aaf548..88a1e5dcf7e 100644 --- a/metrics/indic_glue/indic_glue.py +++ b/metrics/indic_glue/indic_glue.py @@ -15,7 +15,6 @@ import numpy as np from scipy.spatial.distance import cdist -from scipy.stats import pearsonr, spearmanr from sklearn.metrics import f1_score import datasets diff --git a/metrics/mae/mae.py b/metrics/mae/mae.py index aa51cd0241f..0c25c69cd1c 100644 --- a/metrics/mae/mae.py +++ b/metrics/mae/mae.py @@ -106,7 +106,6 @@ def _get_feature_types(self): } def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"): - mae_score = mean_absolute_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput) return {"mae": mae_score} diff --git a/metrics/mahalanobis/mahalanobis.py b/metrics/mahalanobis/mahalanobis.py index 3cb4309b6be..b5d58ab7f9c 100644 --- a/metrics/mahalanobis/mahalanobis.py +++ b/metrics/mahalanobis/mahalanobis.py @@ -71,7 +71,6 @@ def _info(self): ) def _compute(self, X, reference_distribution): - # convert to numpy arrays X = np.array(X) reference_distribution = np.array(reference_distribution) diff --git a/metrics/mauve/mauve.py b/metrics/mauve/mauve.py index c281fd880ee..1b18204a5a0 100644 --- a/metrics/mauve/mauve.py +++ b/metrics/mauve/mauve.py @@ -14,11 +14,11 @@ # limitations under the License. """ MAUVE metric from https://github.com/krishnap25/mauve. 
""" -import faiss # Here to have a nice missing dependency error message early on -import numpy # Here to have a nice missing dependency error message early on -import requests # Here to have a nice missing dependency error message early on -import sklearn # Here to have a nice missing dependency error message early on -import tqdm # Here to have a nice missing dependency error message early on +import faiss # noqa: F401 # Here to have a nice missing dependency error message early on +import numpy # noqa: F401 # Here to have a nice missing dependency error message early on +import requests # noqa: F401 # Here to have a nice missing dependency error message early on +import sklearn # noqa: F401 # Here to have a nice missing dependency error message early on +import tqdm # noqa: F401 # Here to have a nice missing dependency error message early on from mauve import compute_mauve # From: mauve-text import datasets diff --git a/metrics/mse/mse.py b/metrics/mse/mse.py index b395c4658fe..d1b8d002b03 100644 --- a/metrics/mse/mse.py +++ b/metrics/mse/mse.py @@ -110,7 +110,6 @@ def _get_feature_types(self): } def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True): - mse = mean_squared_error( references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared ) diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py index 9197b27b6c0..5fb8184983b 100644 --- a/metrics/perplexity/perplexity.py +++ b/metrics/perplexity/perplexity.py @@ -101,7 +101,6 @@ def _info(self): ) def _compute(self, input_texts, model_id, batch_size: int = 16, add_start_token: bool = True, device=None): - if device is not None: assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu." if device == "gpu": diff --git a/metrics/rouge/rouge.py b/metrics/rouge/rouge.py index df7a5e40dea..d26e3d38a2f 100644 --- a/metrics/rouge/rouge.py +++ b/metrics/rouge/rouge.py @@ -14,10 +14,10 @@ """ ROUGE metric from Google Research github repo. """ # The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt -import absl # Here to have a nice missing dependency error message early on -import nltk # Here to have a nice missing dependency error message early on -import numpy # Here to have a nice missing dependency error message early on -import six # Here to have a nice missing dependency error message early on +import absl # noqa: F401 # Here to have a nice missing dependency error message early on +import nltk # noqa: F401 # Here to have a nice missing dependency error message early on +import numpy # noqa: F401 # Here to have a nice missing dependency error message early on +import six # noqa: F401 # Here to have a nice missing dependency error message early on from rouge_score import rouge_scorer, scoring import datasets diff --git a/metrics/sari/sari.py b/metrics/sari/sari.py index d20aa080614..c53b1c60680 100644 --- a/metrics/sari/sari.py +++ b/metrics/sari/sari.py @@ -227,7 +227,6 @@ def SARIsent(ssent, csent, rsents): def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True): - # Normalization is requried for the ASSET dataset (one of the primary # datasets in sentence simplification) to allow using space # to split the sentence. 
Even though Wiki-Auto and TURK datasets, @@ -278,7 +277,6 @@ def _info(self): ) def _compute(self, sources, predictions, references): - if not (len(sources) == len(predictions) == len(references)): raise ValueError("Sources length must match predictions and references lengths.") sari_score = 0 diff --git a/metrics/super_glue/super_glue.py b/metrics/super_glue/super_glue.py index 5be291bf03a..aeb6eb56ca3 100644 --- a/metrics/super_glue/super_glue.py +++ b/metrics/super_glue/super_glue.py @@ -135,7 +135,7 @@ def evaluate_multirc(ids_preds, labels): question_preds, question_labels = zip(*preds_labels) f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro") f1s.append(f1) - em = int(sum(p == l for p, l in preds_labels) == len(preds_labels)) + em = int(sum(pred == label for pred, label in preds_labels) == len(preds_labels)) ems.append(em) f1_m = float(sum(f1s) / len(f1s)) em = sum(ems) / len(ems) diff --git a/metrics/wiki_split/wiki_split.py b/metrics/wiki_split/wiki_split.py index f985d4d1f18..fea0459e914 100644 --- a/metrics/wiki_split/wiki_split.py +++ b/metrics/wiki_split/wiki_split.py @@ -254,7 +254,6 @@ def SARIsent(ssent, csent, rsents): def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True): - # Normalization is requried for the ASSET dataset (one of the primary # datasets in sentence simplification) to allow using space # to split the sentence. Even though Wiki-Auto and TURK datasets, @@ -284,7 +283,6 @@ def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_s def compute_sari(sources, predictions, references): - if not (len(sources) == len(predictions) == len(references)): raise ValueError("Sources length must match predictions and references lengths.") sari_score = 0 diff --git a/metrics/xtreme_s/xtreme_s.py b/metrics/xtreme_s/xtreme_s.py index 74083b1166b..9e88bfabe6b 100644 --- a/metrics/xtreme_s/xtreme_s.py +++ b/metrics/xtreme_s/xtreme_s.py @@ -238,7 +238,6 @@ def _info(self): ) def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None): - bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {} wer_kwargs = wer_kwargs if wer_kwargs is not None else {} diff --git a/pyproject.toml b/pyproject.toml index eba4a693128..26608ab3fd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,15 @@ [tool.black] line-length = 119 target_version = ['py37'] + +[tool.ruff] +# Ignored rules: +# "E501" -> line length violation +# "F821" -> undefined named in type annotation (e.g. 
Literal["something"]) +ignore = ["E501", "F821"] +select = ["E", "F", "I", "W"] +line-length = 119 + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["datasets"] diff --git a/setup.cfg b/setup.cfg index 126ba4e5393..762c66b0dfa 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,24 +1,6 @@ [metadata] license_file = LICENSE -[isort] -ensure_newline_before_comments = True -force_grid_wrap = 0 -include_trailing_comma = True -line_length = 119 -lines_after_imports = 2 -multi_line_output = 3 -use_parentheses = True - -[flake8] -ignore = E203, E501, W503 -max-line-length = 119 -exclude = - src/datasets/datasets - src/datasets/metrics -per-file-ignores = - metrics/*:F401 - [tool:pytest] markers = unit: unit test diff --git a/setup.py b/setup.py index 64572064014..7ec385e8179 100644 --- a/setup.py +++ b/setup.py @@ -211,7 +211,7 @@ TESTS_REQUIRE.extend(VISION_REQUIRE) TESTS_REQUIRE.extend(AUDIO_REQUIRE) -QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"] +QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241", "pyyaml>=5.3.1"] DOCS_REQUIRE = [ # Might need to add doc-builder and some specific deps in the future diff --git a/src/datasets/commands/dummy_data.py b/src/datasets/commands/dummy_data.py index a2c8ef517f8..c4321696e67 100644 --- a/src/datasets/commands/dummy_data.py +++ b/src/datasets/commands/dummy_data.py @@ -394,7 +394,6 @@ def _print_dummy_data_instructions(self, dataset_builder, mock_dl_manager): try: generator_splits = dataset_builder._split_generators(mock_dl_manager) except FileNotFoundError as e: - print( f"Dataset {self._dataset_name} with config {mock_dl_manager.config} seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file {e.filename}." 
) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 78ca5ab85fd..97a58215543 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -23,9 +23,8 @@ from dataclasses import InitVar, dataclass, field, fields from functools import reduce, wraps from operator import mul -from typing import Any, ClassVar, Dict, List, Optional +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union from typing import Sequence as Sequence_ -from typing import Tuple, Union import numpy as np import pandas as pd @@ -1763,7 +1762,6 @@ def unsimplify(feature: dict) -> dict: return feature def from_yaml_inner(obj: Union[dict, list]) -> Union[dict, list]: - if isinstance(obj, dict): if not obj: return {} diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index e02c6df52e0..92488293504 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -184,7 +184,6 @@ def get_temporary_cache_files_directory() -> str: """Return a directory that is deleted when session closes.""" global _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: - # Avoids a PermissionError on Windows caused by the datasets referencing # the files from the cache directory on clean-up def cleanup_func(): @@ -466,7 +465,6 @@ def fingerprint_transform( fingerprint_names = fingerprint_names if fingerprint_names is not None else ["new_fingerprint"] def _fingerprint(func): - if not inplace and not all(name in func.__code__.co_varnames for name in fingerprint_names): raise ValueError("function {func} is missing parameters {fingerprint_names} in signature") diff --git a/src/datasets/formatting/np_formatter.py b/src/datasets/formatting/np_formatter.py index 8d83299c127..45bd76ad7a1 100644 --- a/src/datasets/formatting/np_formatter.py +++ b/src/datasets/formatting/np_formatter.py @@ -44,7 +44,6 @@ def _consolidate(self, column): return column def _tensorize(self, value): - if isinstance(value, (str, bytes, type(None))): return value elif isinstance(value, (np.character, np.ndarray)) and np.issubdtype(value.dtype, np.character): diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py index a7d6628129a..111b4ff3e00 100644 --- a/src/datasets/io/csv.py +++ b/src/datasets/io/csv.py @@ -74,7 +74,6 @@ def __init__( num_proc: Optional[int] = None, **to_csv_kwargs, ): - if num_proc is not None and num_proc <= 0: raise ValueError(f"num_proc {num_proc} must be an integer > 0.") diff --git a/src/datasets/io/sql.py b/src/datasets/io/sql.py index 35eb33f550b..a236f997507 100644 --- a/src/datasets/io/sql.py +++ b/src/datasets/io/sql.py @@ -64,7 +64,6 @@ def __init__( num_proc: Optional[int] = None, **to_sql_kwargs, ): - if num_proc is not None and num_proc <= 0: raise ValueError(f"num_proc {num_proc} must be an integer > 0.") diff --git a/src/datasets/naming.py b/src/datasets/naming.py index 2bfd8d82694..182673b53f5 100644 --- a/src/datasets/naming.py +++ b/src/datasets/naming.py @@ -67,7 +67,6 @@ def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None): - prefix = filename_prefix_for_split(dataset_name, split) prefix = os.path.join(path, prefix) diff --git a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py index 6ecf9ebb32b..ef067148ffa 100644 --- 
a/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py +++ b/src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py @@ -133,7 +133,7 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split): if metadata_files: # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False - add_metadata = not (self.config.drop_metadata is True) + add_metadata = not self.config.drop_metadata # if `metadata_files` are found, add labels only if # `drop_labels` is set up to False explicitly (not-default behavior) add_labels = self.config.drop_labels is False diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index fba42671c3f..6718f7694d2 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -93,7 +93,6 @@ def _cast_table(self, pa_table: pa.Table) -> pa.Table: def _generate_tables(self, files): for file_idx, file in enumerate(itertools.chain.from_iterable(files)): - # If the file is one json object and if we need to look at the list of items in one specific field if self.config.field is not None: with open(file, encoding="utf-8") as f: diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index b6099e3e527..48242e8a4a1 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -100,7 +100,6 @@ def head_hf_s3( def hf_github_url(path: str, name: str, dataset=True, revision: Optional[str] = None) -> str: - default_revision = "main" if version.parse(__version__).is_devrelease else __version__ revision = revision or default_revision if dataset: @@ -547,7 +546,6 @@ def get_from_cache( # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): - if resume_download: incomplete_path = cache_path + ".incomplete" diff --git a/src/datasets/utils/filelock.py b/src/datasets/utils/filelock.py index 2a6a44c58de..e9087ceecb7 100644 --- a/src/datasets/utils/filelock.py +++ b/src/datasets/utils/filelock.py @@ -106,6 +106,7 @@ def __str__(self): # Classes # ------------------------------------------------ + # This is a helper class which is returned by :meth:`BaseFileLock.acquire` # and wraps the lock to make sure __enter__ is not called twice when entering # the with statement. @@ -301,7 +302,6 @@ def release(self, force=False): every case. 
""" with self._thread_lock: - if self.is_locked: self._lock_counter -= 1 diff --git a/src/datasets/utils/logging.py b/src/datasets/utils/logging.py index 5ab3961b70c..d6ee692a5f5 100644 --- a/src/datasets/utils/logging.py +++ b/src/datasets/utils/logging.py @@ -15,14 +15,16 @@ import logging import os -from logging import CRITICAL # NOQA -from logging import DEBUG # NOQA -from logging import ERROR # NOQA -from logging import FATAL # NOQA -from logging import INFO # NOQA -from logging import NOTSET # NOQA -from logging import WARN # NOQA -from logging import WARNING # NOQA +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) from typing import Optional from tqdm import auto as tqdm_lib diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index f8d90cee8fb..a309cd57caa 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -31,7 +31,6 @@ def _split_yaml_from_readme(readme_content: str) -> Tuple[Optional[str], str]: class DatasetMetadata(dict): - # class attributes _FIELDS_WITH_DASHES = {"train_eval_index"} # train-eval-index in the YAML metadata diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py index ec2fa75b4c7..3388a7b6b46 100644 --- a/src/datasets/utils/py_utils.py +++ b/src/datasets/utils/py_utils.py @@ -81,7 +81,7 @@ def size_str(size_in_bytes): _NAME_LIST = [("PiB", 2**50), ("TiB", 2**40), ("GiB", 2**30), ("MiB", 2**20), ("KiB", 2**10)] size_in_bytes = float(size_in_bytes) - for (name, size_bytes) in _NAME_LIST: + for name, size_bytes in _NAME_LIST: value = size_in_bytes / size_bytes if value >= 1.0: return f"{value:.2f} {name}" @@ -816,7 +816,6 @@ def _save_code(pickler, obj): return elif config.DILL_VERSION.release[:3] == version.parse("0.3.6").release: - # From: https://github.com/uqfoundation/dill/blob/dill-0.3.6/dill/_dill.py#L1104 @pklregister(CodeType) def save_code(pickler, obj): @@ -1041,7 +1040,6 @@ def save_function(pickler, obj): return elif config.DILL_VERSION.release[:3] == version.parse("0.3.5").release: # 0.3.5, 0.3.5.1 - # https://github.com/uqfoundation/dill/blob/dill-0.3.5.1/dill/_dill.py @pklregister(FunctionType) def save_function(pickler, obj): @@ -1185,7 +1183,6 @@ def save_function(pickler, obj): return elif config.DILL_VERSION.release[:3] == version.parse("0.3.6").release: - # From: https://github.com/uqfoundation/dill/blob/dill-0.3.6/dill/_dill.py#L1739 @pklregister(FunctionType) def save_function(pickler, obj): diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 3bbef2e246d..055b87a8028 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -253,7 +253,6 @@ def _validate(self, readme_structure): # If one exactly start_key = list(self.content.keys())[0] # Get the key if start_key.startswith("Dataset Card for"): # Check correct start - # If the starting is correct, validate all the sections _, sec_error_list, sec_warning_list = self.content[start_key].validate( readme_structure["subsections"][0] diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index d733e7e442d..23689ef6189 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -151,7 +151,6 @@ def test_audio_decode_example_mp3_torchaudio_latest(shared_datadir, torchaudio_f audio = Audio() with patch("torchaudio.load") if torchaudio_failed else nullcontext() as load_mock: - if torchaudio_failed: load_mock.side_effect = 
RuntimeError() diff --git a/tests/features/test_features.py b/tests/features/test_features.py index c08b37c046a..83c13931eff 100644 --- a/tests/features/test_features.py +++ b/tests/features/test_features.py @@ -445,7 +445,6 @@ def iternumpy(key1, value1, value2): def dict_diff(d1: dict, d2: dict): # check if 2 dictionaries are equal - np.testing.assert_equal(d1, d2) # sanity check if dict values are equal or not for (k1, v1), (k2, v2) in zip(d1.items(), d2.items()): # check if their values have same dtype or not @@ -454,7 +453,7 @@ def dict_diff(d1: dict, d2: dict): # check if 2 dictionaries are equal elif isinstance(v1, np.ndarray): # checks if dtype and value of np.ndarray is equal iternumpy(k1, v1, v2) elif isinstance(v1, list): - for (element1, element2) in zip(v1, v2): # iterates over all elements of list + for element1, element2 in zip(v1, v2): # iterates over all elements of list if isinstance(element1, dict): dict_diff(element1, element2) elif isinstance(element1, np.ndarray): diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 90c1c5f3cb7..61c143b3346 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -161,7 +161,6 @@ def _to(self, in_memory, tmp_dir, *datasets): def test_dummy_dataset(self, in_memory): with tempfile.TemporaryDirectory() as tmp_dir: - with self._create_dummy_dataset(in_memory, tmp_dir) as dset: self.assertDictEqual(dset.features, Features({"filename": Value("string")})) self.assertEqual(dset[0]["filename"], "my_name-train_0") @@ -347,7 +346,6 @@ def test_dummy_dataset_serialize(self, in_memory): def test_dummy_dataset_load_from_disk(self, in_memory): with tempfile.TemporaryDirectory() as tmp_dir: - with self._create_dummy_dataset(in_memory, tmp_dir).select(range(10)) as dset: dataset_path = os.path.join(tmp_dir, "my_dataset") dset.save_to_disk(dataset_path) @@ -360,7 +358,6 @@ def test_dummy_dataset_load_from_disk(self, in_memory): def test_restore_saved_format(self, in_memory): with tempfile.TemporaryDirectory() as tmp_dir: - with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset: dset.set_format(type="numpy", columns=["col_1"], output_all_columns=True) dataset_path = os.path.join(tmp_dir, "my_dataset") @@ -1531,7 +1528,6 @@ def __call__(self, example): with tempfile.TemporaryDirectory() as tmp_dir: with self._create_dummy_dataset(in_memory, tmp_dir) as dset: - ex_cnt = ExampleCounter() dset.map(ex_cnt) self.assertEqual(ex_cnt.cnt, len(dset)) diff --git a/tests/test_builder.py b/tests/test_builder.py index 37f0a3e83c5..84580a1167d 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -784,7 +784,6 @@ def test_cache_dir_for_config_kwargs(self): def test_config_names(self): with tempfile.TemporaryDirectory() as tmp_dir: - with self.assertRaises(ValueError) as error_context: DummyBuilderWithMultipleConfigs(cache_dir=tmp_dir, data_files=None, data_dir=None) self.assertIn("Please pick one among the available configs", str(error_context.exception)) diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py index dbea4f8f64e..e99216a8394 100644 --- a/tests/test_filesystem.py +++ b/tests/test_filesystem.py @@ -10,7 +10,6 @@ def test_extract_path_from_uri(): - mock_bucket = "mock-s3-bucket" dataset_path = f"s3://{mock_bucket}" dataset_path = extract_path_from_uri(dataset_path) @@ -22,7 +21,6 @@ def test_extract_path_from_uri(): def test_is_remote_filesystem(mockfs): - is_remote = is_remote_filesystem(mockfs) assert is_remote is True diff --git a/tests/test_fingerprint.py 
b/tests/test_fingerprint.py index 853e5d205bf..339badb8d3b 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -132,7 +132,6 @@ def func(): self.assertEqual(hash1, hash2) def test_recurse_dump_for_class(self): - hash1 = md5(datasets.utils.py_utils.dumps(Foo([0]))).hexdigest() hash2 = md5(datasets.utils.py_utils.dumps(Foo([1]))).hexdigest() hash3 = md5(datasets.utils.py_utils.dumps(Foo([0]))).hexdigest() @@ -140,7 +139,6 @@ def test_recurse_dump_for_class(self): self.assertNotEqual(hash1, hash2) def test_recurse_dump_for_method(self): - hash1 = md5(datasets.utils.py_utils.dumps(Foo([0]).__call__)).hexdigest() hash2 = md5(datasets.utils.py_utils.dumps(Foo([1]).__call__)).hexdigest() hash3 = md5(datasets.utils.py_utils.dumps(Foo([0]).__call__)).hexdigest() @@ -148,7 +146,6 @@ def test_recurse_dump_for_method(self): self.assertNotEqual(hash1, hash2) def test_dump_ipython_function(self): - code_args_py37 = ( "co_argcount", "co_kwonlyargcount", diff --git a/tests/test_hf_gcp.py b/tests/test_hf_gcp.py index c29ca19a2fe..aa0254f3fe0 100644 --- a/tests/test_hf_gcp.py +++ b/tests/test_hf_gcp.py @@ -51,7 +51,6 @@ class TestDatasetOnHfGcp(TestCase): config_name = None def test_dataset_info_available(self, dataset, config_name): - with TemporaryDirectory() as tmp_dir: dataset_module = dataset_module_factory(dataset, cache_dir=tmp_dir) diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 6b5d70ae16c..11bdd28682b 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -647,7 +647,6 @@ def gen(shard_names): @require_torch def test_iterable_dataset_torch_integration(): - ex_iterable = ExamplesIterable(generate_examples_fn, {}) dataset = IterableDataset(ex_iterable) import torch.utils.data diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index 7222a963ce3..a18378a6889 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -60,7 +60,6 @@ def test_metadata_dict_from_readme(self): self.assertEqual(metadata_dict, {}) def test_from_yaml_string(self): - valid_yaml_string = _dedent( """\ annotations_creators: diff --git a/tests/test_patching.py b/tests/test_patching.py index a8adf42de6c..42c592648f8 100644 --- a/tests/test_patching.py +++ b/tests/test_patching.py @@ -20,7 +20,6 @@ def test_patch_submodule(): mock = "__test_patch_submodule_mock__" with patch_submodule(_test_patching, "os.path.join", mock): - # Every way to access os.path.join must be patched, and the rest must stay untouched # check os.path.join diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index 57091b22bfd..7d44c888828 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -251,7 +251,6 @@ def _2seconds_generator_of_2items_with_timing(content): def test_iflatmap_unordered(): - with Pool(2) as pool: out = list(iflatmap_unordered(pool, _split_text, kwargs_iterable=[{"text": "hello there"}] * 10)) assert out.count("hello") == 10 diff --git a/tests/utils.py b/tests/utils.py index 7168c7f7619..644efcef875 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -476,8 +476,8 @@ def tee(line, sink, pipe, label=""): # XXX: the timeout doesn't seem to make any difference here await asyncio.wait( [ - _read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:")), - _read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:")), + _read_stream(p.stdout, lambda line: tee(line, out, sys.stdout, label="stdout:")), + _read_stream(p.stderr, lambda line: tee(line, err, sys.stderr, 
label="stderr:")), ], timeout=timeout, )