-
Notifications
You must be signed in to change notification settings - Fork 178
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* init * lint * fix * lint * lint * rm print stmt * Update dataset_utils.py
- Loading branch information
1 parent
779c7b9
commit c39b68a
Showing
3 changed files
with
103 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
"""Util functions for datasets.""" | ||
|
||
import requests | ||
|
||
from prompt2model.utils.logging_utils import get_formatted_logger | ||
|
||
logger = get_formatted_logger("dataset_utils") | ||
|
||
|
||
def query(API_URL, timeout=30):
    """Return the decoded JSON body of a GET request to ``API_URL``.

    Args:
        API_URL: The URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload when the server answers with status 200.
        An empty dict when the status is anything else, when the request
        fails, or when the body cannot be decoded as JSON; the failure is
        logged in each of those cases.
    """
    try:
        response = requests.get(API_URL, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        logger.error(f"Error occurred in fetching size: {response.status_code}")
    except requests.exceptions.RequestException as e:
        logger.error("Error occurred in making the request: " + str(e))
    except ValueError as e:
        # A 200 response with a non-JSON body raises a ValueError-family
        # exception from ``response.json()``. Depending on the installed
        # requests version this may not derive from RequestException, so it
        # is handled explicitly to keep the "always return a dict" contract.
        logger.error("Error occurred in decoding the response: " + str(e))

    return {}
|
||
|
||
def get_dataset_size(dataset_name):
    """Fetch a dataset's in-memory size in MB from the Hugging Face API.

    Args:
        dataset_name: Name of the dataset, interpolated into the ``dataset``
            query parameter of the size endpoint.

    Returns:
        The ``size -> dataset -> num_bytes_memory`` value of the response,
        divided by 1024 twice and formatted as a string with two decimal
        places, or ``"NA"`` when the response carries no size information
        (for example because the request failed and ``query`` returned an
        empty dict).
    """
    API_URL = f"https://datasets-server.huggingface.co/size?dataset={dataset_name}"
    data = query(API_URL)
    size_dict = data.get("size", {})
    # Test for emptiness rather than identity: ``size_dict is {}`` compares
    # against a brand-new dict object and is therefore always False, which
    # made the "NA" fallback unreachable and raised KeyError on failed queries.
    if not size_dict:
        return "NA"
    num_bytes = size_dict["dataset"]["num_bytes_memory"]
    return "{:.2f}".format(num_bytes / 1024 / 1024)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
"""Testing dataset utility functions.""" | ||
from unittest.mock import patch | ||
|
||
from prompt2model.utils import dataset_utils | ||
|
||
|
||
@patch("prompt2model.utils.dataset_utils.query")
def test_get_dataset_size(mock_request):
    """Check that get_dataset_size formats the mocked size payload in MB."""
    # Dataset-level totals; only "num_bytes_memory" is read by the function
    # under test, the remaining fields fill out the canned payload.
    dataset_totals = {
        "dataset": "rotten_tomatoes",
        "num_bytes_original_files": 487770,
        "num_bytes_parquet_files": 881052,
        "num_bytes_memory": 1345449,
        "num_rows": 10662,
    }
    config_totals = {
        "dataset": "rotten_tomatoes",
        "config": "default",
        "num_bytes_original_files": 487770,
        "num_bytes_parquet_files": 881052,
        "num_bytes_memory": 1345449,
        "num_rows": 10662,
        "num_columns": 2,
    }
    # One entry per split, in the order train / validation / test.
    split_totals = [
        {
            "dataset": "rotten_tomatoes",
            "config": "default",
            "split": split_name,
            "num_bytes_parquet_files": parquet_bytes,
            "num_bytes_memory": memory_bytes,
            "num_rows": row_count,
            "num_columns": 2,
        }
        for split_name, parquet_bytes, memory_bytes, row_count in (
            ("train", 698845, 1074806, 8530),
            ("validation", 90001, 134675, 1066),
            ("test", 92206, 135968, 1066),
        )
    ]
    mock_request.return_value = {
        "size": {
            "dataset": dataset_totals,
            "configs": [config_totals],
            "splits": split_totals,
        },
        "pending": [],
        "failed": [],
        "partial": False,
    }

    # 1345449 bytes / 1024 / 1024 rounds to 1.28 at two decimal places.
    reported_size = dataset_utils.get_dataset_size("rotten_tomatoes")
    assert reported_size == "1.28"