Skip to content

Commit

Permalink
Merge pull request #158 from crowdcent/feature/43-data
Browse files Browse the repository at this point in the history
v4.3+ data support for NumeraiClassicDownloader
  • Loading branch information
CarloLepelaars authored Jan 2, 2024
2 parents 7add4e9 + cfa30a0 commit 0fdcf47
Show file tree
Hide file tree
Showing 6 changed files with 451 additions and 514 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,5 +146,9 @@ sidebar.yml
# Test files
edu_nbs/*.h5
prod_requirements.txt
test_numclassic_general_*
some_path_*
test_kaggle_*
test_eod_*

examples/data
10 changes: 5 additions & 5 deletions docs/download.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@

## Numerai Classic

For `NumeraiClassicDownloader` versions `4`, `4.1` (sunshine) and `4.2` (rain) are supported. You can easily download the data with a few lines.
`NumeraiClassicDownloader` simplifies downloading of datasets from Numerai's API. It allows you to easily download data with a few lines and the data is automatically organized in directories.

NOTE: For the `4.2` data only an int8 version is available.
NOTE: Only int8 versions are available in this downloader. From v4.2 onwards, Numerai only provides int8 versions of the data.

```py
from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder")

# Training and validation data
dl.download_training_data("train_val", version="4.2", int8=True)
dl.download_training_data("train_val", version="4.3")

# Inference data (live)
dl.download_inference_data("live", version="4.2", int8=True)
dl.download_inference_data("live", version="4.3")
```

Besides these common use cases you can also get feature sets and meta model predictions with `NumeraiClassicDownloader`.
Expand All @@ -25,7 +25,7 @@ from numerblox.download import NumeraiClassicDownloader

dl = NumeraiClassicDownloader(directory_path="my_numerai_data_folder")

# Get feature sets (v4.2)
# Get feature sets (v4.3)
features = dl.get_classic_features()

# Get meta model predictions
Expand Down
163 changes: 45 additions & 118 deletions numerblox/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,101 +168,36 @@ class NumeraiClassicDownloader(BaseDownloader):
:param directory_path: Base folder to download files to. \n
All *args, **kwargs will be passed to NumerAPI initialization.
"""
TRAIN_DATASET_NAME = "train_int8.parquet"
VALIDATION_DATASET_NAME = "validation_int8.parquet"
LIVE_DATASET_NAME = "live_int8.parquet"
LIVE_EXAMPLE_PREDS_NAME = "live_example_preds.parquet"
VALIDATION_EXAMPLE_PREDS_NAME = "validation_example_preds.parquet"

def __init__(self, directory_path: str, *args, **kwargs):
super().__init__(directory_path=directory_path)
self.napi = NumerAPI(*args, **kwargs)
self.current_round = self.napi.get_current_round()
# NumerAPI filenames corresponding to version, class and data type
self.version_mapping = {"4": {
"train": {
"int8": [
"v4/train_int8.parquet",
"v4/validation_int8.parquet"
],
"float": [
"v4/train.parquet",
"v4/validation.parquet"
]
},
"inference": {
"int8": ["v4/live_int8.parquet"],
"float": ["v4/live.parquet"]
},
"live": {
"int8": ["v4/live_int8.parquet"],
"float": ["v4/live.parquet"]
},
"example": [
"v4/live_example_preds.parquet",
"v4/validation_example_preds.parquet"
]
},
"4.1": {
"train": {
"int8": [
"v4.1/train_int8.parquet",
"v4.1/validation_int8.parquet"
],
"float": [
"v4.1/train.parquet",
"v4.1/validation.parquet"
]
},
"inference": {
"int8": ["v4.1/live_int8.parquet"],
"float": ["v4.1/live.parquet"]
},
"live": {
"int8": ["v4.1/live_int8.parquet"],
"float": ["v4.1/live.parquet"]
},
"example": [
"v4.1/live_example_preds.parquet",
"v4.1/validation_example_preds.parquet"
],
},
"4.2": {
"train": {
"int8": [
"v4.2/train_int8.parquet",
"v4.2/validation_int8.parquet"
],
},
"inference": {
"int8": ["v4.2/live_int8.parquet"],
"float": ["v4.2/live.parquet"]
},
"live": {
"int8": ["v4.2/live_int8.parquet"],
"float": ["v4.2/live.parquet"]
},
"example": [
"v4.2/live_example_preds.parquet",
"v4.2/validation_example_preds.parquet"
],
}
}
# Get all available versions available for Numerai.
self.dataset_versions = set(s.split("/")[0] for s in NumerAPI().list_datasets())
self.dataset_versions.discard("signals")

def download_training_data(
self, subfolder: str = "", version: str = "4.2", int8: bool = True
self, subfolder: str = "", version: str = "4.3"
):
"""
Get Numerai classic training and validation data.
:param subfolder: Specify folder to create folder within base directory root.
Saves in base directory root by default.
:param version: Numerai dataset version.
4 = April 2022 dataset
4.1 = Sunshine dataset
4.2 (default) = Rain Dataset
NOTE: 4.2 is only available as int8 version so explicitly pass
int8 = True if you want to download 4.2 data.
:param int8: Integer version of data
"""
data_type = "int8" if int8 else "float"
# 4.2 data is only (currently) available in int8 format.
# Raise exception to avoid confusion about the 4.2. dataset.
if data_type == "float" and version == "4.2":
raise NotImplementedError("""No float version of training data is available for version 4.2. If you would like to download the 4.2 (Rain) dataset make sure to explicitly pass `int8=True`.""")
train_val_files = self._get_version_mapping(str(version))["train"][data_type]
4.3 = Midnight dataset
"""
self._check_dataset_version(version)
train_val_files = [f"v{version}/{self.TRAIN_DATASET_NAME}",
f"v{version}/{self.VALIDATION_DATASET_NAME}"]
for file in train_val_files:
dest_path = self.__get_dest_path(subfolder, file)
self.download_single_dataset(
Expand All @@ -273,32 +208,21 @@ def download_training_data(
def download_inference_data(
self,
subfolder: str = "",
version: str = "4.2",
int8: bool = True,
version: str = "4.3",
round_num: int = None,
):
"""
Get Numerai classic inference (tournament) data.
If only minimal live data is needed, consider .download_live_data.
:param subfolder: Specify folder to create folder within base directory root.
Saves in base directory root by default.
:param version: Numerai dataset version
:param version: Numerai dataset version.
4 = April 2022 dataset
4.1 = Sunshine dataset
4.2 (default) = Rain Dataset
:param int8: Integer version of data
4.3 = Midnight dataset
:param round_num: Numerai tournament round number. Downloads latest round by default.
"""
data_type = "int8" if int8 else "float"
if data_type == "float" and version == "4.2":
raise NotImplementedError("""No float version of training data is available for version 4.2. onwards. If you would like to download the 4.2 (Rain) dataset make sure to explicitly pass `int8=True`.""")
inference_files = self._get_version_mapping(str(version))["inference"][data_type]
for file in inference_files:
dest_path = self.__get_dest_path(subfolder, file)
self.download_single_dataset(
filename=file,
dest_path=dest_path,
round_num=round_num
)
self.download_live_data(subfolder=subfolder, version=version, round_num=round_num)

def download_single_dataset(
self, filename: str, dest_path: str, round_num: int = None
Expand All @@ -322,23 +246,23 @@ def download_single_dataset(
def download_live_data(
self,
subfolder: str = "",
version: str = "4.2",
int8: bool = False,
version: str = "4.3",
round_num: int = None
):
"""
Download all live data in specified folder for given version (i.e. minimal data needed for inference).
:param subfolder: Specify folder to create folder within directory root.
Saves in directory root by default.
:param version: Numerai dataset version
:param version: Numerai dataset version.
4 = April 2022 dataset
4.1 = Sunshine dataset
4.2 (default) = Rain Dataset
:param int8: Integer version of data
4.3 = Midnight dataset
:param round_num: Numerai tournament round number. Downloads latest round by default.
"""
data_type = "int8" if int8 else "float"
live_files = self._get_version_mapping(str(version))["live"][data_type]
self._check_dataset_version(version)
live_files = [f"v{version}/{self.LIVE_DATASET_NAME}"]
for file in live_files:
dest_path = self.__get_dest_path(subfolder, file)
self.download_single_dataset(
Expand All @@ -348,17 +272,23 @@ def download_live_data(
)

def download_example_data(
self, subfolder: str = "", version: str = "4.2", round_num: int = None
self, subfolder: str = "", version: str = "4.3", round_num: int = None
):
"""
Download all example prediction data in specified folder for given version.
:param subfolder: Specify folder to create folder within base directory root.
Saves in base directory root by default.
:param version: Numerai dataset version (4.1=Sunshine dataset)
:param version: Numerai dataset version.
4 = April 2022 dataset
4.1 = Sunshine dataset
4.2 (default) = Rain Dataset
4.3 = Midnight dataset
:param round_num: Numerai tournament round number. Downloads latest round by default.
"""
example_files = self._get_version_mapping(str(version))["example"]
self._check_dataset_version(version)
example_files = [f"v{version}/{self.LIVE_EXAMPLE_PREDS_NAME}",
f"v{version}/{self.VALIDATION_EXAMPLE_PREDS_NAME}"]
for file in example_files:
dest_path = self.__get_dest_path(subfolder, file)
self.download_single_dataset(
Expand All @@ -367,7 +297,7 @@ def download_example_data(
round_num=round_num
)

def get_classic_features(self, subfolder: str = "", filename="v4.2/features.json", *args, **kwargs) -> dict:
def get_classic_features(self, subfolder: str = "", filename="v4.3/features.json", *args, **kwargs) -> dict:
"""
Download feature overview (stats and feature sets) through NumerAPI and load as dict.
:param subfolder: Specify folder to create folder within base directory root.
Expand All @@ -376,42 +306,39 @@ def get_classic_features(self, subfolder: str = "", filename="v4.2/features.json
*args, **kwargs will be passed to the JSON loader.
:return: Feature overview dict
"""
version = filename.split("/")[0].replace("v", "")
self._check_dataset_version(version)
dest_path = self.__get_dest_path(subfolder, filename)
self.download_single_dataset(filename=filename,
dest_path=dest_path)
json_data = self._load_json(dest_path, *args, **kwargs)
return json_data

def download_meta_model_preds(self, subfolder: str = "", filename="v4.2/meta_model.parquet") -> pd.DataFrame:
def download_meta_model_preds(self, subfolder: str = "", filename="v4.3/meta_model.parquet") -> pd.DataFrame:
"""
Download Meta model predictions through NumerAPI.
:param subfolder: Specify folder to create folder within base directory root.
Saves in base directory root by default.
:param filename: name for meta model predictions file.
:return: Meta model predictions as DataFrame.
"""
version = filename.split("/")[0].replace("v", "")
self._check_dataset_version(version)
dest_path = self.__get_dest_path(subfolder, filename)
self.download_single_dataset(
filename=filename,
dest_path=dest_path,
)
return pd.read_parquet(dest_path)

def _get_version_mapping(self, version: int) -> dict:
""" Check if data version is supported and return file mapping for version. """
try:
mapping_dictionary = self.version_mapping[str(version)]
except KeyError:
raise NotImplementedError(
f"Version '{version}' is not available. Available versions are {list(self.version_mapping.keys())}"
)
return mapping_dictionary

def __get_dest_path(self, subfolder: str, filename: str) -> str:
""" Prepare destination path for downloading. """
dir = self._append_folder(subfolder)
dest_path = str(dir.joinpath(filename.split("/")[-1]))
return dest_path

def _check_dataset_version(self, version: str):
assert f"v{version}" in self.dataset_versions, f"Version '{version}' is not available in NumerAPI."


class KaggleDownloader(BaseDownloader):
Expand Down
Loading

0 comments on commit 0fdcf47

Please sign in to comment.