Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make padding compatible with old S2 data #57

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ The $\texttt{EuroCropsML}$ dataset allows users to customize options for various
| `data_dir` | Folder inside the data directory where pre-processed data is stored. |
| `random_seed` | Random seed used for generating training-testing-splits and further random numbers. |
| `num_samples` | Number of samples per class used for the fine-tuning subsets. The default will create the shots currently present on [Zenodo](https://zenodo.org/doi/10.5281/zenodo.10629609) for the training set. It will sample 1000 samples for validation and keep all available data from the test set. |
| `satellites` | List of satellites whose data is to be used.|
| `benchmark` | Whether to download the pre-existing benchmark split from Zenodo. For more information, see the docstrings in the {any}`splits module<eurocropsml.datasets.splits>`. |
jmaces marked this conversation as resolved.
Show resolved Hide resolved
| `meadow_class` | Class that represents the ${\texttt{pasture_meadow_grassland_grass}}$ class. If provided, then this class will be downsampled to the median frequency of all other classes for the pre-training dataset since it represents an imbalanced majority class. |
| `pretrain_classes` | Classes that make up the pre-train dataset. |
| `finetune_classes` | Classes that make up the fine-tune dataset. |
Expand Down
4 changes: 3 additions & 1 deletion eurocropsml/dataset/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def build_splits(
overrides: OverridesT = typer.Argument(None, help="Overrides to split config"),
) -> None:
config = build_config(overrides, config_path)
create_splits(config.split, config.preprocess.raw_data_dir.parent)
create_splits(
config.split, config.preprocess.raw_data_dir.parent, config.preprocess.download_url
)

return app

Expand Down
81 changes: 48 additions & 33 deletions eurocropsml/dataset/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,32 @@
logger = logging.getLogger(__name__)


def _get_zenodo_record(
    base_url: str, version_number: int | None = None
) -> tuple[dict, list[str]] | dict:
    """Query Zenodo and return the record version to work with.

    Sends a GET request to ``base_url`` and reads the list of record
    versions from ``["hits"]["hits"]`` of the JSON response.

    Args:
        base_url: URL that is queried for the available record versions.
        version_number: If given, the version whose ``metadata.version``
            equals ``str(version_number)`` is looked up. If ``None``, the
            choice is delegated to ``select_version``.

    Returns:
        The matching version entry when ``version_number`` is given,
        otherwise the pair ``(selected_version, files_to_download)``
        obtained from ``select_version``.

    Note:
        ``response.raise_for_status()`` is called before the payload is
        parsed, so an unsuccessful HTTP status propagates to the caller.
        If the response lists no versions, or the requested version is not
        among them, an error is logged and the process exits with status 1.
    """
    response: requests.models.Response = requests.get(base_url)
    response.raise_for_status()
    payload = response.json()
    versions: list[dict] = payload["hits"]["hits"]

    # Nothing published at all: there is no version to choose from.
    if not versions:
        logger.error("No data found on Zenodo. Please download manually.")
        sys.exit(1)

    # No explicit version requested: hand the choice over to select_version.
    if version_number is None:
        chosen_version, downloadable_files = select_version(versions)
        return chosen_version, downloadable_files

    # Explicit version requested: return the first entry whose version matches.
    wanted = str(version_number)
    for candidate in versions:
        if candidate["metadata"]["version"] == wanted:
            return candidate

    logger.error(f"Version {version_number} could not be found on Zenodo.")
    sys.exit(1)


def _download_file(
file_name: str, file_url: str, local_path: Path, downloadfile_md5_hash: str
) -> None:
Expand Down Expand Up @@ -146,40 +172,29 @@ def download_dataset(preprocess_config: EuroCropsDatasetPreprocessConfig) -> Non
data_dir: Path = Path(preprocess_config.raw_data_dir.parent)
data_dir.mkdir(exist_ok=True, parents=True)

response: requests.models.Response = requests.get(base_url)

try:
response.raise_for_status()
data = response.json()
versions: list[dict] = data["hits"]["hits"]

if versions:
selected_version, files_to_download = select_version(versions)

# let user decide what data to download
selected_files = get_user_choice(files_to_download)

for file_entry in selected_version["files"]:
file_url: str = file_entry["links"]["self"]
zip_file: str = file_entry["key"]
if zip_file in selected_files:
local_path: Path = data_dir.joinpath(zip_file)
_download_file(zip_file, file_url, local_path, file_entry.get("checksum", ""))
logger.info(f"Unzipping {local_path}...")
_unzip_file(local_path, data_dir)

# move S1 and S2 data
if zip_file in ["S1.zip", "S2.zip"]:
unzipped_path: Path = local_path.with_suffix("")
for folder in unzipped_path.iterdir():
rel_target_folder: Path = folder.relative_to(unzipped_path)
_move_files(
folder, data_dir.joinpath(rel_target_folder, zip_file.split(".")[0])
)
shutil.rmtree(unzipped_path)
else:
logger.error("No data found on Zenodo. Please download manually.")
sys.exit(1)
selected_version, files_to_download = _get_zenodo_record(base_url)
# let user decide what data to download
selected_files = get_user_choice(files_to_download)

for file_entry in selected_version["files"]:
file_url: str = file_entry["links"]["self"]
zip_file: str = file_entry["key"]
if zip_file in selected_files:
local_path: Path = data_dir.parent.joinpath(zip_file)
_download_file(zip_file, file_url, local_path, file_entry.get("checksum", ""))
logger.info(f"Unzipping {local_path}...")
_unzip_file(local_path, data_dir)

# move S1 and S2 data
if zip_file in ["S1.zip", "S2.zip"]:
unzipped_path: Path = local_path.with_suffix("")
for folder in unzipped_path.iterdir():
rel_target_folder: Path = folder.relative_to(unzipped_path)
_move_files(
folder, data_dir.joinpath(rel_target_folder, zip_file.split(".")[0])
)
shutil.rmtree(unzipped_path)

except requests.exceptions.HTTPError as err:
logger.warning(f"There was an error when trying to access the Zenodo record: {err}")
Loading
Loading