Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make padding compatible with old S2 data #57

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Changes from previous releases are listed below.
- Fasten up padding to 366 days _(see #54)_
- Radiometric calibration of Sentinel-1 _(see #47)_
- Downloading Sentinel-1 data and make it usable together with Sentinel-2 _(see #43)_
- Make padding compatible with old S2 data _(see #56)_
- Enhancing download process _(see #58)_
jmaces marked this conversation as resolved.
Show resolved Hide resolved

## 0.3.1 (2024-07-29)
- Remove country_code variable in collector downloader _(see #33)_
Expand Down
38 changes: 14 additions & 24 deletions eurocropsml/dataset/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,19 @@ def _download_file(
logger.info(f"{local_path} will not be downloaded again.")


def get_user_choice() -> list[str]:
def get_user_choice(files_to_download: list[str]) -> list[str]:
"""Get user choice for which files to download."""
choice = input(
"Would you like to download Sentinel-1 and/or Sentinel-2 data? Please enter "
" 'S1', 'S2', or 'both': "
)
if choice not in {"S1", "S2", "both"}:
print("Choose one or more of the following options by typing their numbers (e.g., 1 3):")
for i, file in enumerate(files_to_download, 1):
print(f"{i}. {file}")
choice = input("Enter your choices separated by spaces: ")
jmaces marked this conversation as resolved.
Show resolved Hide resolved
selected_indices = [int(choice) - 1 for choice in choice.split()]
if bool(set(selected_indices) - set(range(0, len(files_to_download)))):
logger.error("Invalid input. Please enter 'S1', 'S2', or 'both'.")
jmaces marked this conversation as resolved.
Show resolved Hide resolved
sys.exit(1)
elif choice == "both":
choice_list = ["S1", "S2"]
logger.info("Downloading both S1 and S2 data.")
else:
logger.info(f"Downloading only {choice} data.")
choice_list = [choice]
selected_options = [files_to_download[i] for i in selected_indices]

return choice_list
return selected_options


def select_version(versions: list[dict]) -> tuple[dict, list[str]]:
Expand Down Expand Up @@ -98,8 +94,8 @@ def select_version(versions: list[dict]) -> tuple[dict, list[str]]:
logger.warning(
"Please be aware that Zenodo version 8 or older and this package version "
"(eurocropsml>=0.4.0) are not compatible anymore in terms of "
"eurocropsml.preprocess.preprocess. The already preprocessed version of Sentinel-2 "
"can still be used, but re-running the preprocessing will not filter out all "
"eurocropsml.preprocess.preprocess. The already preprocessed version of Sentinel-2 "
"can still be used, but re-running the preprocessing will not filter out all "
jsreuss marked this conversation as resolved.
Show resolved Hide resolved
"outliers from the raw data."
"\n"
"Furthermore, the folder structure of Zenodo version 8 or older is not supported "
Expand Down Expand Up @@ -160,19 +156,13 @@ def download_dataset(preprocess_config: EuroCropsDatasetPreprocessConfig) -> Non
if versions:
selected_version, files_to_download = select_version(versions)

# older version do only have S2 data
# if S1 data is available, let user decide
if "S1.zip" in files_to_download:
user_choice = get_user_choice()
if "S1" not in user_choice:
files_to_download.remove("S1.zip")
if "S2" not in user_choice:
files_to_download.remove("S2.zip")
# let user decide what data to download
selected_files = get_user_choice(files_to_download)

for file_entry in selected_version["files"]:
file_url: str = file_entry["links"]["self"]
zip_file: str = file_entry["key"]
if zip_file in files_to_download:
if zip_file in selected_files:
local_path: Path = data_dir.joinpath(zip_file)
_download_file(zip_file, file_url, local_path, file_entry.get("checksum", ""))
logger.info(f"Unzipping {local_path}...")
Expand Down
20 changes: 13 additions & 7 deletions eurocropsml/dataset/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,11 @@ def preprocess(
) -> None:
"""Run preprocessing."""

raw_data_dir = preprocess_config.raw_data_dir
num_workers = preprocess_config.num_workers
satellite = preprocess_config.satellite
preprocess_dir = preprocess_config.preprocess_dir / satellite
num_workers: int | None = preprocess_config.num_workers
satellite: str = preprocess_config.satellite
raw_data_dir: Path = preprocess_config.raw_data_dir
raw_data_dir_satellite: Path = preprocess_config.raw_data_dir / satellite
preprocess_dir: Path = preprocess_config.preprocess_dir / satellite

if preprocess_config.bands is None:
if satellite == "S2":
Expand All @@ -206,19 +207,18 @@ def preprocess(
else:
bands = preprocess_config.bands

if preprocess_dir.exists() and len(list((preprocess_dir.iterdir()))) > 0:
if preprocess_dir.exists() and any(preprocess_dir.iterdir()):
logger.info(
f"Preprocessing directory {preprocess_dir} already exists and contains data. "
"Nothing to do."
)
sys.exit(0)

if raw_data_dir.exists():
if raw_data_dir_satellite.exists():
logger.info("Raw data directory exists. Skipping download.")

logger.info("Starting preprocessing. Compiling labels and centerpoints of parcels")
preprocess_dir.mkdir(exist_ok=True, parents=True)
raw_data_dir_satellite: Path = raw_data_dir / satellite
for file_path in raw_data_dir_satellite.glob("*.parquet"):
country_file: pd.DataFrame = pd.read_parquet(file_path).set_index("parcel_id")
cols = country_file.columns.tolist()
Expand Down Expand Up @@ -251,6 +251,12 @@ def preprocess(
lambda y: np.array([-999] * b) if y is None else y
)
)
if satellite == "S2":
region_data = region_data.apply(
lambda x, b=len(bands): x.map(
lambda y: np.array([-999] * b) if y == [0] * b else y
)
)
with Pool(processes=num_workers) as p:
func = partial(
_save_row,
Expand Down
42 changes: 37 additions & 5 deletions eurocropsml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,25 @@
import zipfile
from pathlib import Path

import typer


def _move_files(src_dir: Path, dest_dir: Path) -> None:
"""Move files from src_dir to dest_dir."""
dest_dir.mkdir(exist_ok=True, parents=True)
move_files: bool = True
if dest_dir.exists() and _compare_folders(src_dir, dest_dir):
move_files = typer.confirm(
f"{dest_dir} already exists and the content is different from the new data. Do you"
" want to replace the existing files?"
)
if move_files:
dest_dir.mkdir(exist_ok=True, parents=True)

for item in src_dir.iterdir():
dest_item = dest_dir.joinpath(item.name)
for item in src_dir.iterdir():
dest_item = dest_dir.joinpath(item.name)

if item.is_file():
shutil.move(item, dest_item)
if item.is_file():
shutil.move(item, dest_item)


def _create_md5_hash(filepath: Path) -> str:
Expand All @@ -26,6 +35,29 @@ def _create_md5_hash(filepath: Path) -> str:
return hash_md5.hexdigest()


def _hash_folder(folder_path: Path) -> str:
    """Return one MD5 digest summarizing every file inside a folder.

    Files are visited in sorted order and, for each file, its path relative
    to ``folder_path`` and its content hash are folded into a single
    accumulator. Two folders with the same layout and the same file bytes
    therefore produce the same digest.
    """
    accumulator = hashlib.md5()

    for entry in sorted(folder_path.rglob("*")):
        # Directories contribute nothing on their own; only files are hashed.
        if not entry.is_file():
            continue
        # Mix in the relative path first so renames/moves change the digest.
        accumulator.update(str(entry.relative_to(folder_path)).encode())
        accumulator.update(_create_md5_hash(entry).encode())

    return accumulator.hexdigest()


def _compare_folders(folder1: Path, folder2: Path) -> bool:
    """Report whether two folders hold identical content.

    Equality is decided by comparing the combined digests from
    ``_hash_folder``, so both the relative layout and the file bytes
    must match for the folders to be considered equal.
    """
    return _hash_folder(folder1) == _hash_folder(folder2)


def _unzip_file(zip_filepath: Path, extract_to_path: Path, delete_zip: bool = True) -> None:
with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
zip_ref.extractall(extract_to_path)
Expand Down