Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make padding compatible with old S2 data #57

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Changes from previous releases are listed below.
- Fasten up padding to 366 days _(see #54)_
- Radiometric calibration of Sentinel-1 _(see #47)_
- Downloading Sentinel-1 data and make it usable together with Sentinel-2 _(see #43)_
- Make padding compatible with old S2 data _(see #56)_
- Enhancing download process _(see #58)_
jmaces marked this conversation as resolved.
Show resolved Hide resolved

## 0.3.1 (2024-07-29)
- Remove country_code variable in collector downloader _(see #33)_
Expand Down
38 changes: 14 additions & 24 deletions eurocropsml/dataset/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,19 @@ def _download_file(
logger.info(f"{local_path} will not be downloaded again.")


def get_user_choice() -> list[str]:
def get_user_choice(files_to_download: list[str]) -> list[str]:
"""Get user choice for which files to download."""
choice = input(
"Would you like to download Sentinel-1 and/or Sentinel-2 data? Please enter "
" 'S1', 'S2', or 'both': "
)
if choice not in {"S1", "S2", "both"}:
print("Choose one or more of the following options by typing their numbers (e.g., 1 3):")
for i, file in enumerate(files_to_download, 1):
print(f"{i}. {file}")
choice = input("Enter your choices separated by spaces: ")
jmaces marked this conversation as resolved.
Show resolved Hide resolved
selected_indices = [int(choice) - 1 for choice in choice.split()]
if bool(set(selected_indices) - set(range(0, len(files_to_download)))):
logger.error("Invalid input. Please enter 'S1', 'S2', or 'both'.")
jmaces marked this conversation as resolved.
Show resolved Hide resolved
sys.exit(1)
elif choice == "both":
choice_list = ["S1", "S2"]
logger.info("Downloading both S1 and S2 data.")
else:
logger.info(f"Downloading only {choice} data.")
choice_list = [choice]
selected_options = [files_to_download[i] for i in selected_indices]

return choice_list
return selected_options


def select_version(versions: list[dict]) -> tuple[dict, list[str]]:
Expand Down Expand Up @@ -98,8 +94,8 @@ def select_version(versions: list[dict]) -> tuple[dict, list[str]]:
logger.warning(
"Please be aware that Zenodo version 8 or older and this package version "
"(eurocropsml>=0.4.0) are not compatible anymore in terms of "
"eurocropsml.preprocess.preprocess. The already preprocessed version of Sentinel-2 "
"can still be used, but re-running the preprocessing will not filter out all "
"eurocropsml.preprocess.preprocess. The already preprocessed version of Sentinel-2 "
"can still be used, but re-running the preprocessing will not filter out all "
jsreuss marked this conversation as resolved.
Show resolved Hide resolved
"outliers from the raw data."
"\n"
"Furthermore, the folder structure of Zenodo version 8 or older is not supported "
Expand Down Expand Up @@ -160,19 +156,13 @@ def download_dataset(preprocess_config: EuroCropsDatasetPreprocessConfig) -> Non
if versions:
selected_version, files_to_download = select_version(versions)

# older version do only have S2 data
# if S1 data is available, let user decide
if "S1.zip" in files_to_download:
user_choice = get_user_choice()
if "S1" not in user_choice:
files_to_download.remove("S1.zip")
if "S2" not in user_choice:
files_to_download.remove("S2.zip")
# let user decide what data to download
selected_files = get_user_choice(files_to_download)

for file_entry in selected_version["files"]:
file_url: str = file_entry["links"]["self"]
zip_file: str = file_entry["key"]
if zip_file in files_to_download:
if zip_file in selected_files:
local_path: Path = data_dir.joinpath(zip_file)
_download_file(zip_file, file_url, local_path, file_entry.get("checksum", ""))
logger.info(f"Unzipping {local_path}...")
Expand Down
20 changes: 13 additions & 7 deletions eurocropsml/dataset/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,11 @@ def preprocess(
) -> None:
"""Run preprocessing."""

raw_data_dir = preprocess_config.raw_data_dir
num_workers = preprocess_config.num_workers
satellite = preprocess_config.satellite
preprocess_dir = preprocess_config.preprocess_dir / satellite
num_workers: int | None = preprocess_config.num_workers
satellite: str = preprocess_config.satellite
raw_data_dir: Path = preprocess_config.raw_data_dir
raw_data_dir_satellite: Path = preprocess_config.raw_data_dir / satellite
preprocess_dir: Path = preprocess_config.preprocess_dir / satellite

if preprocess_config.bands is None:
if satellite == "S2":
Expand All @@ -206,19 +207,18 @@ def preprocess(
else:
bands = preprocess_config.bands

if preprocess_dir.exists() and len(list((preprocess_dir.iterdir()))) > 0:
if preprocess_dir.exists() and any(preprocess_dir.iterdir()):
logger.info(
f"Preprocessing directory {preprocess_dir} already exists and contains data. "
"Nothing to do."
)
sys.exit(0)

if raw_data_dir.exists():
if raw_data_dir_satellite.exists():
logger.info("Raw data directory exists. Skipping download.")

logger.info("Starting preprocessing. Compiling labels and centerpoints of parcels")
preprocess_dir.mkdir(exist_ok=True, parents=True)
raw_data_dir_satellite: Path = raw_data_dir / satellite
for file_path in raw_data_dir_satellite.glob("*.parquet"):
country_file: pd.DataFrame = pd.read_parquet(file_path).set_index("parcel_id")
cols = country_file.columns.tolist()
Expand Down Expand Up @@ -251,6 +251,12 @@ def preprocess(
lambda y: np.array([-999] * b) if y is None else y
)
)
if satellite == "S2":
region_data = region_data.apply(
lambda x, b=len(bands): x.map(
lambda y: np.array([-999] * b) if y == [0] * b else y
)
)
with Pool(processes=num_workers) as p:
func = partial(
_save_row,
Expand Down
42 changes: 37 additions & 5 deletions eurocropsml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,25 @@
import zipfile
from pathlib import Path

import typer


def _move_files(src_dir: Path, dest_dir: Path) -> None:
"""Move files from src_dir to dest_dir."""
dest_dir.mkdir(exist_ok=True, parents=True)
move_files: bool = True
if dest_dir.exists() and _compare_folders(src_dir, dest_dir):
move_files = typer.confirm(
f"{dest_dir} already exists and the content is different from the new data. Do you"
" want to replace the existing files?"
)
if move_files:
dest_dir.mkdir(exist_ok=True, parents=True)

for item in src_dir.iterdir():
dest_item = dest_dir.joinpath(item.name)
for item in src_dir.iterdir():
dest_item = dest_dir.joinpath(item.name)

if item.is_file():
shutil.move(item, dest_item)
if item.is_file():
shutil.move(item, dest_item)


def _create_md5_hash(filepath: Path) -> str:
Expand All @@ -26,6 +35,29 @@ def _create_md5_hash(filepath: Path) -> str:
return hash_md5.hexdigest()


def _hash_folder(folder_path: Path) -> str:
    """Return one MD5 digest summarizing every file inside a folder.

    Files are visited in sorted order and, for each file, its path relative
    to ``folder_path`` and its content hash are folded into a single
    accumulator. Two folders with the same layout and the same file bytes
    therefore produce the same digest.
    """
    accumulator = hashlib.md5()

    for entry in sorted(folder_path.rglob("*")):
        # Directories contribute nothing on their own; only files are hashed.
        if not entry.is_file():
            continue
        # Mix in the relative path first so renames/moves change the digest.
        accumulator.update(str(entry.relative_to(folder_path)).encode())
        accumulator.update(_create_md5_hash(entry).encode())

    return accumulator.hexdigest()


def _compare_folders(folder1: Path, folder2: Path) -> bool:
    """Report whether two folders hold identical content.

    Equality is decided by comparing the combined digests from
    ``_hash_folder``, so both the relative layout and the file bytes
    must match for the folders to be considered equal.
    """
    return _hash_folder(folder1) == _hash_folder(folder2)


def _unzip_file(zip_filepath: Path, extract_to_path: Path, delete_zip: bool = True) -> None:
with zipfile.ZipFile(zip_filepath, "r") as zip_ref:
zip_ref.extractall(extract_to_path)
Expand Down