apache · alamb · Jan 23, 2025 · Jan 21, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -536,23 +536,52 @@ data_imdb() {
     done
 
     if [ "$convert_needed" = true ]; then
-        if [ ! -f "${imdb_dir}/imdb.tgz" ]; then
-            echo "Downloading IMDB dataset..."
+        # Expected size of the dataset
+        expected_size="1263193115" # 1.18 GB
+
+        echo -n "Looking for imdb.tgz... "
+        if [ -f "${imdb_temp_gz}" ]; then
+            echo "found"
+            echo -n "Checking size... "
+            OUTPUT_SIZE=$(wc -c "${imdb_temp_gz}" 2>/dev/null | awk '{print $1}' || true)
 
+            #Checking the size of the existing file
+            if [ "${OUTPUT_SIZE}" = "${expected_size}" ]; then
+                # Existing file is of the expected size, no need for download
+                echo "OK ($(numfmt --to=iec-i --suffix=B ${OUTPUT_SIZE}))"
+            else
+                # Existing file is partially installed, remove it and initiate a new download
+                echo "MISMATCH"
+                echo "Size less than expected: $(numfmt --to=iec-i --suffix=B ${OUTPUT_SIZE}) found, $(numfmt --to=iec-i --suffix=B ${expected_size}) required"
+                echo "Downloading IMDB dataset..."
+                rm -f "${imdb_temp_gz}"
+
+                # Download the dataset
+                curl -o "${imdb_temp_gz}" "${imdb_url}"
+
+                # Size check of the installed file
+                DOWNLOADED_SIZE=$(wc -c "${imdb_temp_gz}" | awk '{print $1}')
+                if [ "${DOWNLOADED_SIZE}" != "${expected_size}" ]; then
+                    echo "Error: Download size mismatch"
+                    echo "Expected: $(numfmt --to=iec-i --suffix=B ${expected_size})"
+                    echo "Got: $(numfmt --to=iec-i --suffix=B ${DOWNLOADED_SIZE})"
+                    echo "Please re-initiate the download"
+                    return 1
+                fi
+            fi
+        else
+            # No existing file found, initiate a new download
+            echo "not found"
+            echo "Downloading IMDB dataset ($(numfmt --to=iec-i --suffix=B ${expected_size}) expected)..."
             # Download the dataset
             curl -o "${imdb_temp_gz}" "${imdb_url}"
-
-            # Extract the dataset
-            tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
-            $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
-        else 
-            echo "IMDB.tgz already exists."
-
-            # Extract the dataset
-            tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
-            $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
-        fi
+
+        # Extract the dataset
+        tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
+        $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
         echo "IMDB dataset downloaded and extracted."
+    fi
+
     else
         echo "IMDB dataset already exists and contains required parquet files."
     fi