pyaisloader: Implemented performance benchmark tests for AISDataset and AISIterDataset

Signed-off-by: Tony Chen <[email protected]>
Nahemah1022 committed Jun 12, 2024
1 parent bc1ee69 commit 14cddfe
Showing 11 changed files with 281 additions and 22 deletions.
7 changes: 4 additions & 3 deletions .gitlab-ci.yml
@@ -190,7 +190,6 @@ test:short:python:
- cd python
- make python_sdk_tests
- make python_botocore_tests
- make PYAISLOADER_TEST_TYPE=short test-pyaisloader

except:
variables:
@@ -346,14 +345,16 @@ test:long:aisloader:
stage: test-long
tags:
- ais
timeout: 10m
timeout: 15m
variables:
AIS_ENDPOINT: "http://localhost:8080"
script:
- ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
- sleep 10 # make sure that cluster properly starts
- FLAGS="--duration=5m" make test-aisloader
- cd ./python; make PYAISLOADER_TEST_TYPE=long test-pyaisloader
- cd ./python
- make PYAISLOADER_TEST_TYPE=short test-pyaisloader
- make PYAISLOADER_TEST_TYPE=long test-pyaisloader
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
when: manual
6 changes: 6 additions & 0 deletions python/pyaisloader/Makefile
@@ -13,6 +13,12 @@ short_mixed: ## Run a short MIXED benchmark
short_list: ## Run a short LIST benchmark
pyaisloader LIST --bucket ais://abc --cleanup true --objects 50000 --workers 16

short_ais_dataset: ## Run a short AISDataset benchmark
pyaisloader AIS_DATASET --bucket ais://abc --duration 30s --workers 16 --totalsize 1GB --minsize 5KB --maxsize 10KB --cleanup

short_ais_iter_dataset: ## Run a short AISIterDataset benchmark
pyaisloader AIS_ITER_DATASET --bucket ais://abc --iterations 1 --duration 30s --workers 16 --totalsize 1GB --minsize 5KB --maxsize 10KB --cleanup

long_put: ## Run a long (and more intensive) PUT benchmark
pyaisloader PUT --bucket ais://abc --duration 30m --workers 32 --totalsize 10GB --minsize 50MB --maxsize 100MB --cleanup

49 changes: 44 additions & 5 deletions python/pyaisloader/README.md
@@ -85,6 +85,39 @@ Runs a benchmark to LIST objects in the bucket.
| --objects | -o | Number of objects bucket should contain prior to benchmark start | No | N/A |
| --workers | -w | Number of workers (only for pre-population of bucket) | Yes | N/A |

#### Type: AISDataset

Runs a time-based benchmark that randomly GETs objects in the bucket through AISDataset.

> **Note:** If you want your AISDataset benchmark to include a more intensive GET load, you should consider using a pre-filled bucket.

| Option      | Aliases | Description                                                                        | Required | Default Value |
|-------------|---------|------------------------------------------------------------------------------------|----------|---------------|
| --bucket    | -b      | Bucket (e.g. ais://mybck, s3://mybck, gs://mybck)                                  | Yes      | N/A           |
| --cleanup   | -c      | Whether bucket (or objects) should be destroyed or not upon benchmark completion   | Yes      | N/A           |
| --totalsize | -s      | Total size to which the bucket should be filled prior to start                     | No       | N/A           |
| --minsize   | -min    | Minimum size of objects to be PUT in bucket (if bucket is smaller than total size) | No       | N/A           |
| --maxsize   | -max    | Maximum size of objects to be PUT in bucket (if bucket is smaller than total size) | No       | N/A           |
| --duration  | -d      | Duration for which benchmark should be run                                         | Yes      | N/A           |
| --workers   | -w      | Number of workers                                                                  | Yes      | N/A           |

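The random-access pattern this benchmark exercises can be sketched as follows. This is an illustrative stand-in, not pyaisloader's implementation: `MapStyleDataset` and `run_random_get_benchmark` are hypothetical names, and a real AISDataset would issue a GET against the cluster inside `__getitem__`.

```python
import random
import time

class MapStyleDataset:
    """Stand-in for a map-style dataset: index-based access to bucket objects."""

    def __init__(self, objects):
        self._objects = objects

    def __len__(self):
        return len(self._objects)

    def __getitem__(self, idx):
        # A real AISDataset would GET the object's bytes from the bucket here.
        return self._objects[idx]

def run_random_get_benchmark(dataset, duration_s, seed=0):
    """Read samples at random indices until duration_s elapses; return op count."""
    rng = random.Random(seed)
    deadline = time.monotonic() + duration_s
    ops = 0
    while time.monotonic() < deadline:
        _ = dataset[rng.randrange(len(dataset))]
        ops += 1
    return ops

dataset = MapStyleDataset([b"x" * 64 for _ in range(100)])
print(run_random_get_benchmark(dataset, duration_s=0.05) > 0)  # True
```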
#### Type: AISIterDataset

Runs a time-based benchmark that sequentially iterates over objects in the bucket through AISIterDataset.

> **Note:** If you want your AISIterDataset benchmark to include a more intensive GET load, you should consider using a pre-filled bucket.

| Option       | Aliases | Description                                                                        | Required | Default Value |
|--------------|---------|------------------------------------------------------------------------------------|----------|---------------|
| --bucket     | -b      | Bucket (e.g. ais://mybck, s3://mybck, gs://mybck)                                  | Yes      | N/A           |
| --cleanup    | -c      | Whether bucket (or objects) should be destroyed or not upon benchmark completion   | Yes      | N/A           |
| --totalsize  | -s      | Total size to which the bucket should be filled prior to start                     | No       | N/A           |
| --minsize    | -min    | Minimum size of objects to be PUT in bucket (if bucket is smaller than total size) | No       | N/A           |
| --maxsize    | -max    | Maximum size of objects to be PUT in bucket (if bucket is smaller than total size) | No       | N/A           |
| --duration   | -d      | Duration for which benchmark should be run                                         | Yes      | N/A           |
| --iterations | -i      | Number of iterations over the dataset to run (only for AISIterDataset)             | No       | N/A           |
| --workers    | -w      | Number of workers                                                                  | Yes      | N/A           |

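A sketch of the sequential pattern with both stopping conditions (elapsed duration or completed full passes); `IterStyleDataset` and `run_iter_benchmark` are hypothetical stand-ins for AISIterDataset and the benchmark loop, not pyaisloader's code.

```python
import time

class IterStyleDataset:
    """Stand-in for an iterable-style dataset: sequential access only."""

    def __init__(self, objects):
        self._objects = objects

    def __iter__(self):
        # A real AISIterDataset would stream objects from the bucket here.
        yield from self._objects

def run_iter_benchmark(dataset, duration_s, iterations=None):
    """Iterate sequentially; stop at the deadline or after `iterations` passes."""
    deadline = time.monotonic() + duration_s
    ops = passes = 0
    while time.monotonic() < deadline:
        for _ in dataset:
            ops += 1
        passes += 1
        if iterations is not None and passes >= iterations:
            break
    return ops, passes

ops, passes = run_iter_benchmark(IterStyleDataset([b"obj"] * 5), 60.0, iterations=2)
print(ops, passes)  # 10 2
```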
### Examples

There are a few sample benchmarks in the provided Makefile. Run `make help` for more information on the sample benchmarks.
@@ -106,17 +139,23 @@ This command runs a short `MIXED` benchmark on the ais://abc bucket. The paramet
5. `short_list`
This command runs a short `LIST` benchmark on the bucket `ais://abc`. If the bucket contains fewer than `objects` objects, it will be pre-populated to contain `objects` objects.

6. `long_put`
6. `short_ais_dataset`
This command runs a short benchmark that randomly GETs objects in the bucket `ais://abc` through AISDataset, a map-style dataset from the AIS Plugin for PyTorch. If the total size of the contents of `ais://abc` is smaller than the specified `totalsize`, the bucket will be pre-populated up to `totalsize`, with the size of individual objects ranging from `minsize` to `maxsize`. The benchmark terminates once `duration` has elapsed.

7. `short_ais_iter_dataset`
This command runs a short benchmark that sequentially iterates over objects in the bucket `ais://abc` through AISIterDataset, an iterable-style dataset from the AIS Plugin for PyTorch. If the total size of the contents of `ais://abc` is smaller than the specified `totalsize`, the bucket will be pre-populated up to `totalsize`, with the size of individual objects ranging from `minsize` to `maxsize`. The benchmark terminates when the specified duration is reached or when the defined number of iterations is completed.

8. `long_put`
This command runs a long `PUT` benchmark on the bucket `ais://abc`. The benchmark will stop when the specified `duration` of 30 minutes has elapsed or when the total size of data `PUT` into the bucket reaches `totalsize` of 10GB. The size of individual objects ranges from `minsize` of 50MB to `maxsize` of 100MB, and the number of `worker` threads used is increased to 32 compared to the short `PUT` benchmark.

7. `long_get`
9. `long_get`
This command runs a long `GET` benchmark on the bucket `ais://abc`. The primary differences are that this benchmark runs for a longer `duration` (30 minutes as opposed to 30 seconds) and uses more `worker` threads (32 instead of 16).

8. `long_mixed`
10. `long_mixed`
This command runs a long `MIXED` benchmark on the bucket `ais://abc`. The `putpct` parameter still determines the ratio of `PUT` operations to `GET` operations. The differences here are the longer `duration` of 30 minutes and the increased number of `worker` threads (32 instead of 16).

9. `long_list`
11. `long_list`
This command runs a long `LIST` benchmark on the bucket `ais://abc`. If the bucket contains fewer than `objects` objects, it will be pre-populated to contain `objects` objects. The `long_list` benchmark differs from `short_list` in the number of `objects` (500,000 instead of 50,000) and the number of `worker` threads used (32 instead of 16).

10. `help`
12. `help`
This command displays a list of available targets in the Makefile along with their descriptions, providing a helpful guide for understanding and using the available commands.
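Several of the examples above rely on pre-population: if the bucket holds less data than `totalsize`, it is filled with objects whose sizes fall between `minsize` and `maxsize`. A minimal sketch of that sizing logic (`plan_prepopulation` is a hypothetical helper, not pyaisloader's implementation):

```python
import random

def plan_prepopulation(current_size, total_size, min_size, max_size, seed=0):
    """Choose random object sizes until the bucket would reach total_size."""
    rng = random.Random(seed)
    sizes = []
    filled = current_size
    while filled < total_size:
        size = rng.randint(min_size, max_size)
        sizes.append(size)
        filled += size
    return sizes

sizes = plan_prepopulation(current_size=0, total_size=100, min_size=10, max_size=20)
print(sum(sizes) >= 100, all(10 <= s <= 20 for s in sizes))  # True True
```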
2 changes: 2 additions & 0 deletions python/pyaisloader/ci-test.sh
@@ -18,6 +18,8 @@ if [ "$1" == "short" ]
yes "y" | head -n 2 | pyaisloader p -b ais://testpyaisloader -d 15s -min 1mb -max 10mb -s 1gb -w 16
yes "y" | head -n 2 | pyaisloader g -b ais://testpyaisloader -d 15s -min 1mb -max 10mb -s 1gb -w 16
yes "y" | head -n 2 | pyaisloader m -b ais://testpyaisloader -d 15s -min 1mb -max 10mb -w 16 -c
yes "y" | head -n 2 | pyaisloader ais_dataset -b ais://testpyaisloader -d 15s -min 1mb -max 10mb -w 16 -c
yes "y" | head -n 2 | pyaisloader ais_iter_dataset -b ais://testpyaisloader -i 1 -d 15s -min 1mb -max 10mb -w 16 -c
elif [ "$1" == "long" ]
then
yes "y" | head -n 2 | pyaisloader p -b ais://testpyaisloader -d 3m -min 1mb -max 10mb -s 10gb -w 16
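The `yes "y" | head -n 2 |` prefix in these CI commands auto-answers pyaisloader's two interactive confirmation prompts. A minimal stand-in shows the mechanics; `confirm_twice` is a hypothetical function, not part of the repository:

```shell
# confirm_twice stands in for a command that asks two y/n questions on stdin.
confirm_twice() {
  read -r first
  read -r second
  [ "$first" = "y" ] && [ "$second" = "y" ] && echo "confirmed"
}

# Pipe exactly two "y" answers, as ci-test.sh does for pyaisloader.
yes "y" | head -n 2 | confirm_twice
```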
16 changes: 6 additions & 10 deletions python/pyaisloader/pyaisloader/benchmark.py
@@ -197,7 +197,7 @@ def run(self):
self.__run_put()
elif self.put_pct == 0:
if self.totalsize is not None:
self.__run_prepopulate()
self._run_prepopulate()
self.__run_get()
else:
self.__run_mixed()
@@ -217,8 +217,7 @@ def __run_put(self):
if self.cleanup:
self.clean_up()
print_sep()
print("\n" + underline(bold("Benchmark Results (100% PUT):")))
print_results(result)
print_results(result, title="Benchmark Results (100% PUT):")

def __run_get(self):
if bucket_obj_count(self.bucket) == 0:
@@ -231,8 +230,7 @@ def __run_get(self):
if self.cleanup:
self.clean_up()
print_sep()
print("\n" + underline(bold("Benchmark Results (100% GET):")))
print_results(result)
print_results(result, title="Benchmark Results (100% GET):")

def __run_mixed(self):
if bucket_obj_count(self.bucket) == 0:
@@ -251,12 +249,10 @@ def __run_mixed(self):
if self.cleanup:
self.clean_up()
print_sep()
print("\n" + underline(bold("Benchmark Results for PUT operations:")))
print_results(result_put)
print("\n" + underline(bold("Benchmark Results for GET operations:")))
print_results(result_get)
print_results(result_put, title="Benchmark Results for PUT operations:")
print_results(result_get, title="Benchmark Results for GET operations:")

def __run_prepopulate(self):
def _run_prepopulate(self):
print_in_progress("Starting Pre-Population")
curr_bck_size = bucket_size(self.bucket)
if curr_bck_size < self.totalsize:
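The refactor in this file replaces the separate `print("\n" + underline(bold(...)))` calls with a `title` keyword on `print_results`, so each call site shrinks to one line. A hedged sketch of what such a helper could look like; the result-dict shape and the ANSI styling helpers are assumptions, not pyaisloader's actual code:

```python
def bold(text):
    # ANSI bold escape wrapper (assumed styling, for illustration only).
    return "\033[1m" + text + "\033[0m"

def underline(text):
    # ANSI underline escape wrapper (assumed styling, for illustration only).
    return "\033[4m" + text + "\033[0m"

def print_results(result, title=None):
    """Print one benchmark result table, optionally preceded by a styled heading."""
    if title:
        print("\n" + underline(bold(title)))
    for metric, value in result.items():
        print(f"{metric:<24} {value}")

print_results({"ops": 1200, "throughput": "40 MiB/s"},
              title="Benchmark Results (100% GET):")
```

Folding the heading into the helper keeps the PUT, GET, and MIXED paths symmetric, as the diff above shows.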
114 changes: 113 additions & 1 deletion python/pyaisloader/pyaisloader/main.py
@@ -2,6 +2,7 @@
import pkg_resources

from pyaisloader.benchmark import PutGetMixedBenchmark, ListBenchmark
from pyaisloader.pytorch_benchmark import AISDatasetBenchmark, AISIterDatasetBenchmark
from pyaisloader.const import PROVIDERS
from pyaisloader.client_config import client

@@ -84,11 +85,25 @@ def main():
help="LIST objects benchmark",
description="This command runs a LIST benchmark.",
)
ais_dataset_parser = subparsers.add_parser(
"AIS_DATASET",
aliases=["ais_dataset"],
help="Map-style AISDataset benchmark",
description="This command runs an AISDataset benchmark.",
)
ais_iter_dataset_parser = subparsers.add_parser(
"AIS_ITER_DATASET",
aliases=["ais_iter_dataset"],
help="Iterable-style AISIterDataset benchmark",
description="This command runs an AISIterDataset benchmark.",
)

put_parser = prepend_default_arguments(put_parser)
get_parser = prepend_default_arguments(get_parser)
mixed_parser = prepend_default_arguments(mixed_parser)
list_parser = prepend_default_arguments(list_parser)
ais_dataset_parser = prepend_default_arguments(ais_dataset_parser)
ais_iter_dataset_parser = prepend_default_arguments(ais_iter_dataset_parser)

put_parser.add_argument(
"-min",
@@ -187,10 +202,77 @@ def main():
help="Number of objects bucket should contain prior to benchmark start",
)

ais_dataset_parser.add_argument(
"-min",
"--minsize",
type=parse_size,
required=False,
help="Minimum size of objects to be PUT in bucket (if bucket is smaller than total size)",
)
ais_dataset_parser.add_argument(
"-max",
"--maxsize",
type=parse_size,
required=False,
help="Maximum size of objects to be PUT in bucket (if bucket is smaller than total size)",
)
ais_dataset_parser.add_argument(
"-s",
"--totalsize",
type=parse_size,
required=False,
help="Total size to which the bucket should be filled prior to start",
)
ais_dataset_parser.add_argument(
"-d",
"--duration",
type=parse_time,
required=True,
help="Duration for which benchmark should be run",
)

ais_iter_dataset_parser.add_argument(
"-min",
"--minsize",
type=parse_size,
required=False,
help="Minimum size of objects to be PUT in bucket (if bucket is smaller than total size)",
)
ais_iter_dataset_parser.add_argument(
"-max",
"--maxsize",
type=parse_size,
required=False,
help="Maximum size of objects to be PUT in bucket (if bucket is smaller than total size)",
)
ais_iter_dataset_parser.add_argument(
"-s",
"--totalsize",
type=parse_size,
required=False,
help="Total size to which the bucket should be filled prior to start",
)
ais_iter_dataset_parser.add_argument(
"-d",
"--duration",
type=parse_time,
required=True,
help="Duration for which benchmark should be run",
)
ais_iter_dataset_parser.add_argument(
"-i",
"--iterations",
type=int,
required=False,
help="Number of iterations over the dataset to run",
)

put_parser = append_default_arguments(put_parser)
get_parser = append_default_arguments(get_parser)
mixed_parser = append_default_arguments(mixed_parser)
list_parser = append_default_arguments(list_parser)
ais_dataset_parser = append_default_arguments(ais_dataset_parser)
ais_iter_dataset_parser = append_default_arguments(ais_iter_dataset_parser)

args = parser.parse_args()

@@ -221,7 +303,16 @@ def main():

benchmark_type = args.type.lower()

if benchmark_type in ["put", "get", "mixed", "p", "g", "m"]:
if benchmark_type in [
"put",
"get",
"mixed",
"p",
"g",
"m",
"ais_dataset",
"ais_iter_dataset",
]:
if benchmark_type in ["put", "p"]:
benchmark = PutGetMixedBenchmark(
put_pct=100,
@@ -244,6 +335,27 @@ def main():
workers=args.workers,
cleanup=args.cleanup,
)
elif benchmark_type == "ais_dataset":
benchmark = AISDatasetBenchmark(
minsize=args.minsize,
maxsize=args.maxsize,
duration=args.duration,
totalsize=args.totalsize,
bucket=bucket,
workers=args.workers,
cleanup=args.cleanup,
)
elif benchmark_type == "ais_iter_dataset":
benchmark = AISIterDatasetBenchmark(
minsize=args.minsize,
maxsize=args.maxsize,
duration=args.duration,
iterations=args.iterations,
totalsize=args.totalsize,
bucket=bucket,
workers=args.workers,
cleanup=args.cleanup,
)
else:
benchmark = PutGetMixedBenchmark(
put_pct=args.putpct,
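The dispatch in `main.py` works because argparse records the subcommand name exactly as the user typed it (canonical name or alias), and `args.type.lower()` maps both spellings to the same key. A minimal, self-contained sketch of that pattern:

```python
import argparse

parser = argparse.ArgumentParser(prog="pyaisloader")
subparsers = parser.add_subparsers(dest="type", required=True)

# Each benchmark type gets a canonical uppercase name plus a lowercase alias.
for name, alias in [("AIS_DATASET", "ais_dataset"),
                    ("AIS_ITER_DATASET", "ais_iter_dataset")]:
    subparsers.add_parser(name, aliases=[alias])

# Both spellings normalize to the same lowercase dispatch key.
for argv in (["AIS_DATASET"], ["ais_dataset"]):
    args = parser.parse_args(argv)
    print(args.type.lower())  # ais_dataset (both times)
```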