Skip to content

Commit

Permalink
Add HPU specific changes to benchmark_latency.py (#436)
Browse files Browse the repository at this point in the history
Add support for HPU FP8 in the `benchmark_latency.py` script. Limit
`max_num_seqs` to the `batch_size`, since there will be no more
concurrent requests than that.
  • Loading branch information
kdamaszk authored Oct 28, 2024
1 parent 3a55e77 commit 4fd5c4c
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def main(args: argparse.Namespace):
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
max_num_seqs=args.batch_size,
)

sampling_params = SamplingParams(
Expand Down Expand Up @@ -179,7 +180,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
Expand Down

0 comments on commit 4fd5c4c

Please sign in to comment.