diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 79a48b2a1a845..30373b119a2ca 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -47,6 +47,7 @@ def main(args: argparse.Namespace): distributed_executor_backend=args.distributed_executor_backend, otlp_traces_endpoint=args.otlp_traces_endpoint, enable_prefix_caching=args.enable_prefix_caching, + max_num_seqs=args.batch_size, ) sampling_params = SamplingParams( @@ -179,7 +180,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default="auto", help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '