Skip to content

Commit

Permalink
Add HPU specific changes to benchmark_latency.py (#436)
Browse files Browse the repository at this point in the history
Add support for HPU FP8 in the `benchmark_latency.py` script. Limit
`max_num_seqs` to the `batch_size`, since there will be no more
concurrent requests than that.
  • Loading branch information
kdamaszk authored Oct 28, 2024
1 parent 3a55e77 commit 4fd5c4c
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def main(args: argparse.Namespace):
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
max_num_seqs=args.batch_size,
)

sampling_params = SamplingParams(
Expand Down Expand Up @@ -179,7 +180,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
Expand Down

0 comments on commit 4fd5c4c

Please sign in to comment.