From 0a477ab6e0cdb3ff468c9a1a6dd44e9494770041 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Thu, 6 Feb 2025 15:12:35 +0000 Subject: [PATCH 1/2] update default envvar setting for deepseek-r1 Signed-off-by: Chendi Xue --- examples/offline_inference.py | 1 + vllm/envs.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference.py b/examples/offline_inference.py index c271b465d3af6..e76bfce2765f3 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -2,6 +2,7 @@ os.environ["HABANA_VISIBLE_DEVICES"] = "ALL" os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true" +os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1" os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1" os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1" diff --git a/vllm/envs.py b/vllm/envs.py index 25098070b00c9..da54f71672b5a 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -529,7 +529,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # matrices to match the activation type. This can lead to higher memory and # compute usage but better preserves the accuracy of the original model. "VLLM_MLA_DISABLE_REQUANTIZATION": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))) + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "1"))) } # end-env-vars-definition diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 23d9f49f8ddbb..8b136de33e496 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -415,7 +415,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None - self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 1)) + self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 4)) def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, From 866167d4fc704c0ff63a90ebcaefc9bba9f55b35 Mon Sep 17 00:00:00 2001 From: Chendi Xue Date: Thu, 6 Feb 2025 15:20:44 +0000 Subject: [PATCH 2/2] add profile script and json Signed-off-by: Chendi Xue --- scripts/profile.sh | 6 +++ scripts/profile_api_trace_analyzer.json | 65 +++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 scripts/profile.sh create mode 100644 scripts/profile_api_trace_analyzer.json diff --git a/scripts/profile.sh b/scripts/profile.sh new file mode 100644 index 0000000000000..33dcdcda334de --- /dev/null +++ b/scripts/profile.sh @@ -0,0 +1,6 @@ +#!/bin/bash +cur_path=$(pwd) +HABANA_PROFILE_WRITE_HLTV=1 HABANA_PROFILE=1 +VLLM_PT_PROFILE=decode_128_1024_t \ +HABANA_PROF_CONFIG=${cur_path}/profile_api_trace_analyzer.json \ +python ${cur_path}/run_example_tp.py \ No newline at end of file diff --git a/scripts/profile_api_trace_analyzer.json b/scripts/profile_api_trace_analyzer.json new file mode 100644 index 0000000000000..c7d0455cab797 --- /dev/null +++ b/scripts/profile_api_trace_analyzer.json @@ -0,0 +1,65 @@ +{ + "Plugins": [ + { + "enable": true, + "lib": "libhost_profiler.so", + "name": "HostProfiler", + "values": { + "start_disabled": { + "value": true + } + } + }, + { + "enable": true, + "lib": "libhw_trace.so", + "name": "HwTrace", + "values": { + "archProfileUnits": { + "gaudi3": { + "CS": { + "Gaudi3CSAdvancedProfiling": { + "value": 0 + } + }, + "NIC": { + "enable": { + "value": true + } + } + }, + "gaudi2": { + "FW_events": { + "enable": { + "value": true + } + }, + "NIC": { + "enable": { + "value": true + } + } + } + }, + "generalOptions": { + "arch": { + "value": "gaudi3" + }, + "profilePhase": { + "value": "profileApi" + }, + "traceBufferSize": { + "value": "0x8000000" + } + }, + "parseOptions": { + "traceAnalyzer": { + "traceAnalyzerXlsx": { + "value": true + } + } + } + } + } + ] +} \ No newline at end of file