Skip to content

Commit

Permalink
[Deepseek r1] update scripts to make sure it runs with default env var (
Browse files Browse the repository at this point in the history
HabanaAI#790)

Signed-off-by: Chendi Xue <[email protected]>
  • Loading branch information
xuechendi authored and yangulei committed Feb 7, 2025
1 parent 33fc2c7 commit d4ad5fb
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 2 deletions.
1 change: 1 addition & 0 deletions examples/offline_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Configure Habana / vLLM / Ray behaviour through environment variables.
# NOTE(review): these must be assigned before the libraries that read them are
# initialised — confirm this span sits at the top of the script, right after
# `import os`.
os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"  # presumably exposes every HPU to this process — confirm against Habana runtime docs
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"  # PT HPU backend flag; assumed needed for multi-card collectives — TODO confirm
# Skip requantizing the MLA weight matrices to match the activation type:
# costs memory/compute but better preserves the original model's accuracy
# (see the VLLM_MLA_DISABLE_REQUANTIZATION entry in vllm/envs.py).
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"  # keep Ray worker logs off the driver's console
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"  # don't let unhandled Ray task errors abort the run

Expand Down
6 changes: 6 additions & 0 deletions scripts/profile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Run run_example_tp.py under the Habana profiler, writing an .hltv trace
# using the profiler config JSON that sits next to this script.
cur_path=$(pwd)

# All profiler variables must belong to the SAME command prefix as the
# `python` invocation. The previous version ended the first line without a
# trailing `\`, so HABANA_PROFILE_WRITE_HLTV and HABANA_PROFILE were executed
# as a standalone assignments-only command: they became plain (unexported)
# shell variables and were never visible to the Python process.
HABANA_PROFILE_WRITE_HLTV=1 \
HABANA_PROFILE=1 \
VLLM_PT_PROFILE=decode_128_1024_t \
HABANA_PROF_CONFIG="${cur_path}/profile_api_trace_analyzer.json" \
python "${cur_path}/run_example_tp.py"
65 changes: 65 additions & 0 deletions scripts/profile_api_trace_analyzer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"Plugins": [
{
"enable": true,
"lib": "libhost_profiler.so",
"name": "HostProfiler",
"values": {
"start_disabled": {
"value": true
}
}
},
{
"enable": true,
"lib": "libhw_trace.so",
"name": "HwTrace",
"values": {
"archProfileUnits": {
"gaudi3": {
"CS": {
"Gaudi3CSAdvancedProfiling": {
"value": 0
}
},
"NIC": {
"enable": {
"value": true
}
}
},
"gaudi2": {
"FW_events": {
"enable": {
"value": true
}
},
"NIC": {
"enable": {
"value": true
}
}
}
},
"generalOptions": {
"arch": {
"value": "gaudi3"
},
"profilePhase": {
"value": "profileApi"
},
"traceBufferSize": {
"value": "0x8000000"
}
},
"parseOptions": {
"traceAnalyzer": {
"traceAnalyzerXlsx": {
"value": true
}
}
}
}
}
]
}
2 changes: 1 addition & 1 deletion vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
# matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "1")))
}

# end-env-vars-definition
Expand Down
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
def __init__(self, quant_config: Fp8Config):
self.quant_config = quant_config
self.block_quant = self.quant_config.weight_block_size is not None
self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 1))
self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 4))

def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
intermediate_size_per_partition: int,
Expand Down

0 comments on commit d4ad5fb

Please sign in to comment.