From 0a477ab6e0cdb3ff468c9a1a6dd44e9494770041 Mon Sep 17 00:00:00 2001
From: Chendi Xue <chendi.xue@intel.com>
Date: Thu, 6 Feb 2025 15:12:35 +0000
Subject: [PATCH 1/2] update default envvar setting for deepseek-r1

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
---
 examples/offline_inference.py                  | 1 +
 vllm/envs.py                                   | 2 +-
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index c271b465d3af6..e76bfce2765f3 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -2,6 +2,7 @@
 
 os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
 os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
+os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
 os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
 os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"
 
diff --git a/vllm/envs.py b/vllm/envs.py
index 25098070b00c9..da54f71672b5a 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -529,7 +529,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # matrices to match the activation type. This can lead to higher memory and
     # compute usage but better preserves the accuracy of the original model.
     "VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "1")))
 }
 
 # end-env-vars-definition
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 23d9f49f8ddbb..8b136de33e496 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -415,7 +415,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
         self.block_quant = self.quant_config.weight_block_size is not None
-        self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 1))
+        self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 4))
 
     def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
                        intermediate_size_per_partition: int,

From 866167d4fc704c0ff63a90ebcaefc9bba9f55b35 Mon Sep 17 00:00:00 2001
From: Chendi Xue <chendi.xue@intel.com>
Date: Thu, 6 Feb 2025 15:20:44 +0000
Subject: [PATCH 2/2] add profile script and json

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
---
 scripts/profile.sh                      |  6 +++
 scripts/profile_api_trace_analyzer.json | 65 +++++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 scripts/profile.sh
 create mode 100644 scripts/profile_api_trace_analyzer.json

diff --git a/scripts/profile.sh b/scripts/profile.sh
new file mode 100644
index 0000000000000..33dcdcda334de
--- /dev/null
+++ b/scripts/profile.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+cur_path=$(pwd)  # paths below resolve against the caller's working directory
+HABANA_PROFILE_WRITE_HLTV=1 HABANA_PROFILE=1 \
+VLLM_PT_PROFILE=decode_128_1024_t \
+HABANA_PROF_CONFIG="${cur_path}/profile_api_trace_analyzer.json" \
+python "${cur_path}/run_example_tp.py"
\ No newline at end of file
diff --git a/scripts/profile_api_trace_analyzer.json b/scripts/profile_api_trace_analyzer.json
new file mode 100644
index 0000000000000..c7d0455cab797
--- /dev/null
+++ b/scripts/profile_api_trace_analyzer.json
@@ -0,0 +1,65 @@
+{
+    "Plugins": [
+        {
+            "enable": true,
+            "lib": "libhost_profiler.so",
+            "name": "HostProfiler",
+            "values": {
+                "start_disabled": {
+                    "value": true
+                }
+            }
+        },
+        {
+            "enable": true,
+            "lib": "libhw_trace.so",
+            "name": "HwTrace",
+            "values": {
+                "archProfileUnits": {
+                    "gaudi3": {
+                        "CS": {
+                            "Gaudi3CSAdvancedProfiling": {
+                                "value": 0
+                            }
+                        },
+                        "NIC": {
+                            "enable": {
+                                "value": true
+                            }
+                        }
+                    },
+                    "gaudi2": {
+                        "FW_events": {
+                            "enable": {
+                                "value": true
+                            }
+                        },
+                        "NIC": {
+                            "enable": {
+                                "value": true
+                            }
+                        }
+                    }
+                },
+                "generalOptions": {
+                    "arch": {
+                        "value": "gaudi3"
+                    },
+                    "profilePhase": {
+                        "value": "profileApi"
+                    },
+                    "traceBufferSize": {
+                        "value": "0x8000000"
+                    }
+                },
+                "parseOptions": {
+                    "traceAnalyzer": {
+                        "traceAnalyzerXlsx": {
+                            "value": true
+                        }
+                    }
+                }
+            }
+        }
+    ]
+}
\ No newline at end of file