
[Deepseek r1] update scripts to make sure it runs with default env var #790

Merged: 2 commits, Feb 6, 2025
1 change: 1 addition & 0 deletions examples/offline_inference.py
@@ -2,6 +2,7 @@

os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

@@ -34,7 +35,7 @@
trust_remote_code=True,
max_model_len=1024)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.

Check failure on line 38 in examples/offline_inference.py (GitHub Actions / pre-commit): Ruff (E501) examples/offline_inference.py:38:81: E501 Line too long (84 > 80)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
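For context, a minimal, self-contained sketch of the pattern this example relies on: the HPU/Ray environment variables are set before vLLM is imported so they take effect at initialization. The model id and prompt below are placeholders, not taken from this PR, and the long comment is wrapped to stay under the 80-character limit flagged by the Ruff check above.

import os

# Set before importing vLLM so the values are picked up at initialization.
os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "1"
os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

from vllm import LLM, SamplingParams  # noqa: E402

prompts = ["Hello, my name is"]  # placeholder prompt
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
llm = LLM(model="deepseek-ai/DeepSeek-R1",  # placeholder model id
          trust_remote_code=True,
          max_model_len=1024)
# Generate texts from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated text,
# and other information (comment wrapped to satisfy E501).
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)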
6 changes: 6 additions & 0 deletions scripts/profile.sh
@@ -0,0 +1,6 @@
#!/bin/bash
cur_path=$(pwd)
HABANA_PROFILE_WRITE_HLTV=1 HABANA_PROFILE=1 \
VLLM_PT_PROFILE=decode_128_1024_t \
HABANA_PROF_CONFIG=${cur_path}/profile_api_trace_analyzer.json \
python ${cur_path}/run_example_tp.py
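A rough Python equivalent of scripts/profile.sh, shown only to make the environment explicit; it assumes run_example_tp.py and the JSON config sit in the current directory, exactly as the shell script does.

import os
import subprocess

cur_path = os.getcwd()
env = dict(
    os.environ,
    HABANA_PROFILE_WRITE_HLTV="1",
    HABANA_PROFILE="1",
    VLLM_PT_PROFILE="decode_128_1024_t",
    HABANA_PROF_CONFIG=f"{cur_path}/profile_api_trace_analyzer.json",
)
# Same command as the shell script, with the profiling variables applied
# only to the child process.
subprocess.run(["python", f"{cur_path}/run_example_tp.py"], env=env, check=True)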
65 changes: 65 additions & 0 deletions scripts/profile_api_trace_analyzer.json
@@ -0,0 +1,65 @@
{
    "Plugins": [
        {
            "enable": true,
            "lib": "libhost_profiler.so",
            "name": "HostProfiler",
            "values": {
                "start_disabled": {
                    "value": true
                }
            }
        },
        {
            "enable": true,
            "lib": "libhw_trace.so",
            "name": "HwTrace",
            "values": {
                "archProfileUnits": {
                    "gaudi3": {
                        "CS": {
                            "Gaudi3CSAdvancedProfiling": {
                                "value": 0
                            }
                        },
                        "NIC": {
                            "enable": {
                                "value": true
                            }
                        }
                    },
                    "gaudi2": {
                        "FW_events": {
                            "enable": {
                                "value": true
                            }
                        },
                        "NIC": {
                            "enable": {
                                "value": true
                            }
                        }
                    }
                },
                "generalOptions": {
                    "arch": {
                        "value": "gaudi3"
                    },
                    "profilePhase": {
                        "value": "profileApi"
                    },
                    "traceBufferSize": {
                        "value": "0x8000000"
                    }
                },
                "parseOptions": {
                    "traceAnalyzer": {
                        "traceAnalyzerXlsx": {
                            "value": true
                        }
                    }
                }
            }
        }
    ]
}
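As a quick sanity check (not part of the PR), the config parses as plain JSON and the HwTrace trace buffer of 0x8000000 bytes works out to 128 MiB:

import json

with open("scripts/profile_api_trace_analyzer.json") as f:
    cfg = json.load(f)

# Locate the HwTrace plugin and decode its hex trace-buffer size.
hw_trace = next(p for p in cfg["Plugins"] if p["name"] == "HwTrace")
size = int(hw_trace["values"]["generalOptions"]["traceBufferSize"]["value"], 16)
print(size // (1024 * 1024), "MiB")  # -> 128 MiB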
2 changes: 1 addition & 1 deletion vllm/envs.py
@@ -529,7 +529,7 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
# matrices to match the activation type. This can lead to higher memory and
# compute usage but better preserves the accuracy of the original model.
"VLLM_MLA_DISABLE_REQUANTIZATION":
-    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0")))
+    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "1")))
}

# end-env-vars-definition
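A hedged usage note, not part of the diff: the value is parsed with bool(int(...)), so overriding the new default means supplying a literal "0" or "1" before vLLM evaluates the variable; non-integer strings such as "false" would raise a ValueError.

import os

# Opt back into requantization despite the new default of "1".
os.environ["VLLM_MLA_DISABLE_REQUANTIZATION"] = "0"

# Same parsing rule as the envs.py entry above.
flag = bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "1")))
print(flag)  # -> False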
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/fp8.py
@@ -415,7 +415,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
def __init__(self, quant_config: Fp8Config):
self.quant_config = quant_config
self.block_quant = self.quant_config.weight_block_size is not None
-        self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 1))
+        self.moe_n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 4))

def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
intermediate_size_per_partition: int,
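To illustrate what a larger default VLLM_MOE_N_SLICE buys, here is a simplified sketch of the general idea of slicing the MoE expert computation into chunks so fewer temporary buffers are live at once. This is an illustration only, not vLLM's actual FusedMoE kernel, and the helper names are hypothetical.

import os

def run_experts_in_slices(expert_fns, hidden_states, n_slice=None):
    """Apply every expert, n_slice groups at a time, to bound peak memory."""
    if n_slice is None:
        n_slice = int(os.environ.get("VLLM_MOE_N_SLICE", 4))
    num_experts = len(expert_fns)
    per_slice = (num_experts + n_slice - 1) // n_slice  # experts per chunk
    outputs = []
    for start in range(0, num_experts, per_slice):
        # Only this slice of experts is evaluated before moving on, so the
        # temporaries for at most `per_slice` experts exist at any one time.
        for fn in expert_fns[start:start + per_slice]:
            outputs.append(fn(hidden_states))
    return outputs

# Toy example: 8 "experts" processed in 4 slices of 2.
experts = [lambda x, i=i: [v * (i + 1) for v in x] for i in range(8)]
print(len(run_experts_in_slices(experts, [1.0, 2.0])))  # -> 8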