From da3442dd3267f80196e66c27ad94a85b47fe49fa Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 13 Nov 2023 13:07:45 +0800
Subject: [PATCH] Fix ONNXRT example with upgraded optimum 1.14.0 (#1381)

Signed-off-by: Mengni Wang
Signed-off-by: yuwenzho
---
 .../quantization/ptq_static/prepare_model.py  | 48 +++++++++++-----
 .../llama/quantization/weight_only/README.md  |  2 +-
 .../quantization/weight_only/prepare_model.py | 55 +++++++++++++++++++
 .../test_weight_only_adaptor.py               |  4 +-
 test/model/test_onnx_model.py                 |  6 +-
 5 files changed, 97 insertions(+), 18 deletions(-)
 create mode 100644 examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py

diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/prepare_model.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/prepare_model.py
index b951a19bc23..92b423f351a 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/prepare_model.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/prepare_model.py
@@ -1,6 +1,9 @@
 import argparse
 import os
 import subprocess
+import optimum.version
+from packaging.version import Version
+OPTIMUM114_VERSION = Version("1.14.0")
 
 
 def parse_arguments():
@@ -12,20 +15,37 @@
 
 def prepare_model(input_model, output_model):
     print("\nexport model...")
-    subprocess.run(
-        [
-            "optimum-cli",
-            "export",
-            "onnx",
-            "--model",
-            f"{input_model}",
-            "--task",
-            "text-generation-with-past",
-            f"{output_model}",
-        ],
-        stdout=subprocess.PIPE,
-        text=True,
-    )
+    if Version(optimum.version.__version__) >= OPTIMUM114_VERSION:
+        subprocess.run(
+            [
+                "optimum-cli",
+                "export",
+                "onnx",
+                "--model",
+                f"{input_model}",
+                "--task",
+                "text-generation-with-past",
+                "--legacy",
+                f"{output_model}",
+            ],
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+    else:
+        subprocess.run(
+            [
+                "optimum-cli",
+                "export",
+                "onnx",
+                "--model",
+                f"{input_model}",
+                "--task",
+                "text-generation-with-past",
+                f"{output_model}",
+            ],
+            stdout=subprocess.PIPE,
+            text=True,
+        )
 
     assert os.path.exists(output_model), f"{output_model} doesn't exist!"
 
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
index e15d0e3c703..68227fcffd4 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
@@ -17,7 +17,7 @@ pip install -r requirements.txt
 ## 2. Prepare Model
 
 ```bash
-optimum-cli export onnx --model decapoda-research/llama-7b-hf --task text-generation-with-past ./llama_7b
+python prepare_model.py --input_model="decapoda-research/llama-7b-hf" --output_model="./llama_7b"
 ```
 
 # Run
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py
new file mode 100644
index 00000000000..92b423f351a
--- /dev/null
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py
@@ -0,0 +1,55 @@
+import argparse
+import os
+import subprocess
+import optimum.version
+from packaging.version import Version
+OPTIMUM114_VERSION = Version("1.14.0")
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_model", type=str, required=False, default="")
+    parser.add_argument("--output_model", type=str, required=True)
+    return parser.parse_args()
+
+
+def prepare_model(input_model, output_model):
+    print("\nexport model...")
+    if Version(optimum.version.__version__) >= OPTIMUM114_VERSION:
+        subprocess.run(
+            [
+                "optimum-cli",
+                "export",
+                "onnx",
+                "--model",
+                f"{input_model}",
+                "--task",
+                "text-generation-with-past",
+                "--legacy",
+                f"{output_model}",
+            ],
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+    else:
+        subprocess.run(
+            [
+                "optimum-cli",
+                "export",
+                "onnx",
+                "--model",
+                f"{input_model}",
+                "--task",
+                "text-generation-with-past",
+                f"{output_model}",
+            ],
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+
+    assert os.path.exists(output_model), f"{output_model} doesn't exist!"
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    prepare_model(args.input_model, args.output_model)
diff --git a/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py b/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
index 6d2201df104..6da0152ded8 100644
--- a/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
@@ -38,7 +38,9 @@ def __iter__(self):
 class TestWeightOnlyAdaptor(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        cmd = "optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation gptj/"
+        cmd = (
+            "optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation --legacy gptj/"
+        )
         p = subprocess.Popen(
             cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
         )  # nosec
diff --git a/test/model/test_onnx_model.py b/test/model/test_onnx_model.py
index 4c891781c35..cb4e1a6c704 100644
--- a/test/model/test_onnx_model.py
+++ b/test/model/test_onnx_model.py
@@ -193,7 +193,9 @@ def setUp(self):
         model = onnx.helper.make_model(graph, **{"opset_imports": [onnx.helper.make_opsetid("", 14)]})
         self.matmul_reshape_model = model
 
-        cmd = "optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation gptj/"
+        cmd = (
+            "optimum-cli export onnx --model hf-internal-testing/tiny-random-gptj --task text-generation --legacy gptj/"
+        )
         p = subprocess.Popen(
             cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
         )  # nosec
@@ -216,7 +218,7 @@ def test_hf_model(self):
 
         config = AutoConfig.from_pretrained("hf_test")
         sessions = ORTModelForCausalLM.load_model("hf_test/decoder_model.onnx")
-        model = ORTModelForCausalLM(sessions[0], config, "hf_test", use_cache=False, use_io_binding=False)
+        model = ORTModelForCausalLM(sessions, config, model_save_dir="hf_test", use_cache=False, use_io_binding=False)
         self.assertNotEqual(model, None)
 
     def test_nodes(self):
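
For readers applying this patch by hand: the version gate added to both copies of `prepare_model.py` duplicates the entire `subprocess.run` call just to insert one flag. Below is a minimal sketch of the same logic, assuming only what the diff itself shows (optimum >= 1.14.0 needs `--legacy`, and the argument lists are otherwise identical); `export_model` is a hypothetical name, not code from this commit.

```python
# Sketch only -- equivalent to the patched prepare_model(), with the
# command list built once and "--legacy" appended conditionally.
import os
import subprocess

import optimum.version
from packaging.version import Version

OPTIMUM114_VERSION = Version("1.14.0")


def export_model(input_model: str, output_model: str) -> None:  # hypothetical helper
    cmd = [
        "optimum-cli",
        "export",
        "onnx",
        "--model",
        input_model,
        "--task",
        "text-generation-with-past",
    ]
    if Version(optimum.version.__version__) >= OPTIMUM114_VERSION:
        # optimum 1.14.0 changed the default text-generation ONNX export;
        # --legacy keeps producing the separate decoder_model.onnx that the
        # tests in this patch load (e.g. "hf_test/decoder_model.onnx").
        cmd.append("--legacy")
    cmd.append(output_model)
    subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
    assert os.path.exists(output_model), f"{output_model} doesn't exist!"
```

Building one argument list and appending the flag keeps the two code paths from drifting apart if the export arguments change again.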