From ed920135c8490440453a64e197fce5e1e6459225 Mon Sep 17 00:00:00 2001
From: Reza Salehi <mrsalehi@cs.washington.edu>
Date: Tue, 15 Oct 2024 21:56:09 -0700
Subject: [PATCH] [Bugfix] Molmo text-only input bug fix (#9397)

Co-authored-by: sanghol <sanghol@allenai.org>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
---
 vllm/model_executor/models/molmo.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index ccfee165368e7..b04916f17088c 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -946,9 +946,12 @@ def pad_images(
 
 
 def input_processor_for_molmo(ctx: InputContext, llm_inputs: LLMInputs):
-    prompt = llm_inputs["prompt"]
-    multi_modal_data = llm_inputs.get("multi_modal_data")
-    image = multi_modal_data.get("image")
+    prompt = llm_inputs.get("prompt", None)
+    multi_modal_data = llm_inputs.get("multi_modal_data", None)
+    if multi_modal_data is not None:
+        image = multi_modal_data.get("image", None)
+    else:
+        image = None
     processor = cached_get_processor(ctx.model_config.model,
                                      trust_remote_code=True,
                                      revision=ctx.model_config.code_revision)