Move SetInputs to Generator #1205

Draft · wants to merge 2 commits into main
2 changes: 1 addition & 1 deletion examples/python/phi3v.py
@@ -91,10 +91,10 @@ def run(args: argparse.Namespace):
 
     print("Generating response...")
     params = og.GeneratorParams(model)
-    params.set_inputs(inputs)
    params.set_search_options(max_length=7680)
 
     generator = og.Generator(model, params)
+    generator.set_inputs(inputs)
     start_time = time.time()
 
     while not generator.is_done():
55 changes: 21 additions & 34 deletions src/generators.cpp
@@ -229,23 +229,6 @@ void GeneratorParams::TryGraphCapture(int max_bs) {
   }
 }
 
-void GeneratorParams::SetInputs(const NamedTensors& named_tensors) {
-  if (config.model.type == "gpt2" || config.model.type == "llama" || config.model.type == "gemma" || config.model.type == "gemma2" || config.model.type == "mistral" || config.model.type == "phi" || config.model.type == "phi3" || config.model.type == "phi3small" || config.model.type == "phimoe" || config.model.type == "qwen2" || config.model.type == "decoder-pipeline")
-    throw std::runtime_error("Please use generator.AppendTokens for " + config.model.type + ". SetInputs is not supported for this model type.");
-
-  for (const auto& [name, tensor] : named_tensors) {
-    if (name == Config::Defaults::InputIdsName) {
-      aux_input_ids = cpu_span<int32_t>(tensor->ort_tensor_->GetTensorMutableData<int32_t>(),
-                                        tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetElementCount());
-    } else {
-      // If the nominal name is found in the map, use the graph name.
-      // Else, use the nominal name as the graph name.
-      [[maybe_unused]] const auto [graph_name, found] = config.GetGraphName(name);
-      extra_inputs.push_back({graph_name, tensor});
-    }
-  }
-}
-
 std::unique_ptr<Generator> CreateGenerator(const Model& model, const GeneratorParams& params) {
   return std::make_unique<Generator>(model, params);
 }
@@ -268,11 +251,6 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_
 
   search_ = CreateSearch(params);
   state_ = model.CreateState(search_->GetSequenceLengths(), params);  // Search sequence lengths set when creating state
-
-  // Temporary solution for multimodal and whisper models
-  if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) {
-    AuxAppendTokens(params.aux_input_ids);
-  }
 }
 
 DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t> input_ids) {
@@ -291,25 +269,34 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
   return input_ids_device;
 }
 
-// TODO(aciddelgado): Remove this function once SetInputs is moved to generator
-void Generator::AuxAppendTokens(cpu_span<const int32_t> input_ids) {
+void Generator::SetInputs(const NamedTensors& named_tensors) {
   ThrowErrorIfSessionTerminated(state_->session_terminated_);
-  if (input_ids.size() == 0)
-    throw std::runtime_error("input_ids is empty");
-  if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1)
-    throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. To call AppendTokens again, use RewindToLength(0)");
+  if (model_->config_->model.type == "gpt2" || model_->config_->model.type == "llama" || model_->config_->model.type == "gemma" || model_->config_->model.type == "gemma2" || model_->config_->model.type == "mistral" || model_->config_->model.type == "phi" || model_->config_->model.type == "phi3" || model_->config_->model.type == "phi3small" || model_->config_->model.type == "phimoe" || model_->config_->model.type == "qwen2" || model_->config_->model.type == "decoder-pipeline")
+    throw std::runtime_error("Please use generator.AppendTokens for " + model_->config_->model.type + ". SetInputs is not supported for this model type.");
 
-  auto input_ids_device = AllocateInputIdsOnDevice(input_ids);
-  search_->AppendTokens(input_ids_device);
-  computed_logits_ = false;
-  ComputeLogits(input_ids_device);
+  cpu_span<int32_t> aux_input_ids;
+  std::vector<Input> extra_inputs;
+  for (const auto& [name, tensor] : named_tensors) {
+    if (name == Config::Defaults::InputIdsName) {
+      aux_input_ids = cpu_span<int32_t>(tensor->ort_tensor_->GetTensorMutableData<int32_t>(),
+                                        tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetElementCount());
+    } else {
+      // If the nominal name is found in the map, use the graph name.
+      // Else, use the nominal name as the graph name.
+      [[maybe_unused]] const auto [graph_name, found] = model_->config_->GetGraphName(name);
+      extra_inputs.push_back({graph_name, tensor});
+    }
+  }
+  state_->SetExtraInputs(extra_inputs);
+  if (aux_input_ids.size() != 0)
+    AppendTokens(aux_input_ids, false);
 }
 
-void Generator::AppendTokens(cpu_span<const int32_t> input_ids) {
+void Generator::AppendTokens(cpu_span<const int32_t> input_ids, bool api_call) {
   ThrowErrorIfSessionTerminated(state_->session_terminated_);
   if (input_ids.size() == 0)
     throw std::runtime_error("input_ids is empty");
-  if (model_->config_->model.type == "whisper" || model_->config_->model.type == "phi3v")
+  if (api_call && (model_->config_->model.type == "whisper" || model_->config_->model.type == "phi3v"))
     throw std::runtime_error("Please use params.SetInputs for " + model_->config_->model.type + ". AppendTokens is not supported for this model type.");
   if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1)
     throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. To call AppendTokens again, use RewindToLength(0)");
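Note (not part of the diff): the control flow above can be summarized as a short caller-side sketch. CreateGenerator appears in the hunk above; CreateGeneratorParams is assumed to be the matching params factory, and named_tensors is assumed to come from a multimodal processor, as in the phi3v example earlier.

// Sketch only, assuming the surrounding API: driving the relocated SetInputs.
auto params = Generators::CreateGeneratorParams(*model);
auto generator = Generators::CreateGenerator(*model, *params);

// Extra inputs now go to the Generator, not the params. Internally, tensors
// other than "input_ids" are forwarded to state_->SetExtraInputs(), and an
// "input_ids" tensor is routed through AppendTokens(aux_input_ids, false),
// where api_call=false bypasses the public-API model-type check.
generator->SetInputs(named_tensors);

while (!generator->IsDone()) {
  generator->GenerateNextToken();
}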
22 changes: 8 additions & 14 deletions src/generators.h
@@ -63,6 +63,11 @@ enum struct DeviceType {
   QNN,
 };
 
+struct Input {
+  std::string name;
+  std::shared_ptr<Tensor> tensor;
+};
+
 std::string to_string(DeviceType device_type);
 DeviceInterface* GetDeviceInterface(DeviceType type);
 
@@ -81,8 +86,6 @@ struct GeneratorParams : std::enable_shared_from_this<GeneratorParams>, LeakChec
   DeviceType device_type{DeviceType::CPU};
   cudaStream_t cuda_stream{};
 
-  cpu_span<int32_t> aux_input_ids{};  // Intermediate solution to be used with SetInputs function for multimodal and whisper models
-
   struct Whisper {
     std::shared_ptr<Tensor> input_features;  // float32 [batch_size, number_of_mels, number_of_frames]
     std::shared_ptr<Tensor> alignment_heads;  // int32 [num_alignment_heads, 2]
@@ -92,18 +95,8 @@
 
   std::shared_ptr<GeneratorParams> external_owner_;  // Set to 'this' when created by the C API to preserve lifetime
 
-  struct Input {
-    std::string name;
-    std::shared_ptr<Tensor> tensor;
-  };
-
-  // A list of extra model inputs that will be matched at runtime based on name
-  std::vector<Input> extra_inputs;
-
   void TryGraphCapture(int max_bs);
 
-  void SetInputs(const NamedTensors& inputs);
-
  private:
   bool is_cuda_graph_enabled_{};
 };
@@ -112,7 +105,8 @@ struct Generator : LeakChecked<Generator> {
   Generator(const Model& model, const GeneratorParams& params);
 
   bool IsDone() const;
-  void AppendTokens(cpu_span<const int32_t> input_ids);
+  void SetInputs(const NamedTensors& inputs);
+  void AppendTokens(cpu_span<const int32_t> input_ids, bool api_call=true);
   void GenerateNextToken();
   void RewindToLength(size_t new_length);  // Rewind state to new_length
   DeviceSpan<float> GetLogits();
@@ -129,12 +123,12 @@
 
  private:
   DeviceSpan<int32_t> AllocateInputIdsOnDevice(cpu_span<const int32_t> input_ids);
-  void AuxAppendTokens(cpu_span<const int32_t> input_ids);
   void ComputeLogits(DeviceSpan<int32_t> next_tokens);
   enum Action { standard,   // Default, set in any other case
                 generated,  // Set after GenerateNextToken
                 rewound };  // Set after RewindToLength
   Action last_action_{standard};
+
 };
 
 struct OrtGlobals {
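The api_call default argument added above carries the routing logic. A sketch of the two call paths (illustrative, with token_ids assumed to be a cpu_span<const int32_t>):

// Public path: api_call defaults to true, so whisper/phi3v callers are
// rejected with a runtime_error and pointed at SetInputs instead.
generator.AppendTokens(token_ids);

// Internal path used by Generator::SetInputs: api_call=false skips the
// model-type check so multimodal input_ids can still reach the search.
generator.AppendTokens(token_ids, /*api_call=*/false);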
8 changes: 5 additions & 3 deletions src/models/captured_graph_pool.cpp
@@ -23,10 +23,12 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
     return nullptr;
   }
 
-  // Multiple generators can reserve graphs in parallel, so we need to make it thread saf
+  // Multiple generators can reserve graphs in parallel, so we need to make it thread safe
   std::unique_lock lock(captured_graph_mutex_);
 
-  auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length, params.search.num_beams, params.extra_inputs);
+  // TODO(aciddelgado): no more params.extra_inputs, how to continue?
+  std::vector<Input> temp_empty_extra_inputs;
+  auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length, params.search.num_beams, temp_empty_extra_inputs);
   auto& captured_graphs = captured_graphs_map_[*key];
 
   // If no graphs are available, create a graph with a new ID
@@ -90,7 +92,7 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
   }
 
   // Create the extra inputs
-  for (const auto& extra_input : params.extra_inputs) {
+  for (const auto& extra_input : temp_empty_extra_inputs) {
     auto first_dim = extra_input.tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetShape()[0];
     new_captured_graph->sb_extra_inputs_[extra_input.name] = std::make_unique<StaticBuffer>(allocator_device_, first_dim);
   }
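The TODO above is the open question in this draft: the captured-graph key previously hashed params.extra_inputs, which no longer exists, so the key is now built from an always-empty list and sb_extra_inputs_ is never populated. One hypothetical direction, not implemented anywhere in this PR, would be to defer key construction until the generator has received its inputs:

// Hypothetical sketch only: re-key the captured graph after Generator::SetInputs
// has delivered extra inputs to the State. GetExtraInputs() is an assumed
// accessor that does not exist in this diff.
const std::vector<Generators::Input>& extra_inputs = state.GetExtraInputs();
auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length,
                                              params.search.num_beams, extra_inputs);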
4 changes: 3 additions & 1 deletion src/models/captured_graph_pool.h
@@ -5,6 +5,7 @@
 #include <mutex>
 #include <unordered_map>
 #include "static_buffer.h"
+// #include "model.h"
 #include "../generators.h"
 
 // From boost http://www.boost.org/doc/libs/1_35_0/doc/html/hash/combine.html
@@ -29,7 +30,7 @@ struct InputKey {
 };
 
 struct CapturedGraphKey {
-  CapturedGraphKey(int max_batch_size, int max_length, int num_beams, const std::vector<Generators::GeneratorParams::Input>& extra_inputs)
+  CapturedGraphKey(int max_batch_size, int max_length, int num_beams, const std::vector<Generators::Input>& extra_inputs)
       : max_batch_size_(max_batch_size),
         max_length_(max_length),
         num_beams_(num_beams) {
@@ -100,6 +101,7 @@ struct CapturedGraphInfo;
 struct Config;
 struct SessionInfo;
 struct Model;
+struct State;
 
 struct CapturedGraphInfoRecycler {
   void operator()(CapturedGraphInfo* captured_graph_info);
1 change: 0 additions & 1 deletion src/models/decoder_only.cpp
@@ -22,7 +22,6 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan<
   position_inputs_.Add();
   logits_.Add();
   kv_cache_.Add();
-  extra_inputs_.Add();
 }
 
 DeviceSpan<float> DecoderOnly_State::Run(int total_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
2 changes: 0 additions & 2 deletions src/models/decoder_only.h
@@ -4,7 +4,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -33,7 +32,6 @@ struct DecoderOnly_State : State {
   Logits logits_{*this};
   DefaultKeyValueCache kv_cache_{*this};
   DefaultPositionInputs position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 
 }  // namespace Generators
1 change: 0 additions & 1 deletion src/models/decoder_only_pipeline.cpp
@@ -100,7 +100,6 @@ DecoderOnlyPipelineState::DecoderOnlyPipelineState(const DecoderOnlyPipelineMode
   if (key_value_cache_) {
     key_value_cache_->Add();
   }
-  extra_inputs_.Add();
 
   for ([[maybe_unused]] const auto& pipeline_model : model_.config_->model.decoder.pipeline) {
     pipeline_states_.emplace_back(std::make_unique<IntermediatePipelineState>(model_, params, pipeline_states_.size()));
2 changes: 0 additions & 2 deletions src/models/decoder_only_pipeline.h
@@ -8,7 +8,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -75,7 +74,6 @@ struct DecoderOnlyPipelineState : State {
   Logits logits_{*this};
   std::unique_ptr<KeyValueCache> key_value_cache_;
   std::unique_ptr<PositionInputs> position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 
 }  // namespace Generators
41 changes: 23 additions & 18 deletions src/models/extra_inputs.cpp
@@ -46,37 +46,42 @@ void PresetExtraInputs::Add() {
 }
 
 ExtraInputs::ExtraInputs(State& state)
-    : state_{state} {
-  extra_inputs_.reserve(state_.params_->extra_inputs.size());
+    : state_{state} {}
+
+#pragma warning(push)
+#pragma warning(disable : 4065)  // switch statement contains 'default' but no 'case' labels
+#pragma warning(disable : 4189)  // local variable is initialized but not referenced
+#pragma warning(disable : 4702)  // unreachable code
+
+void ExtraInputs::Add() {
+  // Nothing to do here...
+}
+
+void ExtraInputs::Update(const std::vector<Input>& extra_inputs) {
+  // Reserve extra inputs when received at runtime (from set_inputs)
+  extra_inputs_.reserve(extra_inputs.size());
 
   if (state_.GetCapturedGraphInfo()) {
-    owned_extra_inputs_.reserve(state_.params_->extra_inputs.size());
+    owned_extra_inputs_.reserve(extra_inputs.size());
 
-    for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
-      auto type_and_shape_info = state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorTypeAndShapeInfo();
-      const auto& input_name = state_.params_->extra_inputs[i].name;
+    for (int i = 0; i < extra_inputs.size(); ++i) {
+      auto type_and_shape_info = extra_inputs[i].tensor->ort_tensor_->GetTensorTypeAndShapeInfo();
+      const auto& input_name = extra_inputs[i].name;
 
       sb_extra_inputs_.emplace(input_name, state_.GetCapturedGraphInfo()->sb_extra_inputs_.at(input_name).get());
       owned_extra_inputs_.push_back(sb_extra_inputs_.at(input_name)->CreateTensorOnStaticBuffer(type_and_shape_info->GetShape(), type_and_shape_info->GetElementType()));
       extra_inputs_.push_back(owned_extra_inputs_.back().get());
     }
   } else {
     // We don't use graph capture, so simply use the existing pointers
-    for (auto& extra_input : state_.params_->extra_inputs) {
+    for (auto& extra_input : extra_inputs) {
       extra_inputs_.push_back(extra_input.tensor->ort_tensor_.get());
     }
   }
-}
 
-#pragma warning(push)
-#pragma warning(disable : 4065)  // switch statement contains 'default' but no 'case' labels
-#pragma warning(disable : 4189)  // local variable is initialized but not referenced
-#pragma warning(disable : 4702)  // unreachable code
-
-void ExtraInputs::Add() {
   // Add extra user inputs
-  for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
-    state_.input_names_.push_back(state_.params_->extra_inputs[i].name.c_str());
+  for (int i = 0; i < extra_inputs.size(); ++i) {
+    state_.input_names_.push_back(extra_inputs[i].name.c_str());
     state_.inputs_.push_back(extra_inputs_[i]);
   }
 
@@ -93,7 +98,7 @@ void ExtraInputs::Add() {
       ComPtr<ID3D12Resource> target_resource;
       Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, extra_inputs_[i]->GetTensorMutableRawData(), &target_resource));
 
-      auto source = std::span(state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorData<const uint8_t>(), copy_size_in_bytes);
+      auto source = std::span(extra_inputs[i].tensor->ort_tensor_->GetTensorData<const uint8_t>(), copy_size_in_bytes);
 
       model_.GetDmlUploadHeap()->BeginUploadToGpu(
           target_resource.Get(),
@@ -107,7 +112,7 @@ void ExtraInputs::Add() {
     case DeviceType::CUDA: {
       cudaMemcpyAsync(
          extra_inputs_[i]->GetTensorMutableRawData(),
-          state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(),
+          extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(),
          copy_size_in_bytes,
          cudaMemcpyHostToDevice,
          model_.cuda_stream_);
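The state_->SetExtraInputs(extra_inputs) call in generators.cpp ends up in Update() above, but the bridge on State itself is not shown in the visible part of this diff. A minimal plausible form, stated as an assumption rather than the PR's actual definition:

// Assumed bridge (definition not shown in this diff): State forwards the
// runtime extra inputs to its ExtraInputs member, whose Update() reserves
// buffers, registers input names with the state, and copies data to device.
void State::SetExtraInputs(const std::vector<Input>& extra_inputs) {
  extra_inputs_.Update(extra_inputs);
}

This ordering also explains why Generator::SetInputs calls state_->SetExtraInputs before AppendTokens: the extra inputs must be registered with the state before its first Run().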
1 change: 1 addition & 0 deletions src/models/extra_inputs.h
@@ -19,6 +19,7 @@ struct PresetExtraInputs {
 struct ExtraInputs {
   ExtraInputs(State& state);
   void Add();
+  void Update(const std::vector<Input>& extra_inputs);
 
  private:
   State& state_;
1 change: 0 additions & 1 deletion src/models/gpt.cpp
@@ -21,7 +21,6 @@ Gpt_State::Gpt_State(const Gpt_Model& model, DeviceSpan<int32_t> sequence_length
   position_inputs_.Add();
   logits_.Add();
   kv_cache_.Add();
-  extra_inputs_.Add();
 }
 
 DeviceSpan<float> Gpt_State::Run(int total_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
2 changes: 0 additions & 2 deletions src/models/gpt.h
@@ -4,7 +4,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -31,6 +30,5 @@ struct Gpt_State : State {
   Logits logits_{*this};
   CombinedKeyValueCache kv_cache_{*this};
   DefaultPositionInputs position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 }  // namespace Generators
15 changes: 11 additions & 4 deletions src/models/image_features.cpp
@@ -8,7 +8,7 @@ namespace Generators {
 
 ImageFeatures::ImageFeatures(State& state, ImageFeatures::Mode mode, const std::string& name, int64_t num_image_tokens)
     : state_{state},
-      shape_{num_image_tokens, model_.config_->model.decoder.hidden_size},
+      shape_{0, model_.config_->model.decoder.hidden_size},
       type_{mode == ImageFeatures::Mode::Input
                 ? model_.session_info_->GetInputDataType(name)
                 : model_.session_info_->GetOutputDataType(name)},
@@ -45,12 +45,19 @@ void ImageFeatures::Add() {
   }
 }
 
-void ImageFeatures::Update(bool is_prompt) {
+void ImageFeatures::Update(bool is_prompt, int num_image_tokens) {
   // Initialize empty image_features tensor for after-prompt input scenarios
   // num_image_tokens will be 0 when no image is provided
-  if (!is_prompt && shape_[0] > 0) {  // if num_image_tokens > 0
-    shape_[0] = 0;
+
+  // TODO(aciddelgado): is this correct or are there more scenarios
+  if (is_prompt) {
+    shape_[0] = num_image_tokens;
+    image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
+    if (mode_ == ImageFeatures::Mode::Output) state_.outputs_[index_] = image_features_.get();
+    else if (mode_ == ImageFeatures::Mode::Input) state_.inputs_[index_] = image_features_.get();
+  } else if (!is_prompt && shape_[0] > 0) {  // if num_image_tokens > 0
+    shape_[0] = 0;
     image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
     state_.inputs_[index_] = image_features_.get();
   }
 }
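The reworked Update() turns the image-features shape into a per-step decision instead of a constructor-time constant. An illustrative trace (the token count is invented for the example):

// Illustrative only: how shape_[0] evolves across generation steps.
image_features.Update(/*is_prompt=*/true, /*num_image_tokens=*/757);  // prompt step: shape_ = {757, hidden_size}
image_features.Update(/*is_prompt=*/false, /*num_image_tokens=*/0);   // first decode step: shape_ reset to {0, hidden_size}
image_features.Update(/*is_prompt=*/false, /*num_image_tokens=*/0);   // later steps: shape_[0] is already 0, so the branch is a no-op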