Move SetInputs to Generator #1205

Draft · wants to merge 2 commits into main
2 changes: 1 addition & 1 deletion examples/python/phi3v.py
@@ -91,10 +91,10 @@ def run(args: argparse.Namespace):
 
     print("Generating response...")
     params = og.GeneratorParams(model)
-    params.set_inputs(inputs)
    params.set_search_options(max_length=7680)
 
     generator = og.Generator(model, params)
+    generator.set_inputs(inputs)
     start_time = time.time()
 
     while not generator.is_done():
55 changes: 21 additions & 34 deletions src/generators.cpp
@@ -229,23 +229,6 @@ void GeneratorParams::TryGraphCapture(int max_bs) {
   }
 }
 
-void GeneratorParams::SetInputs(const NamedTensors& named_tensors) {
-  if (config.model.type == "gpt2" || config.model.type == "llama" || config.model.type == "gemma" || config.model.type == "gemma2" || config.model.type == "mistral" || config.model.type == "phi" || config.model.type == "phi3" || config.model.type == "phi3small" || config.model.type == "phimoe" || config.model.type == "qwen2" || config.model.type == "decoder-pipeline")
-    throw std::runtime_error("Please use generator.AppendTokens for " + config.model.type + ". SetInputs is not supported for this model type.");
-
-  for (const auto& [name, tensor] : named_tensors) {
-    if (name == Config::Defaults::InputIdsName) {
-      aux_input_ids = cpu_span<int32_t>(tensor->ort_tensor_->GetTensorMutableData<int32_t>(),
-                                        tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetElementCount());
-    } else {
-      // If the nominal name is found in the map, use the graph name.
-      // Else, use the nominal name as the graph name.
-      [[maybe_unused]] const auto [graph_name, found] = config.GetGraphName(name);
-      extra_inputs.push_back({graph_name, tensor});
-    }
-  }
-}
-
 std::unique_ptr<Generator> CreateGenerator(const Model& model, const GeneratorParams& params) {
   return std::make_unique<Generator>(model, params);
 }
@@ -268,11 +251,6 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_
 
   search_ = CreateSearch(params);
   state_ = model.CreateState(search_->GetSequenceLengths(), params);  // Search sequence lengths set when creating state
-
-  // Temporary solution for multimodal and whisper models
-  if (!params.aux_input_ids.empty() && params.aux_input_ids.data() != nullptr) {
-    AuxAppendTokens(params.aux_input_ids);
-  }
 }
 
 DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t> input_ids) {
@@ -291,25 +269,34 @@ DeviceSpan<int32_t> Generator::AllocateInputIdsOnDevice(cpu_span<const int32_t>
   return input_ids_device;
 }
 
-// TODO(aciddelgado): Remove this function once SetInputs is moved to generator
-void Generator::AuxAppendTokens(cpu_span<const int32_t> input_ids) {
+void Generator::SetInputs(const NamedTensors& named_tensors) {
   ThrowErrorIfSessionTerminated(state_->session_terminated_);
-  if (input_ids.size() == 0)
-    throw std::runtime_error("input_ids is empty");
-  if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1)
-    throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. To call AppendTokens again, use RewindToLength(0)");
+  if (model_->config_->model.type == "gpt2" || model_->config_->model.type == "llama" || model_->config_->model.type == "gemma" || model_->config_->model.type == "gemma2" || model_->config_->model.type == "mistral" || model_->config_->model.type == "phi" || model_->config_->model.type == "phi3" || model_->config_->model.type == "phi3small" || model_->config_->model.type == "phimoe" || model_->config_->model.type == "qwen2" || model_->config_->model.type == "decoder-pipeline")
+    throw std::runtime_error("Please use generator.AppendTokens for " + model_->config_->model.type + ". SetInputs is not supported for this model type.");
 
-  auto input_ids_device = AllocateInputIdsOnDevice(input_ids);
-  search_->AppendTokens(input_ids_device);
-  computed_logits_ = false;
-  ComputeLogits(input_ids_device);
+  cpu_span<int32_t> aux_input_ids;
+  std::vector<Input> extra_inputs;
+  for (const auto& [name, tensor] : named_tensors) {
+    if (name == Config::Defaults::InputIdsName) {
+      aux_input_ids = cpu_span<int32_t>(tensor->ort_tensor_->GetTensorMutableData<int32_t>(),
+                                        tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetElementCount());
+    } else {
+      // If the nominal name is found in the map, use the graph name.
+      // Else, use the nominal name as the graph name.
+      [[maybe_unused]] const auto [graph_name, found] = model_->config_->GetGraphName(name);
+      extra_inputs.push_back({graph_name, tensor});
+    }
+  }
+  state_->SetExtraInputs(extra_inputs);
+  if (aux_input_ids.size() != 0)
+    AppendTokens(aux_input_ids, false);
 }
 
-void Generator::AppendTokens(cpu_span<const int32_t> input_ids) {
+void Generator::AppendTokens(cpu_span<const int32_t> input_ids, bool api_call) {
   ThrowErrorIfSessionTerminated(state_->session_terminated_);
   if (input_ids.size() == 0)
     throw std::runtime_error("input_ids is empty");
-  if (model_->config_->model.type == "whisper" || model_->config_->model.type == "phi3v")
+  if (api_call && (model_->config_->model.type == "whisper" || model_->config_->model.type == "phi3v"))
     throw std::runtime_error("Please use params.SetInputs for " + model_->config_->model.type + ". AppendTokens is not supported for this model type.");
   if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1)
     throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. To call AppendTokens again, use RewindToLength(0)");
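Note (not part of the diff): the control flow above can be summarized as a short caller-side sketch. CreateGenerator appears in the hunk above; CreateGeneratorParams is assumed to be the matching params factory, and named_tensors is assumed to come from a multimodal processor, as in the phi3v example earlier.

// Sketch only, assuming the surrounding API: driving the relocated SetInputs.
auto params = Generators::CreateGeneratorParams(*model);
auto generator = Generators::CreateGenerator(*model, *params);

// Extra inputs now go to the Generator, not the params. Internally, tensors
// other than "input_ids" are forwarded to state_->SetExtraInputs(), and an
// "input_ids" tensor is routed through AppendTokens(aux_input_ids, false),
// where api_call=false bypasses the public-API model-type check.
generator->SetInputs(named_tensors);

while (!generator->IsDone()) {
  generator->GenerateNextToken();
}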
22 changes: 8 additions & 14 deletions src/generators.h
@@ -63,6 +63,11 @@ enum struct DeviceType {
   QNN,
 };
 
+struct Input {
+  std::string name;
+  std::shared_ptr<Tensor> tensor;
+};
+
 std::string to_string(DeviceType device_type);
 DeviceInterface* GetDeviceInterface(DeviceType type);
 
@@ -81,8 +86,6 @@ struct GeneratorParams : std::enable_shared_from_this<GeneratorParams>, LeakChec
   DeviceType device_type{DeviceType::CPU};
   cudaStream_t cuda_stream{};
 
-  cpu_span<int32_t> aux_input_ids{};  // Intermediate solution to be used with SetInputs function for multimodal and whisper models
-
   struct Whisper {
     std::shared_ptr<Tensor> input_features;  // float32 [batch_size, number_of_mels, number_of_frames]
     std::shared_ptr<Tensor> alignment_heads;  // int32 [num_alignment_heads, 2]
@@ -92,18 +95,8 @@
 
   std::shared_ptr<GeneratorParams> external_owner_;  // Set to 'this' when created by the C API to preserve lifetime
 
-  struct Input {
-    std::string name;
-    std::shared_ptr<Tensor> tensor;
-  };
-
-  // A list of extra model inputs that will be matched at runtime based on name
-  std::vector<Input> extra_inputs;
-
   void TryGraphCapture(int max_bs);
 
-  void SetInputs(const NamedTensors& inputs);
-
  private:
   bool is_cuda_graph_enabled_{};
 };
@@ -112,7 +105,8 @@ struct Generator : LeakChecked<Generator> {
   Generator(const Model& model, const GeneratorParams& params);
 
   bool IsDone() const;
-  void AppendTokens(cpu_span<const int32_t> input_ids);
+  void SetInputs(const NamedTensors& inputs);
+  void AppendTokens(cpu_span<const int32_t> input_ids, bool api_call=true);
   void GenerateNextToken();
   void RewindToLength(size_t new_length);  // Rewind state to new_length
   DeviceSpan<float> GetLogits();
@@ -129,12 +123,12 @@
 
  private:
   DeviceSpan<int32_t> AllocateInputIdsOnDevice(cpu_span<const int32_t> input_ids);
-  void AuxAppendTokens(cpu_span<const int32_t> input_ids);
   void ComputeLogits(DeviceSpan<int32_t> next_tokens);
   enum Action { standard,   // Default, set in any other case
                 generated,  // Set after GenerateNextToken
                 rewound };  // Set after RewindToLength
   Action last_action_{standard};
+
 };
 
 struct OrtGlobals {
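The api_call default argument added above carries the routing logic. A sketch of the two call paths (illustrative, with token_ids assumed to be a cpu_span<const int32_t>):

// Public path: api_call defaults to true, so whisper/phi3v callers are
// rejected with a runtime_error and pointed at SetInputs instead.
generator.AppendTokens(token_ids);

// Internal path used by Generator::SetInputs: api_call=false skips the
// model-type check so multimodal input_ids can still reach the search.
generator.AppendTokens(token_ids, /*api_call=*/false);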
8 changes: 5 additions & 3 deletions src/models/captured_graph_pool.cpp
@@ -23,10 +23,12 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
     return nullptr;
   }
 
-  // Multiple generators can reserve graphs in parallel, so we need to make it thread saf
+  // Multiple generators can reserve graphs in parallel, so we need to make it thread safe
   std::unique_lock lock(captured_graph_mutex_);
 
-  auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length, params.search.num_beams, params.extra_inputs);
+  // TODO(aciddelgado): no more params.extra_inputs, how to continue?
+  std::vector<Input> temp_empty_extra_inputs;
+  auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length, params.search.num_beams, temp_empty_extra_inputs);
   auto& captured_graphs = captured_graphs_map_[*key];
 
   // If no graphs are available, create a graph with a new ID
@@ -90,7 +92,7 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
   }
 
   // Create the extra inputs
-  for (const auto& extra_input : params.extra_inputs) {
+  for (const auto& extra_input : temp_empty_extra_inputs) {
     auto first_dim = extra_input.tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetShape()[0];
     new_captured_graph->sb_extra_inputs_[extra_input.name] = std::make_unique<StaticBuffer>(allocator_device_, first_dim);
   }
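The TODO above is the open question in this draft: the captured-graph key previously hashed params.extra_inputs, which no longer exists, so the key is now built from an always-empty list and sb_extra_inputs_ is never populated. One hypothetical direction, not implemented anywhere in this PR, would be to defer key construction until the generator has received its inputs:

// Hypothetical sketch only: re-key the captured graph after Generator::SetInputs
// has delivered extra inputs to the State. GetExtraInputs() is an assumed
// accessor that does not exist in this diff.
const std::vector<Generators::Input>& extra_inputs = state.GetExtraInputs();
auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length,
                                              params.search.num_beams, extra_inputs);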
4 changes: 3 additions & 1 deletion src/models/captured_graph_pool.h
@@ -5,6 +5,7 @@
 #include <mutex>
 #include <unordered_map>
 #include "static_buffer.h"
+// #include "model.h"
 #include "../generators.h"
 
 // From boost http://www.boost.org/doc/libs/1_35_0/doc/html/hash/combine.html
@@ -29,7 +30,7 @@ struct InputKey {
 };
 
 struct CapturedGraphKey {
-  CapturedGraphKey(int max_batch_size, int max_length, int num_beams, const std::vector<Generators::GeneratorParams::Input>& extra_inputs)
+  CapturedGraphKey(int max_batch_size, int max_length, int num_beams, const std::vector<Generators::Input>& extra_inputs)
       : max_batch_size_(max_batch_size),
         max_length_(max_length),
         num_beams_(num_beams) {
@@ -100,6 +101,7 @@ struct CapturedGraphInfo;
 struct Config;
 struct SessionInfo;
 struct Model;
+struct State;
 
 struct CapturedGraphInfoRecycler {
   void operator()(CapturedGraphInfo* captured_graph_info);
1 change: 0 additions & 1 deletion src/models/decoder_only.cpp
@@ -22,7 +22,6 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, DeviceSpan<
   position_inputs_.Add();
   logits_.Add();
   kv_cache_.Add();
-  extra_inputs_.Add();
 }
 
 DeviceSpan<float> DecoderOnly_State::Run(int total_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
2 changes: 0 additions & 2 deletions src/models/decoder_only.h
@@ -4,7 +4,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -33,7 +32,6 @@ struct DecoderOnly_State : State {
   Logits logits_{*this};
   DefaultKeyValueCache kv_cache_{*this};
   DefaultPositionInputs position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 
 }  // namespace Generators
1 change: 0 additions & 1 deletion src/models/decoder_only_pipeline.cpp
@@ -100,7 +100,6 @@ DecoderOnlyPipelineState::DecoderOnlyPipelineState(const DecoderOnlyPipelineMode
   if (key_value_cache_) {
     key_value_cache_->Add();
   }
-  extra_inputs_.Add();
 
   for ([[maybe_unused]] const auto& pipeline_model : model_.config_->model.decoder.pipeline) {
     pipeline_states_.emplace_back(std::make_unique<IntermediatePipelineState>(model_, params, pipeline_states_.size()));
2 changes: 0 additions & 2 deletions src/models/decoder_only_pipeline.h
@@ -8,7 +8,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -75,7 +74,6 @@ struct DecoderOnlyPipelineState : State {
   Logits logits_{*this};
   std::unique_ptr<KeyValueCache> key_value_cache_;
   std::unique_ptr<PositionInputs> position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 
 }  // namespace Generators
41 changes: 23 additions & 18 deletions src/models/extra_inputs.cpp
@@ -46,37 +46,42 @@ void PresetExtraInputs::Add() {
 }
 
 ExtraInputs::ExtraInputs(State& state)
-    : state_{state} {
-  extra_inputs_.reserve(state_.params_->extra_inputs.size());
+    : state_{state} {}
+
+#pragma warning(push)
+#pragma warning(disable : 4065)  // switch statement contains 'default' but no 'case' labels
+#pragma warning(disable : 4189)  // local variable is initialized but not referenced
+#pragma warning(disable : 4702)  // unreachable code
+
+void ExtraInputs::Add() {
+  // Nothing to do here...
+}
+
+void ExtraInputs::Update(const std::vector<Input>& extra_inputs) {
+  // Reserve extra inputs when received at runtime (from set_inputs)
+  extra_inputs_.reserve(extra_inputs.size());
 
   if (state_.GetCapturedGraphInfo()) {
-    owned_extra_inputs_.reserve(state_.params_->extra_inputs.size());
+    owned_extra_inputs_.reserve(extra_inputs.size());
 
-    for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
-      auto type_and_shape_info = state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorTypeAndShapeInfo();
-      const auto& input_name = state_.params_->extra_inputs[i].name;
+    for (int i = 0; i < extra_inputs.size(); ++i) {
+      auto type_and_shape_info = extra_inputs[i].tensor->ort_tensor_->GetTensorTypeAndShapeInfo();
+      const auto& input_name = extra_inputs[i].name;
 
       sb_extra_inputs_.emplace(input_name, state_.GetCapturedGraphInfo()->sb_extra_inputs_.at(input_name).get());
       owned_extra_inputs_.push_back(sb_extra_inputs_.at(input_name)->CreateTensorOnStaticBuffer(type_and_shape_info->GetShape(), type_and_shape_info->GetElementType()));
       extra_inputs_.push_back(owned_extra_inputs_.back().get());
     }
   } else {
     // We don't use graph capture, so simply use the existing pointers
-    for (auto& extra_input : state_.params_->extra_inputs) {
+    for (auto& extra_input : extra_inputs) {
       extra_inputs_.push_back(extra_input.tensor->ort_tensor_.get());
     }
   }
-}
 
-#pragma warning(push)
-#pragma warning(disable : 4065)  // switch statement contains 'default' but no 'case' labels
-#pragma warning(disable : 4189)  // local variable is initialized but not referenced
-#pragma warning(disable : 4702)  // unreachable code
-
-void ExtraInputs::Add() {
   // Add extra user inputs
-  for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
-    state_.input_names_.push_back(state_.params_->extra_inputs[i].name.c_str());
+  for (int i = 0; i < extra_inputs.size(); ++i) {
+    state_.input_names_.push_back(extra_inputs[i].name.c_str());
     state_.inputs_.push_back(extra_inputs_[i]);
   }
 
@@ -93,7 +98,7 @@ void ExtraInputs::Add() {
       ComPtr<ID3D12Resource> target_resource;
       Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, extra_inputs_[i]->GetTensorMutableRawData(), &target_resource));
 
-      auto source = std::span(state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorData<const uint8_t>(), copy_size_in_bytes);
+      auto source = std::span(extra_inputs[i].tensor->ort_tensor_->GetTensorData<const uint8_t>(), copy_size_in_bytes);
 
       model_.GetDmlUploadHeap()->BeginUploadToGpu(
           target_resource.Get(),
@@ -107,7 +112,7 @@ void ExtraInputs::Add() {
     case DeviceType::CUDA: {
       cudaMemcpyAsync(
          extra_inputs_[i]->GetTensorMutableRawData(),
-          state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(),
+          extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(),
          copy_size_in_bytes,
          cudaMemcpyHostToDevice,
          model_.cuda_stream_);
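The state_->SetExtraInputs(extra_inputs) call in generators.cpp ends up in Update() above, but the bridge on State itself is not shown in the visible part of this diff. A minimal plausible form, stated as an assumption rather than the PR's actual definition:

// Assumed bridge (definition not shown in this diff): State forwards the
// runtime extra inputs to its ExtraInputs member, whose Update() reserves
// buffers, registers input names with the state, and copies data to device.
void State::SetExtraInputs(const std::vector<Input>& extra_inputs) {
  extra_inputs_.Update(extra_inputs);
}

This ordering also explains why Generator::SetInputs calls state_->SetExtraInputs before AppendTokens: the extra inputs must be registered with the state before its first Run().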
1 change: 1 addition & 0 deletions src/models/extra_inputs.h
@@ -19,6 +19,7 @@ struct PresetExtraInputs {
 struct ExtraInputs {
   ExtraInputs(State& state);
   void Add();
+  void Update(const std::vector<Input>& extra_inputs);
 
  private:
   State& state_;
1 change: 0 additions & 1 deletion src/models/gpt.cpp
@@ -21,7 +21,6 @@ Gpt_State::Gpt_State(const Gpt_Model& model, DeviceSpan<int32_t> sequence_length
   position_inputs_.Add();
   logits_.Add();
   kv_cache_.Add();
-  extra_inputs_.Add();
 }
 
 DeviceSpan<float> Gpt_State::Run(int total_length, DeviceSpan<int32_t>& next_tokens, DeviceSpan<int32_t> next_indices) {
2 changes: 0 additions & 2 deletions src/models/gpt.h
@@ -4,7 +4,6 @@
 #include "logits.h"
 #include "kv_cache.h"
 #include "position_inputs.h"
-#include "extra_inputs.h"
 
 namespace Generators {
 
@@ -31,6 +30,5 @@ struct Gpt_State : State {
   Logits logits_{*this};
   CombinedKeyValueCache kv_cache_{*this};
   DefaultPositionInputs position_inputs_;
-  ExtraInputs extra_inputs_{*this};
 };
 }  // namespace Generators
15 changes: 11 additions & 4 deletions src/models/image_features.cpp
@@ -8,7 +8,7 @@ namespace Generators {
 
 ImageFeatures::ImageFeatures(State& state, ImageFeatures::Mode mode, const std::string& name, int64_t num_image_tokens)
     : state_{state},
-      shape_{num_image_tokens, model_.config_->model.decoder.hidden_size},
+      shape_{0, model_.config_->model.decoder.hidden_size},
       type_{mode == ImageFeatures::Mode::Input
                 ? model_.session_info_->GetInputDataType(name)
                 : model_.session_info_->GetOutputDataType(name)},
@@ -45,12 +45,19 @@ void ImageFeatures::Add() {
   }
 }
 
-void ImageFeatures::Update(bool is_prompt) {
+void ImageFeatures::Update(bool is_prompt, int num_image_tokens) {
   // Initialize empty image_features tensor for after-prompt input scenarios
   // num_image_tokens will be 0 when no image is provided
-  if (!is_prompt && shape_[0] > 0) {  // if num_image_tokens > 0
-    shape_[0] = 0;
+
+  // TODO(aciddelgado): is this correct or are there more scenarios
+  if (is_prompt) {
+    shape_[0] = num_image_tokens;
+    image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
+    if (mode_ == ImageFeatures::Mode::Output) state_.outputs_[index_] = image_features_.get();
+    else if (mode_ == ImageFeatures::Mode::Input) state_.inputs_[index_] = image_features_.get();
+  } else if (!is_prompt && shape_[0] > 0) {  // if num_image_tokens > 0
+    shape_[0] = 0;
     image_features_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_);
     state_.inputs_[index_] = image_features_.get();
   }
 }
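The reworked Update() turns the image-features shape into a per-step decision instead of a constructor-time constant. An illustrative trace (the token count is invented for the example):

// Illustrative only: how shape_[0] evolves across generation steps.
image_features.Update(/*is_prompt=*/true, /*num_image_tokens=*/757);  // prompt step: shape_ = {757, hidden_size}
image_features.Update(/*is_prompt=*/false, /*num_image_tokens=*/0);   // first decode step: shape_ reset to {0, hidden_size}
image_features.Update(/*is_prompt=*/false, /*num_image_tokens=*/0);   // later steps: shape_[0] is already 0, so the branch is a no-op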