Skip to content

Commit

Permalink
[GPU] allow to read activations scale factor from rt_info for non-LLMs (
Browse files Browse the repository at this point in the history
openvinotoolkit#28449)

### Details:
- allows to read `ACTIVATIONS_SCALE_FACTOR` from rt_info for non-LLMs,
such as FLUX.1 and SDXL.
 - assumes that LLMs have `ReadValue` layers and non-LLMs do not.
  • Loading branch information
e-ddykim authored Jan 16, 2025
1 parent e13b4c8 commit 4faa82d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ class ExecutionConfig {

// Note that RT info property value has lower priority than values set by user via core.set_property or passed to compile_model call
// So this method should be called after setting all user properties, but before apply_user_properties() call.
void apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info);
void apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info, const bool is_llm);

std::string to_string() const;

Expand Down
34 changes: 32 additions & 2 deletions src/plugins/intel_gpu/src/plugin/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "openvino/core/deprecated.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/pass/manager.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "openvino/pass/visualize_tree.hpp"
#include "openvino/runtime/device_id_parser.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
Expand Down Expand Up @@ -62,6 +66,32 @@ namespace intel_gpu {
#include "intel_gpu/plugin/primitives_list.hpp"
#undef REGISTER_FACTORY

// Heuristic LLM detector: a model is treated as an LLM if it contains a
// KV-cache pattern, i.e. a ReadValue (past state) that flows — possibly via
// Convert and/or a beam-reordering Gather — into a Concat whose result is
// written back through an Assign (present state).
// NOTE(review): this assumes all LLMs are exported with stateful KV-cache
// (ReadValue/Assign) and non-LLMs (e.g. FLUX.1, SDXL) are not — confirm this
// holds for every supported model export path.
const auto is_llm = [](const std::shared_ptr<const ov::Model>& model) -> bool {
using namespace ov::pass::pattern;

// past KV state, optionally converted to another precision
auto past = wrap_type<ov::op::v6::ReadValue>();
auto convert_past = wrap_type<ov::op::v0::Convert>({past});
auto gather_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{past, convert_past});
// beam-search reordering: Gather(past, beam_idx, axis)
auto beam_idx = wrap_type<ov::op::v0::Parameter>();
auto gather_past = wrap_type<ov::op::v8::Gather>({gather_input, beam_idx, wrap_type<ov::op::v0::Constant>()});
auto gather_convert = wrap_type<ov::op::v0::Convert>({gather_past});
// new tokens are concatenated onto the (possibly gathered/converted) past state
auto concat_past_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{past, convert_past, gather_past, gather_convert});
auto concat = wrap_type<ov::op::v0::Concat>({concat_past_input, any_input()});
auto convert_present = wrap_type<ov::op::v0::Convert>({concat});
auto present_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{concat, convert_present});
// present KV state written back into the model state
auto present = wrap_type<ov::op::v6::Assign>({present_input});

auto kvcache_matcher = std::make_shared<ov::pass::pattern::Matcher>(present, "KVCacheMatcher");

// A single KV-cache pattern match anywhere in the graph is enough.
for (auto& op : model->get_ordered_ops()) {
if (kvcache_matcher->match(op)) {
return true;
}
}

return false;
};

void Plugin::register_primitives() const {
#define REGISTER_FACTORY(op_version, op_name) FACTORY_CALL(op_version, op_name)
#include "intel_gpu/plugin/primitives_list.hpp"
Expand Down Expand Up @@ -190,7 +220,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
ExecutionConfig config = m_configs_map.at(device_id);
config.set_user_property(orig_config);
if (model->has_rt_info("runtime_options"))
config.apply_rt_info(context->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_rt_info(context->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"), is_llm(model));
config.apply_user_properties(context->get_engine().get_device_info());

set_cache_info(model, config);
Expand Down Expand Up @@ -281,7 +311,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
ExecutionConfig config = m_configs_map.at(device_id);
config.set_user_property(orig_config);
if (model->has_rt_info("runtime_options"))
config.apply_rt_info(ctx->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"));
config.apply_rt_info(ctx->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"), is_llm(model));
config.apply_user_properties(ctx->get_engine().get_device_info());

ProgramBuilder prog(ctx->get_engine(), config);
Expand Down
5 changes: 3 additions & 2 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -272,11 +272,12 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
user_properties.clear();
}

// Applies model-level "runtime_options" (rt_info) hints to this config.
// rt_info values have lower priority than user-set properties, so this must be
// called after all user properties are set but before apply_user_properties().
// \param info     device capabilities of the target GPU
// \param rt_info  the model's "runtime_options" map
// \param is_llm   true when the model contains a KV-cache (stateful LLM) pattern
void ExecutionConfig::apply_rt_info(const cldnn::device_info& info, const ov::RTMap& rt_info, const bool is_llm) {
    // kv_cache_precision is only taken from rt_info on devices without immad support.
    if (!info.supports_immad) {
        apply_rt_info_property(ov::hint::kv_cache_precision, rt_info);
    }
    // activations_scale_factor is taken from rt_info on non-immad devices, and on
    // immad devices only for non-LLM models (e.g. FLUX.1, SDXL).
    if (!info.supports_immad || !is_llm) {
        apply_rt_info_property(ov::hint::activations_scale_factor, rt_info);
    }
    apply_rt_info_property(ov::hint::dynamic_quantization_group_size, rt_info);
}

Expand Down

0 comments on commit 4faa82d

Please sign in to comment.