// casual_lm.cpp
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <map>
#include <numeric>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include <openvino/openvino.hpp>
#include <openvino_extensions/strings.hpp>

namespace {
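// Runs the tokenizer model on the prompt and returns the "input_ids" and "attention_mask" tensors.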
std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest&& tokenizer, std::string_view prompt) {
    constexpr size_t BATCH_SIZE = 1;
    ov::Tensor destination = tokenizer.get_input_tensor();
    openvino_extensions::pack_strings(std::array<std::string_view, BATCH_SIZE>{prompt}, destination);
    tokenizer.infer();
    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
}
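
// Detokenizes one token id and prints the resulting text fragment immediately.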
void print_token(ov::InferRequest& detokenizer, int64_t out_token) {
    constexpr size_t BATCH_SIZE = 1;
    ov::Tensor inp = detokenizer.get_input_tensor();
    inp.set_shape({BATCH_SIZE, 1});
    inp.data<int64_t>()[0] = out_token;
    detokenizer.infer();
    std::cout << openvino_extensions::unpack_strings(detokenizer.get_output_tensor()).front() << std::flush;
}
}  // namespace

int main(int argc, char* argv[]) try {
    if (argc != 5) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <openvino_model.xml> <tokenizer.xml> <detokenizer.xml> '<prompt>'");
    }
    ov::Core core;
    core.add_extension(USER_OV_EXTENSIONS_PATH);  // USER_OV_EXTENSIONS_PATH is defined in CMakeLists.txt
    auto [input_ids, attention_mask] = tokenize(core.compile_model(argv[2], "CPU").create_infer_request(), argv[4]);
    ov::InferRequest detokenizer = core.compile_model(argv[3], "CPU").create_infer_request();
    std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
    constexpr size_t BATCH_SIZE = 1;
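    // Pin the batch dimension of input_ids, attention_mask and position_ids; the sequence length stays dynamic.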
    std::map<size_t, ov::PartialShape> shapes = {
        {0, ov::PartialShape{BATCH_SIZE, -1}},
        {1, ov::PartialShape{BATCH_SIZE, -1}},
        {2, ov::PartialShape{BATCH_SIZE, -1}}
    };
    std::vector<ov::Output<ov::Node>> inputs = model->inputs();
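    // The remaining inputs hold the KV cache (past_key_values); only their batch dimension is pinned here.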
    for (size_t idx = 3; idx < inputs.size(); ++idx) {
        ov::PartialShape shape = inputs.at(idx).get_partial_shape();
        shape[0] = BATCH_SIZE;
        shapes.emplace(idx, shape);
    }
    model->reshape(shapes);
    ov::InferRequest ireq = core.compile_model(model, "CPU").create_infer_request();
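    // Shrink the dynamic inputs to their minimum shapes so the first inference starts with an empty KV cache.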
    for (size_t idx = 2; idx < inputs.size(); ++idx) {
        ireq.get_input_tensor(idx).set_shape(inputs.at(idx).get_partial_shape().get_min_shape());
    }
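    // Copy the tokenized prompt into input_ids, set attention_mask to all ones and position_ids to 0..N-1.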
ireq.get_tensor("input_ids").set_shape(input_ids.get_shape()); // TODO: replace with ireq.set_tensor("input_ids", input_ids); after it's fixed
std::copy_n(input_ids.data<const int64_t>(), input_ids.get_size(), ireq.get_tensor("input_ids").data<int64_t>());
ireq.get_tensor("attention_mask").set_shape(attention_mask.get_shape());
std::fill_n(ireq.get_tensor("attention_mask").data<int64_t>(), attention_mask.get_size(), 1);
ireq.get_tensor("position_ids").set_shape(input_ids.get_shape());
std::iota(ireq.get_tensor("position_ids").data<int64_t>(), ireq.get_tensor("position_ids").data<int64_t>() + ireq.get_tensor("position_ids").get_size(), 0);
ireq.infer();
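    // Greedy sampling: take the argmax over the vocabulary at the last prompt position.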
    size_t vocab_size = ireq.get_tensor("logits").get_shape().back();
    float* logits = ireq.get_tensor("logits").data<float>() + (input_ids.get_size() - 1) * vocab_size;
    int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
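    // From here on, a single new token is fed per generation step.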
ireq.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
ireq.get_tensor("position_ids").set_shape({BATCH_SIZE, 1});
constexpr int64_t SPECIAL_EOS_TOKEN = 2; // There's no way to extract the value from the detokenizer for now
    while (out_token != SPECIAL_EOS_TOKEN) {
        ireq.get_tensor("input_ids").data<int64_t>()[0] = out_token;
        ireq.get_tensor("attention_mask").set_shape({BATCH_SIZE, ireq.get_tensor("attention_mask").get_shape()[1] + 1});
        std::fill_n(ireq.get_tensor("attention_mask").data<int64_t>(), ireq.get_tensor("attention_mask").get_size(), 1);
        ireq.get_tensor("position_ids").data<int64_t>()[0] = ireq.get_tensor("attention_mask").get_size() - 2;
        for (size_t idx = 3; idx < inputs.size(); ++idx) {
            ireq.set_input_tensor(idx, ireq.get_output_tensor(idx - 2));
        }
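        // Print the previous token while the next inference runs asynchronously.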
        ireq.start_async();
        print_token(detokenizer, out_token);
        ireq.wait();
        logits = ireq.get_tensor("logits").data<float>();
        out_token = std::max_element(logits, logits + vocab_size) - logits;
    }
    std::cout << '\n';
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
} catch (...) {
std::cerr << "Non-exception object thrown\n";
return 1;
}