feat: add MiniCPM 2B demo #132

Merged (2 commits, Aug 29, 2024)
14 changes: 14 additions & 0 deletions CMakeLists.txt
@@ -585,6 +585,20 @@ else ()
target_link_libraries(demo_phi3 PUBLIC MLLM_CPU -fopenmp)
endif ()


add_executable(demo_minicpm ${PROJECT_SOURCE_DIR}/examples/demo_minicpm.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
src/tokenizers/Tokenizer.cpp
src/tokenizers/BPE/Bpe.cpp
src/processor/PreProcess.cpp
)
if (ARM AND NOT APK)
target_compile_options(demo_minicpm PRIVATE -fopenmp)
target_link_libraries(demo_minicpm PUBLIC MLLM_CPU -fopenmp -static-openmp)
else ()
target_compile_options(demo_minicpm PRIVATE -fopenmp)
target_link_libraries(demo_minicpm PUBLIC MLLM_CPU -fopenmp)
endif ()

# add_executable(demo_deepseek ${PROJECT_SOURCE_DIR}/examples/demo_deepseek.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
# src/tokenizers/Tokenizer.cpp
# src/tokenizers/BPE/Bpe.cpp
1 change: 1 addition & 0 deletions README.md
@@ -85,6 +85,7 @@ Wait.. why on-device multimodal LLM? - It's a key building block for [intelligen
| [StableLM 1.6B](https://github.com/Stability-AI/StableLM) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/stablelm-2-1.6b-chat-mllm/tree/main) | |
| [OPT 1.3B](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/opt-1.3b-mllm/tree/main) | |
| [Phi-3-mini 3.8B](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/phi-3-mini-instruct-mllm/tree/main) | |
| [MiniCPM 2B](https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | [✔️](https://huggingface.co/mllmTeam/minicpm-2b-dpo-mllm/tree/main) | |

## Quick Start

66 changes: 66 additions & 0 deletions examples/demo_minicpm.cpp
@@ -0,0 +1,66 @@
#include "cmdline.h"
#include "models/minicpm/configuration_minicpm.hpp"
#include "models/minicpm/modeling_minicpm.hpp"
#include "models/minicpm/tokenization_minicpm.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/minicpm_vocab.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/minicpm-2b-dpo-q4_k.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
cmdParser.parse_check(argc, argv);

string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = MiniCPMTokenizer(vocab_path, "../vocab/minicpm_merges.txt");
MiniCPMConfig config(tokens_limit, "2B");
auto model = MiniCPMForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

string user_prompt_start = tokenizer.token_user_o;
string user_prompt_end = tokenizer.token_user_c;

auto processOutput = [&](unsigned int id, std::string &text) -> std::pair<bool, std::string> {
text = std::regex_replace(text, std::regex("▁"), " ");
if (text == "<0x0A>") return {true, "\n"};
if (text == "</s>") return {false, ""};
if (id == 2) return {false, ""}; // 2 == eos_token_id in MiniCPMConfig
return {true, text};
};

for (int i = 0; i < in_strs.size(); ++i) {
auto in_str_origin = in_strs[i];
auto in_str = user_prompt_start + in_str_origin + user_prompt_end;
auto input_tensor = tokenizer.tokenize(in_str);
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;
for (int step = 0; step < 100; step++) {
auto result = model({input_tensor});
auto outputs = tokenizer.detokenize(result[0]);
auto out_string = outputs.first;
auto out_token = outputs.second;
auto [isOk, print_string] = processOutput(out_token, out_string);
if (isOk) {
std::cout << print_string << std::flush;
} else {
break;
}
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
model.clear_kvcache();
}
}
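The processOutput lambda above normalizes raw SentencePiece pieces for printing: the U+2581 word-boundary marker ("▁") becomes a space, the byte-fallback piece "<0x0A>" becomes a newline, and "</s>" or token id 2 (the EOS id) stops generation. A minimal standalone sketch of that text mapping (piece_to_text is an illustrative name, not part of the demo):

#include <cassert>
#include <regex>
#include <string>

// Mirrors the normalization inside processOutput.
std::string piece_to_text(const std::string &piece) {
    // SentencePiece marks word boundaries with U+2581; render it as a space.
    std::string text = std::regex_replace(piece, std::regex("▁"), " ");
    if (text == "<0x0A>") return "\n"; // byte-fallback encoding of '\n'
    return text;
}

int main() {
    assert(piece_to_text("▁Beijing") == " Beijing");
    assert(piece_to_text("<0x0A>") == "\n");
    return 0;
}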
92 changes: 92 additions & 0 deletions src/models/minicpm/configuration_minicpm.hpp
@@ -0,0 +1,92 @@
#ifndef CONFIG_MINICPM_HPP
#define CONFIG_MINICPM_HPP
#include "models/transformer/configuration_transformer.hpp"
#include <algorithm> // std::transform
#include <cctype>    // ::tolower
#include <stdexcept> // std::runtime_error

using namespace mllm;

class MiniCPMNameConfig : public TransformerNameConfig {
public:
std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;

void init() {
blk_name = "model.layers.";
_attn_base_name = "self_attn.";
_ffn_base_name = "mlp.";
_q_proj_name = "q_proj";
_k_proj_name = "k_proj";
_v_proj_name = "v_proj";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";

}
};

struct MiniCPMConfig {
explicit MiniCPMConfig(int token_limit, string billions = "2B") :
cache_limit(token_limit) {
names_config.init();
string billionsType;
std::transform(billions.begin(), billions.end(), std::back_inserter(billionsType),
::tolower);
if (billionsType == "2b") {
attention_dropout = 0.0;
bos_token_id = 1;
eos_token_id = 2;
hidden_act = "silu";
hidden_size = 2304;
initializer_range = 0.1;
intermediate_size = 5760;
max_position_embeddings = 4096;
model_type = "minicpm";
num_attention_heads = 36;
num_hidden_layers = 40;
num_key_value_heads = 36;
rms_norm_eps = 1e-05;
rope_theta = 10000.0;
vocab_size = 122753;
head_dim = 64;
scale_depth = 1.4;
scale_emb = 12;
dim_model_base = 256;
} else {
throw std::runtime_error("Unsupported model size");
}
}

float attention_dropout = 0.0;
int bos_token_id = 1;
int eos_token_id = 2;
std::string hidden_act = "silu";
int hidden_size = 2304;
float initializer_range = 0.1;
int intermediate_size = 5760;
int max_position_embeddings = 4096;
std::string model_type = "minicpm";
int num_attention_heads = 36;
int num_hidden_layers = 40;
int num_key_value_heads = 36;
double rms_norm_eps = 1e-05;
float rope_theta = 10000.0;
int vocab_size = 122753;
int head_dim = 64; // hidden_size / num_attention_heads = 2304 / 36
float scale_depth = 1.4;
float scale_emb = 12;
float dim_model_base = 256;

int cache_limit;
RoPEType RoPE_type = RoPEType::HFHUBROPE;
MiniCPMNameConfig names_config;
};

#endif // CONFIG_MINICPM_HPP
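The "2B" values above are mutually consistent: num_key_value_heads equals num_attention_heads (plain multi-head attention, no grouped-query sharing), and head_dim is hidden_size divided by the head count. Two compile-time sanity checks, assuming the values stay as listed:

// Sanity checks for the "2B" configuration above.
static_assert(2304 / 36 == 64, "head_dim == hidden_size / num_attention_heads");
static_assert(2304 / 256 == 9, "logit divisor == hidden_size / dim_model_base");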
141 changes: 141 additions & 0 deletions src/models/minicpm/modeling_minicpm.hpp
@@ -0,0 +1,141 @@
#ifndef MODELING_MINICPM_HPP
#define MODELING_MINICPM_HPP

#include "Layer.hpp"
#include "Module.hpp"
#include "configuration_minicpm.hpp"
#include "models/transformer/modeling_transformer.hpp"
#include <any>
#include <cmath>

using namespace mllm;

class MiniCPMMLP final : public Module {
public:
MiniCPMMLP() = default;
MiniCPMMLP(int hidden_size, int intermediate_size, const MiniCPMNameConfig &names, const std::string &base_name) {
gate_proj = Linear(hidden_size, intermediate_size, false, base_name + names._gate_proj_name);
silu = SiLU(base_name + "act");
up_proj = Linear(hidden_size, intermediate_size, false, base_name + names._up_proj_name);
down_proj = Linear(intermediate_size, hidden_size, false, base_name + names._down_proj_name);
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto x = gate_proj(inputs[0]);
x = silu(x);
auto y = up_proj(inputs[0]); // gated MLP: down_proj(silu(gate_proj(x)) * up_proj(x))
x = x * y;
x = down_proj(x);
return {x};
}

private:
Layer gate_proj;
Layer up_proj;
Layer down_proj;

Layer silu;
};

class MiniCPMDecoder final : public Module {
public:
MiniCPMDecoder() = default;
MiniCPMDecoder(const MiniCPMConfig &config, const MiniCPMNameConfig &names, const string &base_name) {
self_atten = MultiHeadAttention(config.hidden_size, config.num_attention_heads, config.num_key_value_heads,
config.hidden_size / config.num_attention_heads, SPLIT_NONE, false, false,
config.RoPE_type, config.rope_theta, config.max_position_embeddings, config.cache_limit,
true, false, names, base_name + names._attn_base_name);
mlp = MiniCPMMLP(config.hidden_size, config.intermediate_size, names, base_name + names._ffn_base_name);
input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name);
scale_depth = config.scale_depth;
num_hidden_layers = config.num_hidden_layers;
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto hidden_states = input_layernorm(inputs[0]);
hidden_states = self_atten({hidden_states, hidden_states, hidden_states})[0];
auto tmp = hidden_states * (scale_depth / std::sqrt(num_hidden_layers)) + inputs[0];
hidden_states = post_attention_layernorm(tmp);
hidden_states = mlp({hidden_states})[0];
hidden_states = hidden_states * (scale_depth / std::sqrt(num_hidden_layers)) + tmp;
return {hidden_states};
}

MultiHeadAttention &get_attention() {
return self_atten;
}

private:
MultiHeadAttention self_atten;
MiniCPMMLP mlp;
Layer input_layernorm;
Layer post_attention_layernorm;
float scale_depth;
int num_hidden_layers;
};

class MiniCPMModel final : public Module {
public:
MiniCPMModel() = default;
MiniCPMModel(const MiniCPMConfig &config, const MiniCPMNameConfig &names, const string &base_name) {
blocks = List<MiniCPMDecoder>(config.num_hidden_layers, config, names, base_name);
norm = RMSNorm(config.hidden_size, config.rms_norm_eps, names.post_norm_name);
}
//receive embeds
std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto hidden_states = inputs[0];
for (auto &block : blocks) {
hidden_states = block({hidden_states})[0];
}
hidden_states = norm(hidden_states);
return {hidden_states};
}

void clear_kvcache() {
for (auto &block : blocks) {
auto kvcache = block.get_attention().get_cache();
for (auto &cache : kvcache) {
cache->clearCache();
}
}
}

private:
std::vector<MiniCPMDecoder> blocks;
Layer norm;
};

class MiniCPMForCausalLM final : public Module {
public:
explicit MiniCPMForCausalLM(const MiniCPMConfig &config) {
auto names = config.names_config;
scale_emb = config.scale_emb;
dim_model_base = config.dim_model_base;
hidden_size = config.hidden_size;
embedding = Embedding(config.vocab_size, config.hidden_size, names.token_embd_name);
model = MiniCPMModel(config, names, names.blk_name);
lm_head = Parameter(1, config.vocab_size, 1, config.hidden_size, names.token_embd_name + ".weight"); // LM head weight is tied to the token embedding
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto x = embedding(inputs[0]) * scale_emb; // MiniCPM scales embeddings by scale_emb
auto outputs = model({x})[0];
outputs = outputs / (hidden_size / dim_model_base); // logit scaling before the LM head
outputs = Tensor::mm(outputs, lm_head().transpose(Chl::SEQUENCE, Chl::DIMENSION));
return {outputs};
}
void clear_kvcache() {
model.clear_kvcache();
}

private:
int hidden_size;
float dim_model_base;
bool tie_embedding_words;
float scale_emb;
Layer embedding;
Parameter lm_head;
MiniCPMModel model;
};

#endif // MODELING_MINICPM_HPP
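The model applies three scaling tweaks, all visible above: embeddings are multiplied by scale_emb, each decoder sublayer's output is scaled by scale_depth / sqrt(num_hidden_layers) before its residual add, and hidden states are divided by hidden_size / dim_model_base ahead of the LM head. A small sketch of the resulting constants for the 2B configuration (values copied from MiniCPMConfig):

#include <cmath>
#include <cstdio>

int main() {
    const float scale_depth = 1.4f; // from MiniCPMConfig ("2B")
    const int num_hidden_layers = 40;
    const int hidden_size = 2304;
    const float dim_model_base = 256.0f;

    // Residual-branch scale used in MiniCPMDecoder::Forward.
    std::printf("residual scale: %.4f\n",
                scale_depth / std::sqrt((float)num_hidden_layers)); // ~0.2214
    // Logit divisor used in MiniCPMForCausalLM::Forward.
    std::printf("logit divisor:  %.1f\n", hidden_size / dim_model_base); // 9.0
    return 0;
}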