cam: parallel simple fixes #3348

Closed
wants to merge 1 commit into from
29 changes: 16 additions & 13 deletions examples/simple/simple.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"

+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <string>
@@ -42,7 +43,9 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();

     ctx_params.seed = 1234;
-    ctx_params.n_ctx = 2048;
+    ctx_params.n_ctx = n_len*n_parallel; // FIXME: use n_kv_req instead (tokenize with model after #3301)
+    ctx_params.n_batch = std::max(n_len, n_parallel);
+    // ctx_params.n_gpu_layers = 99; // offload all layers to the GPU

     llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

@@ -66,11 +69,11 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);
     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
         LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
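To make the sizing above concrete, here is a rough worked example with made-up numbers; none of the values below (including the name n_prompt) come from this PR.

    // hypothetical values, for illustration only
    const int n_prompt   = 8;  // tokens_list.size()
    const int n_len      = 32;
    const int n_parallel = 4;

    // the prompt is evaluated once and shared by all streams; each stream then
    // generates its own (n_len - n_prompt) tokens
    const int n_kv_req = n_prompt + (n_len - n_prompt)*n_parallel; // 8 + 24*4 = 104

    // the new ctx_params.n_ctx = n_len*n_parallel = 128 >= 104, so the check passes;
    // the old hard-coded 2048 would trip this check for larger runs (e.g. n_len = 512
    // with n_parallel = 8 needs roughly 4000 KV slots), while n_len*n_parallel scales
    // with the request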
@@ -88,7 +91,7 @@ int main(int argc, char ** argv) {
     // create a llama_batch with size 512
     // we use this object to submit token data for decoding

-    llama_batch batch = llama_batch_init(512, 0);
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);

     // evaluate the initial prompt
     batch.n_tokens = tokens_list.size();
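A note on the new llama_batch_init size: the batch is reused for two phases, so it needs room for whichever is larger, the whole prompt (submitted in a single call before the loop) or one sampled token per stream during generation; hence std::max(tokens_list.size(), (size_t)n_parallel) rather than a hard-coded 512. Below is a minimal sketch of the two fill patterns, assuming the llama_batch fields this example used at the time (token, pos, seq_id, logits, n_tokens); it is not the PR's literal code.

    // prompt phase: every prompt token goes into the batch as sequence 0,
    // so at least tokens_list.size() slots are needed
    batch.n_tokens = tokens_list.size();
    for (int32_t k = 0; k < batch.n_tokens; k++) {
        batch.token[k]  = tokens_list[k];
        batch.pos[k]    = k;
        batch.seq_id[k] = 0;
        batch.logits[k] = false;
    }
    batch.logits[batch.n_tokens - 1] = true; // logits only for the last prompt token

    // generation phase: at most one freshly sampled token per stream per step,
    // so at least n_parallel slots are needed -- std::max covers both cases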
@@ -133,12 +136,6 @@ int main(int argc, char ** argv) {
     const auto t_main_start = ggml_time_us();

     while (n_cur <= n_len) {
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
-
         // prepare the next batch
         batch.n_tokens = 0;

@@ -149,8 +146,8 @@ int main(int argc, char ** argv) {
                 continue;
             }

-            auto n_vocab = llama_n_vocab(ctx);
-            auto logits = llama_get_logits(ctx) + i_batch[i] * n_vocab;
+            auto n_vocab = llama_n_vocab(ctx);
+            auto * logits = llama_get_logits(ctx) + i_batch[i] * n_vocab;

             std::vector<llama_token_data> candidates;
             candidates.reserve(n_vocab);
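For context on the pointer change above: llama_get_logits(ctx) returns the logits of the most recently decoded batch, n_vocab floats per token, and i_batch[i] records the position inside that batch of the token belonging to stream i, so the offset selects that stream's row. Here is a hedged sketch of how such a row is typically turned into a token with the sampling API of that era; the example's actual sampling code is collapsed in this view and may differ.

    // given: logits points at stream i's row of n_vocab floats
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // greedy pick for simplicity; the real example may apply top-k/top-p first
    const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);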
@@ -178,7 +175,7 @@ int main(int argc, char ** argv) {
                 i_batch[i] = -1;
                 LOG_TEE("\n");
                 if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished", __func__, i);
+                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }

                 continue;
@@ -211,6 +208,12 @@ int main(int argc, char ** argv) {
         }

         n_cur += 1;
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
     }

     LOG_TEE("\n");