diff --git a/README.md b/README.md
index 79b22f9a6..c672f6a77 100644
--- a/README.md
+++ b/README.md
@@ -433,9 +433,10 @@ LD_LIBRARY_PATH=$CONDA_PREFIX/lib ./build/run ../${MODEL_NAME}.so -z ../${MODEL_
 ## Mobile and Edge Execution Test (x86)
 
 You can also run the model with the runner-et. This requires you first build the runner. See instructions [here](#setting-up-executorch-and-runner-et).
+After this is done, you can run runner-et with
 
 ```
-./runner-et/cmake-out/runner_et run ${MODEL_OUT}/model.pte -z ${MODEL_OUT}/tokenizer.bin -i "Hello my name is"
+./build/cmake-out/runner_et ${MODEL_OUT}/model.pte -z ${MODEL_OUT}/tokenizer.bin -i "Once upon a time in a land far away"
 ```
 
 While we have shown the export and execution of a small model to a mobile/edge
diff --git a/runner/run.cpp b/runner/run.cpp
index edf5b1eb7..27c16cc95 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -52,12 +52,12 @@ typedef struct {
     Config config; // the hyperparameters of the architecture (the blueprint)
     RunState state; // buffers for the "wave" of activations in the forward pass
-#ifdef __AOTI_MODEL__ 
+#ifdef __AOTI_MODEL__
     torch::inductor::AOTIModelContainerRunnerCpu *runner;
 #else // __ET_MODEL__
     Module* runner;
 #endif
-    
+
 } Transformer;
 
 void malloc_run_state(RunState* s, Config* p) {
@@ -102,6 +102,7 @@ void build_transformer(Transformer *t, char* checkpoint_path, int vocab_size, in
     t->runner = new Module(
         /* path to PTE model */ checkpoint_path,
         /* PTE mmap settings */ Module::MlockConfig::UseMlockIgnoreErrors
+    );
 #endif
 }
 
@@ -153,7 +154,7 @@ float* forward(Transformer* transformer, int token, int pos) {
     torch::Tensor result = transformer->runner->run(inputs)[0];
     auto logits = result[0].data_ptr();
-    
+
 #else // __ET_MODEL__
     ManagedTensor pos_managed(
         pos_buffer, sizeof(int64_t), { 1 }, ScalarType::Long);
@@ -171,11 +172,14 @@ float* forward(Transformer* transformer, int token, int pos) {
     inputs.push_back(tmp1);
     inputs.push_back(tmp2);
     Result<std::vector<EValue>> outputs_res = transformer->runner->forward(inputs);
-    assert (outputs_res.ok());
+    if (!outputs_res.ok()) {
+        fprintf(stderr, "Executorch forward() failed.\n");
+        exit(EXIT_FAILURE);
+    }
     std::vector<EValue> result = outputs_res.get();
     auto logits = result[0].toTensor().const_data_ptr();
 #endif
-    
+
     memcpy(s->logits, logits, p->vocab_size * sizeof(float));
     return s->logits;
 }
@@ -547,9 +551,8 @@ long time_in_ms() {
 
 // generation loop
 void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, const char *prompt, int steps) {
-    const char *empty_prompt = "";
-    if (prompt == NULL) { prompt = empty_prompt; }
-    prompt = "Once upon a time";
+    const char *default_prompt = "Once upon a time";
+    if (prompt == NULL) { prompt = default_prompt; }
 
     // encode the (string) prompt into tokens sequence
     int num_prompt_tokens = 0;
@@ -560,10 +563,12 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
         exit(EXIT_FAILURE);
     }
 
+#ifdef DEBUG
     std::cerr << "# " << num_prompt_tokens << "\n";
     for(int i = 0; i < num_prompt_tokens; i++)
         std::cerr << "[" << i << "] " << prompt_tokens[i];
     std::cerr << "\n";
+#endif
 
     // start the main loop
     long start = 0; // used to time our code, only initialized after first iteration
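
A note on the `assert` → explicit check change in `forward()`: `assert` compiles to a no-op when `NDEBUG` is defined, so in a release build a failed ExecuTorch `forward()` call would be silently ignored and the runner would read garbage logits. Below is a minimal standalone sketch of the check-before-`get()` pattern, assuming the same ExecuTorch `Module` API that `run.cpp` already uses; the `model.pte` path and the empty input vector are placeholders:

```cpp
// Minimal sketch of the Result-checking pattern adopted in the diff above.
// Assumes the ExecuTorch Module API; "model.pte" is a placeholder path.
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <executorch/extension/module/module.h>

using torch::executor::EValue;
using torch::executor::Module;
using torch::executor::Result;

int main() {
  // Load a PTE program; mirrors build_transformer() in run.cpp.
  Module module("model.pte", Module::MlockConfig::UseMlockIgnoreErrors);

  std::vector<EValue> inputs; // populate with token/pos tensors before calling

  // forward() returns a Result, which may hold an Error instead of a value.
  Result<std::vector<EValue>> outputs = module.forward(inputs);
  if (!outputs.ok()) {
    // Unlike assert(), this check survives release builds (where NDEBUG
    // compiles asserts away), so a failed forward() cannot go unnoticed.
    fprintf(stderr, "Executorch forward() failed.\n");
    return EXIT_FAILURE;
  }

  // Only call get() after ok() has returned true.
  std::vector<EValue> result = outputs.get();
  (void)result; // logits would be read from result[0].toTensor() here
  return EXIT_SUCCESS;
}
```

The same pattern applies to any ExecuTorch call that returns a `Result`.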