Include server in releases + other build system cleanups #1610

Merged
.github/workflows/build.yml: 16 changes (8 additions, 8 deletions)

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

     steps:
       - name: Clone
@@ -292,7 +292,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DLLAMA_CUBLAS=ON
+        cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
         cmake --build . --config Release

       - name: Get commit hash
Makefile: 13 changes (11 additions, 2 deletions)

@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h

 #
 # Examples
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
examples/server/server.cpp: 16 changes (8 additions, 8 deletions)

@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
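The two hunks above switch the prompt-comparison loop index from int to size_t and add an explicit int32_t cast where the index is compared against n_past. A minimal self-contained sketch of that pattern follows; the names prompt and n_past are stand-ins for the corresponding server.cpp fields, not the actual code:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> prompt = {101, 102, 103};
    int n_past = 2;  // tokens already evaluated (signed, as in the server context)

    // size_t matches the return type of std::vector::size(), so the loop
    // condition no longer triggers -Wsign-compare.
    for (size_t i = 0; i < prompt.size(); i++) {
        // An unsigned i - 1 wraps to a huge value when i == 0; casting to
        // int32_t first keeps the comparison in signed arithmetic.
        if (int32_t(i) - 1 < n_past) {
            std::printf("token %zu falls inside the evaluated range\n", i);
        }
    }
    return 0;
}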
@@ -136,7 +136,7 @@ struct llama_server_context
         {
             // out of user input, sample next token
             const float temp = params.temp;
-            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
             const float top_p = params.top_p;
             const float tfs_z = params.tfs_z;
             const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
-            int match_token = 1;
+            size_t match_token = 1;
             if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
                 if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (int i = 1; i < word_tokens.size(); i++)
+                    for (size_t i = 1; i < word_tokens.size(); i++)
                     {
                         if (i >= tokens_predicted.size()) {
                             match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)

     Server svr;

-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
              { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });

     svr.Post("/completion", [&llama](const Request &req, Response &res)
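The Get handler above is a cpp-httplib route registration (httplib.h is the single-header library bundled under examples/server). Dropping the name of the unused Request parameter keeps the signature the library expects while silencing unused-parameter warnings. A minimal sketch, assuming the bundled httplib.h is on the include path; the address and port here are placeholders, not the server's actual defaults:

#include "httplib.h"   // bundled single-header cpp-httplib (examples/server/httplib.h)

int main() {
    httplib::Server svr;

    // The Request parameter is intentionally unnamed: the root handler never
    // reads it, so an unnamed parameter avoids -Wunused-parameter while
    // keeping the handler signature httplib expects.
    svr.Get("/", [](const httplib::Request &, httplib::Response &res) {
        res.set_content("<h1>llama.cpp server works</h1>", "text/html");
    });

    svr.listen("127.0.0.1", 8080);  // placeholder address/port for the sketch
    return 0;
}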
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                     {"tokens_predicted", llama.num_tokens_predicted}};
                 return res.set_content(data.dump(), "application/json");
             }
-            catch (json::exception e)
+            catch (const json::exception &e)
             {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                     {"content", result },
                     {"stop", !llama.has_next_token }};
                 return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
                     {"content", "" },
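The last two hunks change catch (json::exception e) to catch (const json::exception &e). Catching by const reference avoids copying the exception object and cannot slice a derived exception type, which is what compilers flag with warnings such as -Wcatch-value. A small sketch of the same try/catch shape around nlohmann::json serialization; to_json_or_empty is a hypothetical helper name, and json.hpp refers to the bundled single header under examples/server:

#include <cstdio>
#include <string>
#include "json.hpp"   // bundled nlohmann::json single header (examples/server/json.hpp)

using json = nlohmann::json;

// Serialize a string that may contain invalid UTF-8; fall back to an empty
// content field if the serializer rejects it, mirroring the handlers above.
static std::string to_json_or_empty(const std::string &content) {
    try {
        json data = {{"content", content}};
        return data.dump();  // throws a json::exception on invalid UTF-8 by default
    } catch (const json::exception &e) {  // const reference: no copy, no slicing
        std::fprintf(stderr, "json error: %s\n", e.what());
        json data = {{"content", ""}};
        return data.dump();
    }
}

int main() {
    std::printf("%s\n", to_json_or_empty("hello").c_str());
    return 0;
}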