From 6f2aab2176b789111c853107d2a06fb3b8f55e89 Mon Sep 17 00:00:00 2001 From: Mike Heavers Date: Fri, 22 Nov 2024 11:28:00 -0800 Subject: [PATCH] Update README.md Add an example of using the /completions endpoint with Curl or Javascript - shows how to use Llamafile to do code completion. Could consider posting this on the main README.md as well. --- llama.cpp/server/README.md | 65 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/llama.cpp/server/README.md b/llama.cpp/server/README.md index 6a2333452a..d3420a26c9 100755 --- a/llama.cpp/server/README.md +++ b/llama.cpp/server/README.md @@ -189,6 +189,65 @@ node index.js `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) +### Examples + +**CODE COMPLETION** + +You can use the `/completion` endpoint for code completion (Fill-in-the-Middle, or FIM, completion) with the following prompt syntax: + +
+Curl API Client Example + +```bash +curl 'http://127.0.0.1:8080/completion' \ +-X POST -H "Content-Type: application/json" \ +-H "Authorization: Bearer no-key" --data-binary \ +'{ + "model": "LlaMA_CPP", + "stream": false, + "prompt": "<|fim_prefix|>[CODE_BEFORE_CURSOR]<|fim_suffix|>[CODE_AFTER_CURSOR]<|fim_middle|>", + "temperature": 0.1, + "n_predict": 512, + "cache_prompt": true, + "stop": ["<|fim_middle|>", "\n\n", "<|endoftext|>"] +}' +``` +
+ +
+ JavaScript (TypeScript) API Client Example + + ```typescript + const generateCompletion = async (prefix: string, suffix: string) => { + try { + const response = await fetch('http://127.0.0.1:8080/completion', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: 'Bearer no-key', + }, + body: JSON.stringify({ + model: 'LlaMA_CPP', + stream: false, + prompt: `<|fim_prefix|>${prefix}<|fim_suffix|>${suffix}<|fim_middle|>`, + temperature: 0.1, + n_predict: 512, + cache_prompt: true, + stop: ['<|fim_middle|>', '\n\n', '<|endoftext|>'], + }), + }); + return await response.json(); + } catch (error) { + console.error('Completion error:', error); + return null; + } + }; + + const completionResult = await generateCompletion('YOUR_PREFIX', 'YOUR_SUFFIX'); + ``` +
+ + ### Result JSON: * Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. @@ -274,12 +333,12 @@ Notice that each `probs` is an array of length `n_probs`. ```python import openai - + client = openai.OpenAI( base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port" api_key = "sk-no-key-required" ) - + completion = client.chat.completions.create( model="gpt-3.5-turbo", messages=[ @@ -287,7 +346,7 @@ Notice that each `probs` is an array of length `n_probs`. {"role": "user", "content": "Write a limerick about python exceptions"} ] ) - + print(completion.choices[0].message) ``` ... or raw HTTP requests: