Commit e36f75a: minimize diff

ochafik committed Feb 19, 2025
1 parent 83aa5bf · commit e36f75a

Showing 4 changed files with 1 addition and 9 deletions.
common/sampling.cpp (0 additions, 3 deletions)

@@ -165,9 +165,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        // ^(a|b|(?:[\s\S]*?)(c|d))(?:.*)
-        // ^[\s\S]*?(c|d)[\s\S]*
-        // ^(a|b)[\s\S]*
         std::vector<std::string> escaped_triggers_at_start;
         std::vector<std::string> escaped_triggers_anywhere;
         for (const auto & trigger : params.grammar_trigger_words) {
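For context, the comments deleted above documented the shapes of the regular expressions the sampler builds from grammar_trigger_words: one pattern for triggers that must open the output, one for triggers allowed anywhere. A rough Python illustration of those two shapes (a sketch with made-up trigger words, not the C++ implementation):

    import re

    triggers_at_start = ["<tool_call>"]   # hypothetical trigger word
    triggers_anywhere = ["```json"]       # hypothetical trigger word

    # ^(a|b)[\s\S]* : the trigger must open the output
    at_start = r"^(" + "|".join(map(re.escape, triggers_at_start)) + r")[\s\S]*"
    # ^[\s\S]*?(c|d)[\s\S]* : lazily scan forward to a trigger anywhere
    anywhere = r"^[\s\S]*?(" + "|".join(map(re.escape, triggers_anywhere)) + r")[\s\S]*"

    assert re.match(at_start, '<tool_call>{"name": "get_weather"}')
    assert re.match(anywhere, "Sure thing: ```json {}")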
examples/server/server.cpp (1 addition, 1 deletion)

@@ -2046,8 +2046,8 @@ struct server_context {

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }

         if (slot.params.ignore_eos && has_eos_token) {
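The hunk above reverts a local tweak to the warning line, restoring the upstream ordering: clamp the request's n_predict to the server's configured maximum, then log. A minimal Python sketch of that clamp-and-warn behavior (illustrative only, not the server code):

    def clamp_n_predict(requested: int, server_cap: int) -> int:
        # If the request asks for more tokens than the server allows, cap it.
        # (The comment in the diff notes that rejecting with HTTP 400 is an alternative.)
        if server_cap > 0 and requested > server_cap:
            print(f"warn: n_predict = {requested} exceeds server configuration, setting to {server_cap}")
            return server_cap
        return requested

    assert clamp_n_predict(4096, 512) == 512
    assert clamp_n_predict(128, 512) == 128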
examples/server/tests/utils.py (0 additions, 3 deletions)

@@ -51,7 +51,6 @@ class ServerProcess:
     model_url: str | None = None
     model_file: str | None = None
     model_draft: str | None = None
-    model_draft_hf_repo: str | None = None
     n_threads: int | None = None
     n_gpu_layer: int | None = None
     n_batch: int | None = None

@@ -119,8 +118,6 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--model-draft", self.model_draft])
         if self.model_hf_repo:
             server_args.extend(["--hf-repo", self.model_hf_repo])
-        if self.model_draft_hf_repo:
-            server_args.extend(["--hf-repo-draft", self.model_draft_hf_repo])
         if self.model_hf_file:
             server_args.extend(["--hf-file", self.model_hf_file])
         if self.n_batch:
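ServerProcess.start() follows a uniform pattern: each optional field set on the dataclass is translated into a llama-server CLI flag plus its value, and unset fields are skipped. A condensed sketch of that mapping idea, assuming a reduced field set (not the actual test harness; the model_file-to---model pairing is an assumption):

    from __future__ import annotations
    from dataclasses import dataclass

    @dataclass
    class MiniServerProcess:
        model_file: str | None = None
        model_hf_repo: str | None = None
        model_hf_file: str | None = None

        def build_args(self) -> list[str]:
            args: list[str] = []
            # append "--flag value" only for fields that were set
            for value, flag in [
                (self.model_file, "--model"),        # assumed pairing
                (self.model_hf_repo, "--hf-repo"),
                (self.model_hf_file, "--hf-file"),
            ]:
                if value:
                    args.extend([flag, value])
            return args

    print(MiniServerProcess(model_hf_repo="org/model-GGUF").build_args())
    # ['--hf-repo', 'org/model-GGUF']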
scripts/tool_bench.py (0 additions, 2 deletions)

@@ -67,7 +67,6 @@ def stop():
     parser = argparse.ArgumentParser(description='Run tests for the chat server.')
     parser.add_argument('--model', type=str, help='Name of the model to test (implementation agnostic)', required=True)
     parser.add_argument('--hf', type=str, help='GGUF huggingface model repo id (+ optional quant) to test w/ llama-server')
-    parser.add_argument('--hfd', type=str, help='GGUF huggingface draft model repo id (+ optional quant) to test w/ llama-server')
     parser.add_argument('--chat-template', type=str, help='Chat template override for llama-server')
     parser.add_argument('--ollama', type=str, help='Ollama model tag to test')
     parser.add_argument('--n', type=int, help='Number of times to run each test', default=30)

@@ -162,7 +161,6 @@ def elapsed():
         server.n_predict = 512 # High because of DeepSeek R1
         server.model_hf_repo = args.hf
         server.model_hf_file = None
-        server.model_draft_hf_repo = args.hfd
         server.chat_template = args.chat_template
         if args.port is not None:
             server.server_port = args.port
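Given the argparse flags above, an invocation of the benchmark looks roughly like this (model name, repo id, and run count are hypothetical placeholders):

    python scripts/tool_bench.py --model "Some Model 1B Q4_K_M" --hf someorg/SomeModel-GGUF:Q4_K_M --n 10

With this commit the --hfd draft-model flag is gone, so a speculative-decoding draft model can no longer be configured from this script.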
