From cda83f8b9fbb3f4259ab13114f0b4f3724f42eeb Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Thu, 1 Aug 2024 16:06:23 -0700
Subject: [PATCH] Make GGML threads spawn 10x faster

This change causes GGML threads to be recycled. That way there's less of
a gap between predictions in the tracing diagram. It has the most impact
on smaller models like TinyLLaMA, where I'm seeing a ~15% boost in tokens
per second when generating text. The new llamafiler embeddings server is
going ~25% faster, handling ~1000 requests per second on my workstation.
This change also boosts my TinyLLaMA prefill speed ~20%, by letting CONT
be multi-threaded, which Slaren apparently discovered upstream last month.
---
 llama.cpp/ggml.c                  |  34 +++--
 llamafile/BUILD.mk                |  28 ++--
 llamafile/core_manager.cpp        |   6 +-
 llamafile/llamafile.c             |   2 +-
 llamafile/pool.cpp                | 212 ++++++++++++++++++++++++++++++
 llamafile/pool.h                  |  15 +++
 llamafile/pool_cancel_test.cpp    |  36 +++++
 llamafile/pool_test.cpp           |  87 ++++++++++++
 llamafile/server/main.cpp         |   4 +-
 llamafile/server/server.cpp       |   8 +-
 llamafile/server/worker.cpp       |   4 +-
 llamafile/tinyblas_cpu_mixmul.inc |   1 +
 llamafile/zipalign.c              |  12 +-
 13 files changed, 406 insertions(+), 43 deletions(-)
 create mode 100644 llamafile/pool.cpp
 create mode 100644 llamafile/pool.h
 create mode 100644 llamafile/pool_cancel_test.cpp
 create mode 100644 llamafile/pool_test.cpp

diff --git a/llama.cpp/ggml.c b/llama.cpp/ggml.c
index fc306a8f2f..519c25deae 100644
--- a/llama.cpp/ggml.c
+++ b/llama.cpp/ggml.c
@@ -44,6 +44,7 @@ SOFTWARE.");
#include "llamafile/thread.h"
#include "llamafile/crash.h"
#include "llamafile/trace.h"
+#include "llamafile/pool.h"
#include
#include
@@ -1651,7 +1652,7 @@ struct ggml_compute_state_shared {
    void* abort_callback_data;
};

-typedef pthread_t ggml_thread_t;
+typedef llamafile_task_t ggml_thread_t;

struct ggml_compute_state {
    _Atomic(ggml_thread_t) thrd;
@@ -13302,6 +13303,7 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
    dims[1] = MIN(n_dims - 1, end);
}

+__target_clones("avx2") // [jart]
static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
@@ -18355,10 +18357,11 @@ typedef int ggml_lock_t;

#define GGML_LOCK_INITIALIZER 0

-typedef pthread_t ggml_thread_t;
+typedef llamafile_task_t ggml_thread_t;

-#define ggml_thread_create llamafile_thread_create // [jart]
-#define ggml_thread_join pthread_join
+#define ggml_thread_create llamafile_task_create // [jart]
+#define ggml_thread_cancel llamafile_task_cancel
+#define ggml_thread_join llamafile_task_join

#else

@@ -18382,10 +18385,11 @@ typedef int ggml_lock_t;

#define GGML_LOCK_INITIALIZER 0

-typedef pthread_t ggml_thread_t;
+typedef llamafile_task_t ggml_thread_t;

-#define ggml_thread_create pthread_create
-#define ggml_thread_join pthread_join
+#define ggml_thread_create llamafile_task_create // [jart]
+#define ggml_thread_cancel llamafile_task_cancel
+#define ggml_thread_join llamafile_task_join

#endif

@@ -18484,6 +18488,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
    switch (node->op) {
        case GGML_OP_CPY:
        case GGML_OP_DUP:
+        case GGML_OP_CONT: // [jart] don't move me
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
@@ -18568,7 +18573,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
            } break;
        case GGML_OP_SCALE:
        case GGML_OP_SET:
-        case GGML_OP_CONT:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
@@ -19185,10 +19189,10 @@ static void ggml_compute_canceled(void *arg) {
    struct ggml_compute_cleanup *cleanup = arg;
    clear_numa_thread_affinity();
    for (int j = 1; j < cleanup->n_threads; j++) {
-        pthread_t t;
+        ggml_thread_t t;
        if ((t = atomic_exchange_explicit(&cleanup->workers[j].thrd, 0, memory_order_relaxed))) {
-            pthread_cancel(t);
+            ggml_thread_cancel(t);
            const int rc = ggml_thread_join(t, NULL);
            GGML_ASSERT(rc == 0);
        }
@@ -19241,14 +19245,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                .is_main_thread = false, // [jart]
            };
-            pthread_attr_t attr;
-            pthread_attr_init(&attr);
-            pthread_attr_setstacksize(&attr, 128 * 1024);
-            pthread_attr_setguardsize(&attr, sysconf(_SC_PAGESIZE));
-            pthread_attr_setsigaltstacksize_np(&attr, sysconf(_SC_MINSIGSTKSZ) + 16384);
-            const int rc = ggml_thread_create((pthread_t *)&workers[j].thrd, &attr,
+            const int rc = ggml_thread_create((ggml_thread_t *)&workers[j].thrd,
                                               ggml_graph_compute_thread, &workers[j]);
-            pthread_attr_destroy(&attr);
            GGML_ASSERT(rc == 0);
            UNUSED(rc);
        }
@@ -19276,7 +19274,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
    int cs;
    pthread_setcancelstate(PTHREAD_CANCEL_MASKED, &cs);
    for (int j = 1; j < n_threads; j++) {
-        pthread_t t;
+        ggml_thread_t t;
        if ((t = atomic_exchange_explicit(&workers[j].thrd, 0, memory_order_relaxed))) {
            const int rc = ggml_thread_join(t, NULL);
diff --git a/llamafile/BUILD.mk b/llamafile/BUILD.mk
index 0e599dbb18..86dae40a73 100644
--- a/llamafile/BUILD.mk
+++ b/llamafile/BUILD.mk
@@ -42,14 +42,16 @@ o/$(MODE)/llamafile/tokenize: \
	o/$(MODE)/llama.cpp/llama.cpp.a

.PHONY: o/$(MODE)/llamafile
-o/$(MODE)/llamafile: \
-	$(LLAMAFILE_OBJS) \
-	o/$(MODE)/llamafile/server \
-	o/$(MODE)/llamafile/simple \
-	o/$(MODE)/llamafile/zipalign \
-	o/$(MODE)/llamafile/zipcheck \
-	o/$(MODE)/llamafile/tokenize \
-	o/$(MODE)/llamafile/addnl \
+o/$(MODE)/llamafile: \
+	$(LLAMAFILE_OBJS) \
+	o/$(MODE)/llamafile/server \
+	o/$(MODE)/llamafile/simple \
+	o/$(MODE)/llamafile/zipalign \
+	o/$(MODE)/llamafile/zipcheck \
+	o/$(MODE)/llamafile/tokenize \
+	o/$(MODE)/llamafile/addnl \
+	o/$(MODE)/llamafile/pool_test.runs \
+	o/$(MODE)/llamafile/pool_cancel_test.runs \

################################################################################
# microarchitectures
@@ -141,6 +143,16 @@ o/$(MODE)/llamafile/tinyblas_cpu_sgemm_arm82.o: \

################################################################################
# testing

+o/$(MODE)/llamafile/pool_test: \
+	o/$(MODE)/llamafile/pool_test.o \
+	o/$(MODE)/llamafile/crash.o \
+	o/$(MODE)/llamafile/pool.o \
+
+o/$(MODE)/llamafile/pool_cancel_test: \
+	o/$(MODE)/llamafile/pool_cancel_test.o \
+	o/$(MODE)/llamafile/crash.o \
+	o/$(MODE)/llamafile/pool.o \
+
o/$(MODE)/llamafile/thread_test: \
	o/$(MODE)/llamafile/thread_test.o \
	o/$(MODE)/llamafile/crash.o \
diff --git a/llamafile/core_manager.cpp b/llamafile/core_manager.cpp
index 05bbf65531..d7627c0fd8 100644
--- a/llamafile/core_manager.cpp
+++ b/llamafile/core_manager.cpp
@@ -36,8 +36,8 @@ static void unlock_mutex(void *arg) {
}

int CoreManager::acquire(int need, int greed) {
-    unassert(need >= 1);
-    unassert(greed >= need);
+    npassert(need >= 1);
+    npassert(greed >= need);

    int got = 0;

@@ -80,5 +80,5 @@ void CoreManager::release(int count) {
    }
    pthread_cond_signal(&cv_);
    pthread_mutex_unlock(&mu_);
-    unassert(ok);
+    npassert(ok);
}
diff --git a/llamafile/llamafile.c b/llamafile/llamafile.c
index a98755f05f..161d31155d 100644
--- a/llamafile/llamafile.c
+++ b/llamafile/llamafile.c
@@ -330,7 +330,7 @@ size_t llamafile_tell(struct llamafile *file) {
    if (!file->fp)
        return file->position;
    long ret = ftell(file->fp);
-    unassert(ret != -1); // shouldn't fail because we seeked earlier
+    npassert(ret != -1); // shouldn't fail because we seeked earlier
    return (size_t)ret;
}
diff --git a/llamafile/pool.cpp b/llamafile/pool.cpp
new file mode 100644
index 0000000000..0fc5ce426b
--- /dev/null
+++ b/llamafile/pool.cpp
@@ -0,0 +1,212 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pool.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "threadlocal.h"
+
+struct llamafile_thread;
+static void llamafile_thread_canceled(llamafile_thread *);
+static ThreadLocal g_key(llamafile_thread_canceled);
+
+struct llamafile_task {
+    pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+    pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
+    void *(*func)(void *);
+    void *arg;
+    void *res;
+    pthread_t th = -1;
+};
+
+struct llamafile_thread {
+    pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+    pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;
+    llamafile_task *task;
+    llamafile_thread *next;
+    pthread_t th;
+};
+
+static atomic_int g_active;
+static _Atomic(llamafile_thread *) g_idle;
+
+static void unlock_mutex(void *arg) {
+    pthread_mutex_t *mu = (pthread_mutex_t *)arg;
+    pthread_mutex_unlock(mu);
+}
+
+static void idle_push(llamafile_thread *thread) {
+    int backoff = 0;
+    thread->next = atomic_load_explicit(&g_idle, memory_order_relaxed);
+    while (!atomic_compare_exchange_weak_explicit(&g_idle, &thread->next, thread,
+                                                  memory_order_acq_rel, memory_order_relaxed))
+        backoff = pthread_delay_np(&g_idle, backoff);
+}
+
+static llamafile_thread *idle_pop(void) {
+    int backoff = 0;
+    llamafile_thread *thread;
+    for (;;) {
+        if ((thread = atomic_load_explicit(&g_idle, memory_order_relaxed))) {
+            if (atomic_compare_exchange_weak_explicit(&g_idle, &thread, thread->next,
+                                                      memory_order_acq_rel, memory_order_relaxed))
+                return thread;
+            backoff = pthread_delay_np(g_idle, backoff);
+        } else {
+            return nullptr;
+        }
+    }
+}
+
+static void cancel_task(llamafile_task *task) {
+    pthread_mutex_lock(&task->mu);
+    task->res = PTHREAD_CANCELED;
+    task->th = 0;
+    pthread_cond_signal(&task->cv);
+    pthread_mutex_unlock(&task->mu);
+}
+
+static void llamafile_thread_canceled(llamafile_thread *thread) {
+    thread->th = 0;
+    cancel_task(thread->task);
+    delete thread;
+    --g_active;
+}
+
+static void *llamafile_thread_worker(void *arg) {
+    errno_t err;
+    llamafile_thread *thread = (llamafile_thread *)arg;
+
+    ++g_active;
+    g_key.set(thread);
+    do {
+        void *res = thread->task->func(thread->task->arg);
+        pthread_setcancelstate(PTHREAD_CANCEL_MASKED, 0);
+
+        pthread_mutex_lock(&thread->task->mu);
+        thread->task->res = res;
+        thread->task->th = 0;
+        pthread_cond_signal(&thread->task->cv);
+        pthread_mutex_unlock(&thread->task->mu);
+
+        pthread_cleanup_push(unlock_mutex, &thread->mu);
+        pthread_mutex_lock(&thread->mu);
+        thread->task = nullptr;
+        idle_push(thread);
+        while (!thread->task) {
+            err = pthread_cond_wait(&thread->cv, &thread->mu);
+            if (err == ECANCELED)
+                break;
+        }
+        pthread_cleanup_pop(true);
+        pthread_setcancelstate(PTHREAD_CANCEL_DEFERRED, 0);
+    } while (err != ECANCELED);
+
+    if (thread->task)
+        cancel_task(thread->task);
+
+    thread->th = 0;
+    g_key.set(nullptr);
+    delete thread;
+    --g_active;
+
+    return 0;
+}
+
+static errno_t llamafile_thread_create(llamafile_task *task) {
+    llamafile_thread *thread = new llamafile_thread;
+    thread->task = task;
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setstacksize(&attr, 128 * 1024);
+    pthread_attr_setguardsize(&attr, sysconf(_SC_PAGESIZE));
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+    pthread_attr_setsigaltstacksize_np(&attr, sysconf(_SC_MINSIGSTKSZ) + 16384);
+    errno_t err = pthread_create(&thread->th, &attr, llamafile_thread_worker, thread);
+    pthread_attr_destroy(&attr);
+    if (!err) {
+        task->th = thread->th;
+    } else {
+        delete thread;
+    }
+    return err;
+}
+
+errno_t llamafile_task_create(llamafile_task **out_task, void *(*func)(void *), void *arg) {
+    llamafile_task *task = new llamafile_task;
+    task->func = func;
+    task->arg = arg;
+    errno_t err;
+    llamafile_thread *thread;
+    if ((thread = idle_pop())) {
+        pthread_mutex_lock(&thread->mu);
+        thread->task = task;
+        task->th = thread->th;
+        pthread_cond_signal(&thread->cv);
+        pthread_mutex_unlock(&thread->mu);
+        err = 0;
+    } else {
+        err = llamafile_thread_create(task);
+    }
+    if (!err) {
+        *out_task = task;
+    } else {
+        delete task;
+    }
+    return err;
+}
+
+errno_t llamafile_task_join(llamafile_task *task, void **out_res) {
+    pthread_cleanup_push(unlock_mutex, &task->mu);
+    pthread_mutex_lock(&task->mu);
+    while (task->th)
+        pthread_cond_wait(&task->cv, &task->mu);
+    pthread_cleanup_pop(true);
+    if (out_res)
+        *out_res = task->res;
+    delete task;
+    return 0;
+}
+
+errno_t llamafile_task_cancel(llamafile_task *task) {
+    errno_t err = 0;
+    if (task->th)
+        err = pthread_cancel(task->th);
+    return err;
+}
+
+void llamafile_task_shutdown(void) {
+    llamafile_thread *thread;
+    while ((thread = idle_pop()))
+        if (thread->th)
+            pthread_cancel(thread->th);
+    int backoff = 0;
+    while (g_active)
+        backoff = pthread_delay_np(&g_idle, backoff);
+}
+
+static struct llamafile_tasks {
+    ~llamafile_tasks(void) {
+        llamafile_task_shutdown();
+    }
+} g_tasks;
diff --git a/llamafile/pool.h b/llamafile/pool.h
new file mode 100644
index 0000000000..22d047787b
--- /dev/null
+++ b/llamafile/pool.h
@@ -0,0 +1,15 @@
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct llamafile_task *llamafile_task_t;
+
+errno_t llamafile_task_create(llamafile_task_t *, void *(*)(void *), void *);
+errno_t llamafile_task_join(llamafile_task_t, void **);
+errno_t llamafile_task_cancel(llamafile_task_t);
+void llamafile_task_shutdown(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llamafile/pool_cancel_test.cpp b/llamafile/pool_cancel_test.cpp
new file mode 100644
index 0000000000..bd52177dd9
--- /dev/null
+++ b/llamafile/pool_cancel_test.cpp
@@ -0,0 +1,36 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pool.h"
+
+#include
+#include
+#include
+
+void *waiter(void *arg) {
+    pause();
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    ShowCrashReports();
+    llamafile_task_t task;
+    npassert(!llamafile_task_create(&task, waiter, 0));
+    npassert(!llamafile_task_cancel(task));
+    npassert(!llamafile_task_join(task, 0));
+    CheckForMemoryLeaks();
+}
diff --git a/llamafile/pool_test.cpp b/llamafile/pool_test.cpp
new file mode 100644
index 0000000000..148835e3c1
--- /dev/null
+++ b/llamafile/pool_test.cpp
@@ -0,0 +1,87 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pool.h"
+
+#include
+#include
+#include
+#include
+
+#define BENCHMARK(ITERATIONS, WORK_PER_RUN, CODE)                                               \
+    do {                                                                                        \
+        struct timespec start = timespec_real();                                                \
+        for (int __i = 0; __i < ITERATIONS; ++__i) {                                            \
+            asm volatile("" ::: "memory");                                                      \
+            CODE;                                                                               \
+        }                                                                                       \
+        long long work = ((WORK_PER_RUN) ? (WORK_PER_RUN) : 1) * (ITERATIONS);                  \
+        double nanos =                                                                          \
+            (timespec_tonanos(timespec_sub(timespec_real(), start)) + work - 1) / (double)work; \
+        if (nanos < 1000) {                                                                     \
+            kprintf("%10g ns %2dx %s\n", nanos, (ITERATIONS), #CODE);                           \
+        } else {                                                                                \
+            kprintf("%10lld ns %2dx %s\n", (long long)nanos, (ITERATIONS), #CODE);              \
+        }                                                                                       \
+    } while (0)
+
+void *noop(void *arg) {
+    return arg;
+}
+
+void run_task() {
+    llamafile_task_t task;
+    npassert(!llamafile_task_create(&task, noop, 0));
+    npassert(!llamafile_task_join(task, 0));
+}
+
+void run_thread() {
+    pthread_t task;
+    npassert(!pthread_create(&task, 0, noop, 0));
+    npassert(!pthread_join(task, 0));
+}
+
+#define N 20
+
+void run_many_tasks() {
+    llamafile_task_t task[N];
+    for (int i = 0; i < N; ++i)
+        npassert(!llamafile_task_create(&task[i], noop, 0));
+    for (int i = 0; i < N; ++i)
+        npassert(!llamafile_task_join(task[i], 0));
+}
+
+void run_many_threads() {
+    pthread_t task[N];
+    for (int i = 0; i < N; ++i)
+        npassert(!pthread_create(&task[i], 0, noop, 0));
+    for (int i = 0; i < N; ++i)
+        npassert(!pthread_join(task[i], 0));
+}
+
+int main(int argc, char *argv[]) {
+    ShowCrashReports();
+    run_many_tasks();
+    BENCHMARK(10, 1, run_task());
+    BENCHMARK(10, 1, run_thread());
+    BENCHMARK(10, N, run_many_tasks());
+    BENCHMARK(10, N, run_many_threads());
+    llamafile_task_shutdown();
+    while (!pthread_orphan_np())
+        pthread_decimate_np();
+    CheckForMemoryLeaks();
+}
diff --git a/llamafile/server/main.cpp b/llamafile/server/main.cpp
index 1740cb55ea..9f7055de6b 100644
--- a/llamafile/server/main.cpp
+++ b/llamafile/server/main.cpp
@@ -20,6 +20,7 @@
#include "llama.cpp/llama.h"
#include "llamafile/llamafile.h"
+#include "llamafile/pool.h"
#include "llamafile/version.h"
#include "log.h"
@@ -75,7 +76,7 @@ main(int argc, char* argv[])
    set_thread_name("server");
    g_server = new Server(create_listening_socket(FLAG_listen));
    for (int i = 0; i < FLAG_workers; ++i)
-        unassert(!g_server->spawn());
+        npassert(!g_server->spawn());

    // run server
    signals_init();
@@ -94,6 +95,7 @@ main(int argc, char* argv[])
    SLOG("exit");

    // quality assurance
+    llamafile_task_shutdown();
    while (!pthread_orphan_np())
        pthread_decimate_np();
    CheckForMemoryLeaks();
diff --git a/llamafile/server/server.cpp b/llamafile/server/server.cpp
index bf9fa248f9..d3cf36aca4 100644
--- a/llamafile/server/server.cpp
+++ b/llamafile/server/server.cpp
@@ -38,10 +38,10 @@ Server::Server(int fd) : fd(fd)
Server::~Server()
{
-    unassert(fd == -1);
-    unassert(!worker_count.load(std::memory_order_relaxed));
-    unassert(dll_is_empty(active_workers));
-    unassert(dll_is_empty(idle_workers));
+    npassert(fd == -1);
+    npassert(!worker_count.load(std::memory_order_relaxed));
+    npassert(dll_is_empty(active_workers));
+    npassert(dll_is_empty(idle_workers));
    pthread_mutex_destroy(&lock_);
    pthread_cond_destroy(&cond_);
}
diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
index e7d0dd3e78..2750225f80 100644
--- a/llamafile/server/worker.cpp
+++ b/llamafile/server/worker.cpp
@@ -42,7 +42,7 @@ Worker::kill()
void
Worker::begin()
{
-    unassert(!working);
+    npassert(!working);
    server->lock();
    dll_remove(&server->idle_workers, &elem);
    if (dll_is_empty(server->idle_workers)) {
@@ -60,7 +60,7 @@ Worker::begin()
void
Worker::end()
{
-    unassert(working);
+    npassert(working);
    server->lock();
    dll_remove(&server->active_workers, &elem);
    working = false;
diff --git a/llamafile/tinyblas_cpu_mixmul.inc b/llamafile/tinyblas_cpu_mixmul.inc
index eb61f93c89..f897bdafd5 100644
--- a/llamafile/tinyblas_cpu_mixmul.inc
+++ b/llamafile/tinyblas_cpu_mixmul.inc
@@ -288,6 +288,7 @@ class MixMul {
        case GGML_TASK_TYPE_COMPUTE:
            assert(!(cols % BS));
            assert(!(weights->nb[1] % sizeof(TA)));
+            // TODO(jart): parallelize this loop
            for (int expert = 0; expert < experts; ++expert) {
                BLAS tb{cols / BS,
                        (const TA *)((const char *)weights->data + expert * weights->nb[2]),
diff --git a/llamafile/zipalign.c b/llamafile/zipalign.c
index 3ef8d6c623..7dca10a087 100644
--- a/llamafile/zipalign.c
+++ b/llamafile/zipalign.c
@@ -312,7 +312,7 @@ int main(int argc, char *argv[]) {
        case Z_MEM_ERROR:
            DieOom();
        default:
-            unassert(!"deflateInit2() called with invalid parameters");
+            npassert(!"deflateInit2() called with invalid parameters");
        }
    }
@@ -345,7 +345,7 @@ int main(int argc, char *argv[]) {
            case Z_MEM_ERROR:
                DieOom();
            case Z_STREAM_ERROR:
-                unassert(!"deflate() stream error");
+                npassert(!"deflate() stream error");
            default:
                break;
            }
@@ -357,7 +357,7 @@ int main(int argc, char *argv[]) {
        }
    }
    if (flag_level)
-        unassert(deflateEnd(&zs) == Z_OK);
+        npassert(deflateEnd(&zs) == Z_OK);

    // write local file header
    uint8_t *lochdr = Malloc(hdrlen);
@@ -381,7 +381,7 @@ int main(int argc, char *argv[]) {
        p = ZIP_WRITE64(p, size);       // uncompressed size
        p = ZIP_WRITE64(p, compsize);   // compressed size
    }
-    unassert(p == lochdr + hdrlen);
+    npassert(p == lochdr + hdrlen);
    if (pwrite(zfd, lochdr, hdrlen, zsize) != hdrlen)
        DieSys(zpath);
    free(lochdr);
@@ -418,7 +418,7 @@ int main(int argc, char *argv[]) {
        p = ZIP_WRITE64(p, size);       // uncompressed size
        p = ZIP_WRITE64(p, compsize);   // compressed size
        p = ZIP_WRITE64(p, zsize);      // lfile offset
-    unassert(p == cdirhdr + hdrlen);
+    npassert(p == cdirhdr + hdrlen);

    // finish up
    ++cnt;
@@ -461,7 +461,7 @@ int main(int argc, char *argv[]) {
    p = ZIP_WRITE32(p, cdirsize);     // size of central directory
    p = ZIP_WRITE32(p, 0xffffffffu);  // offset of central directory
    p = ZIP_WRITE16(p, 0);            // comment length
-    unassert(p == eocd + sizeof(eocd));
+    npassert(p == eocd + sizeof(eocd));
    if (pwrite(zfd, eocd, sizeof(eocd), zsize + cdirsize) != sizeof(eocd))
        DieSys(zpath);
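
Editor's note (not part of the commit): below is a minimal usage sketch of the task pool API this patch adds in llamafile/pool.h, to show how repeated create/join cycles reuse parked threads instead of calling pthread_create() each time, which is the mechanism behind the speedup described in the commit message. The worker function, the loop counts, and the include path are illustrative assumptions; the example also assumes the Cosmopolitan toolchain used by llamafile (for errno_t) and linking against llamafile/pool.o.

```c
// Sketch: driving the recycled-thread API from llamafile/pool.h.
// double_it() and the round/task counts are made up for demonstration.
#include <stdio.h>
#include "llamafile/pool.h"

static void *double_it(void *arg) { // hypothetical worker function
    long *x = (long *)arg;
    *x *= 2;
    return x;
}

int main(void) {
    long vals[8];
    llamafile_task_t tasks[8];

    // The first round spawns real threads. After llamafile_task_join(),
    // each worker parks itself on the idle list, so later rounds hand the
    // next function to an existing thread — the same fast path that
    // ggml_graph_compute() now takes between predictions.
    for (int round = 0; round < 3; ++round) {
        for (int i = 0; i < 8; ++i) {
            vals[i] = i;
            if (llamafile_task_create(&tasks[i], double_it, &vals[i]))
                return 1; // errno_t-style nonzero error code
        }
        for (int i = 0; i < 8; ++i) {
            void *res;
            if (llamafile_task_join(tasks[i], &res))
                return 1;
            printf("round %d: task %d -> %ld\n", round, i, *(long *)res);
        }
    }

    // Reap parked threads; the pool also does this from a static destructor.
    llamafile_task_shutdown();
    return 0;
}
```

As in pthread_join(), joining a task also releases it, so each llamafile_task_t is joined exactly once; llamafile_task_cancel() before the join forwards to pthread_cancel() on the backing thread, which is how ggml_compute_canceled() tears workers down early.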