From 20d5f465285332dade7e445c44f7559953b8ad39 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Fri, 5 Jan 2024 20:11:21 -0800
Subject: [PATCH] Make CLIP GPU acceleration work on UNIX / Windows

In order to use ggml-backend safely with Cosmopolitan, this change
converts all of its function pointer interfaces to use the Microsoft
ABI.
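The GGML_BACKEND_ABI macro is defined in ggml-backend.h below; it
expands to __attribute__((__ms_abi__)) everywhere except on Windows,
where the Microsoft calling convention is already the default. The
pattern applied throughout is to annotate both the function pointer
field and the function whose address is stored in it, so caller and
callee always agree on the calling convention. A minimal sketch of the
idea, using illustrative names that are not part of this patch:

    #if defined(_WIN32)
    #define GGML_BACKEND_ABI
    #else
    #define GGML_BACKEND_ABI __attribute__((__ms_abi__))
    #endif

    // interface table: the callback is declared with the MS ABI
    struct example_backend_i {
        const char * (*GGML_BACKEND_ABI get_name)(void * ctx);
    };

    // implementation: the definition carries the same attribute, so a
    // call through the pointer uses one calling convention on every OS
    GGML_BACKEND_ABI static const char * example_get_name(void * ctx) {
        (void) ctx;
        return "example";
    }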
---
 build/config.mk               |  6 +--
 llama.cpp/ggml-backend-impl.h | 55 +++++++++++++------------
 llama.cpp/ggml-backend.c      | 38 +++++++++---------
 llama.cpp/ggml-backend.h      |  8 +++-
 llama.cpp/ggml-cuda.cu        | 76 +++++++++++++++++------------------
 llama.cpp/ggml-metal.m        | 34 ++++++++--------
 llama.cpp/llava/clip.cpp      |  2 -
 7 files changed, 111 insertions(+), 108 deletions(-)

diff --git a/build/config.mk b/build/config.mk
index 931b3aff07..4334a88a42 100644
--- a/build/config.mk
+++ b/build/config.mk
@@ -2,7 +2,7 @@
 #── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘
 
 PREFIX = /usr/local
-COSMOCC = cosmocc/3.2.1
+COSMOCC = cosmocc/3.2.2
 TOOLCHAIN = $(COSMOCC)/bin/cosmo
 
 AR = $(TOOLCHAIN)ar
@@ -50,5 +50,5 @@ clean:; rm -rf o
 .PHONY: distclean
 distclean:; rm -rf o cosmocc
 
-cosmocc/3.2.1:
-	build/download-cosmocc.sh $@ 3.2.1 c392eceb65736ab2b2700e3e0c3b05abbddd45ac9a7456c3e461f70d0ed8d01b
+cosmocc/3.2.2:
+	build/download-cosmocc.sh $@ 3.2.2 9eb32720050b8c186a83be66b471059b3f5b76fc0d6a58bbed6a1cf8359b7b99

diff --git a/llama.cpp/ggml-backend-impl.h b/llama.cpp/ggml-backend-impl.h
index 05859935a3..319ededb11 100644
--- a/llama.cpp/ggml-backend-impl.h
+++ b/llama.cpp/ggml-backend-impl.h
@@ -14,15 +14,14 @@ extern "C" {
 
     // buffer type
    typedef void * ggml_backend_buffer_type_context_t;
-
     struct ggml_backend_buffer_type_i {
-        ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
-        size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        ggml_backend_buffer_t (*GGML_BACKEND_ABI alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+        size_t (*GGML_BACKEND_ABI get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t (*GGML_BACKEND_ABI get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool (*GGML_BACKEND_ABI supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool (*is_host) (ggml_backend_buffer_type_t buft);
+        bool (*GGML_BACKEND_ABI is_host) (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -34,16 +33,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        void (*free_buffer) (ggml_backend_buffer_t buffer);
-        //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base) (ggml_backend_buffer_t buffer);
-        void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*GGML_BACKEND_ABI free_buffer) (ggml_backend_buffer_t buffer);
+        //void (*GGML_BACKEND_ABI reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void * (*GGML_BACKEND_ABI get_base) (ggml_backend_buffer_t buffer);
+        void (*GGML_BACKEND_ABI init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void (*GGML_BACKEND_ABI set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_BACKEND_ABI get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+        void (*GGML_BACKEND_ABI cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_BACKEND_ABI cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_BACKEND_ABI clear) (ggml_backend_buffer_t buffer, uint8_t value);
     };
 
     struct ggml_backend_buffer {
@@ -67,33 +66,33 @@ extern "C" {
     typedef void * ggml_backend_context_t;
 
     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_BACKEND_ABI get_name)(ggml_backend_t backend);
 
-        void (*free)(ggml_backend_t backend);
+        void (*GGML_BACKEND_ABI free)(ggml_backend_t backend);
 
         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_BACKEND_ABI get_default_buffer_type)(ggml_backend_t backend);
 
         // (optional) asynchroneous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*GGML_BACKEND_ABI set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_BACKEND_ABI get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
         // (optional) asynchroneous tensor copy
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_BACKEND_ABI cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_BACKEND_ABI cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        void (*synchronize)(ggml_backend_t backend);
+        void (*GGML_BACKEND_ABI synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_BACKEND_ABI graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*GGML_BACKEND_ABI graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void (*GGML_BACKEND_ABI graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*GGML_BACKEND_ABI graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_BACKEND_ABI supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };
 
     struct ggml_backend {
diff --git a/llama.cpp/ggml-backend.c b/llama.cpp/ggml-backend.c
index 414e784466..87b8f26359 100644
--- a/llama.cpp/ggml-backend.c
+++ b/llama.cpp/ggml-backend.c
@@ -393,39 +393,39 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
 
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
 
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }
 
@@ -454,7 +454,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
@@ -463,19 +463,19 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
 
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     GGML_UNUSED(buft);
@@ -545,20 +545,20 @@ struct ggml_backend_cpu_context {
     size_t work_size;
 };
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
 
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     free(cpu_ctx->work_data);
     free(cpu_ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(backend);
@@ -569,7 +569,7 @@ struct ggml_backend_plan_cpu {
     struct ggml_cgraph cgraph;
 };
 
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -584,7 +584,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     return cpu_plan;
 }
 
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     free(cpu_plan->cplan.work_data);
@@ -593,7 +593,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -601,7 +601,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -617,7 +617,7 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     ggml_graph_compute(cgraph, &cplan);
 }
 
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_BACKEND_ABI static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
diff --git a/llama.cpp/ggml-backend.h b/llama.cpp/ggml-backend.h
index a9d2fddd72..6ad670c7f2 100644
--- a/llama.cpp/ggml-backend.h
+++ b/llama.cpp/ggml-backend.h
@@ -3,6 +3,12 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#if defined(_WIN32)
+#define GGML_BACKEND_ABI
+#else
+#define GGML_BACKEND_ABI __attribute__((__ms_abi__))
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -173,7 +179,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
 
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_BACKEND_ABI ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
     GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu
index cfbf9b4a63..86075d9d43 100644
--- a/llama.cpp/ggml-cuda.cu
+++ b/llama.cpp/ggml-cuda.cu
@@ -415,11 +415,11 @@ static ggml_backend_buffer_t ggml_backend_buffer_init(
 
 #define ggml_backend_cpu_buffer_type ggml_backend_cpu_buffer_type_
 static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer);
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer);
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
 
 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
@@ -437,7 +437,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
     GGML_UNUSED(backend);
 }
@@ -449,32 +449,32 @@ static bool ggml_backend_is_cpu(ggml_backend_t backend) {
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);
     GGML_UNUSED(buft);
 }
 
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     fprintf(stderr, "WARNING: using untested foreign free due to ggml backend\n");
     free(buffer->context);
     GGML_UNUSED(buffer);
 }
 
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
 
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
     GGML_UNUSED(buffer);
 }
@@ -484,7 +484,7 @@ static void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * da
     tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
 }
 
-static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     GGML_UNUSED(buffer);
 }
@@ -494,7 +494,7 @@ static void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * da
     tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
 }
 
-static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
     GGML_UNUSED(buffer);
 }
@@ -509,7 +509,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
 };
 
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
@@ -10069,18 +10069,18 @@ struct ggml_backend_buffer_context_cuda {
     }
 };
 
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }
 
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     return ctx->dev_ptr;
 }
 
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -10114,7 +10114,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     UNUSED(buffer);
 }
 
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
@@ -10125,7 +10125,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
 }
 
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
@@ -10136,7 +10136,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
     CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
 }
 
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -10158,7 +10158,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
 
 // cuda buffer type
 
-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     int device = (int) (intptr_t) buft->context;
 
     ggml_cuda_set_device(device);
@@ -10173,13 +10173,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+GGML_BACKEND_ABI static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -10199,7 +10199,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_BACKEND_ABI static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cuda(backend);
 
     UNUSED(buft);
@@ -10233,11 +10233,11 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // host buffer type
 
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {
@@ -10275,26 +10275,26 @@ struct ggml_backend_context_cuda {
     int device;
 };
 
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+GGML_BACKEND_ABI static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
     return GGML_CUDA_NAME;
 
     UNUSED(backend);
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_free(ggml_backend_t backend) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     delete cuda_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10303,7 +10303,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10312,7 +10312,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
@@ -10320,7 +10320,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
     GGML_ASSERT(!"not implemented");
 
     return nullptr;
@@ -10329,21 +10329,21 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_BACKEND_ABI static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -10402,7 +10402,7 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_BACKEND_ABI static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
diff --git a/llama.cpp/ggml-metal.m b/llama.cpp/ggml-metal.m
index fed9917da5..9db8d3b084 100644
--- a/llama.cpp/ggml-metal.m
+++ b/llama.cpp/ggml-metal.m
@@ -29,7 +29,7 @@ static ggml_backend_buffer_t ggml_backend_buffer_init(
     return buffer;
 }
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
     GGML_UNUSED(backend);
 }
@@ -2672,13 +2672,13 @@ static void ggml_backend_metal_free_device(void) {
     }
 }
 
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     return ctx->all_data;
 }
 
-static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     for (int i = 0; i < ctx->n_buffers; i++) {
@@ -2693,31 +2693,31 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     free(ctx);
 }
 
-static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_BACKEND_ABI static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     memset(ctx->all_data, value, ctx->all_size);
@@ -2736,7 +2736,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 
 // default buffer type
 
-static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -2788,18 +2788,18 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
-static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_BACKEND_ABI static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_BACKEND_ABI static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     UNUSED(buft);
@@ -2901,31 +2901,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 
 // backend
 
-static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+GGML_BACKEND_ABI static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
     UNUSED(backend);
 }
 
-static void ggml_backend_metal_free(ggml_backend_t backend) {
+GGML_BACKEND_ABI static void ggml_backend_metal_free(ggml_backend_t backend) {
     struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
     ggml_metal_free(ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
+GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
     UNUSED(backend);
 }
 
-static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_BACKEND_ABI static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_BACKEND_ABI static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     return ggml_metal_supports_op(op);
 
     UNUSED(backend);
diff --git a/llama.cpp/llava/clip.cpp b/llama.cpp/llava/clip.cpp
index 71db914bd2..1ca7351bfa 100644
--- a/llama.cpp/llava/clip.cpp
+++ b/llama.cpp/llava/clip.cpp
@@ -540,12 +540,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     clip_ctx * new_clip = new clip_ctx;
 
-#if 0
     if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) {
         new_clip->backend = ggml_backend_cuda_init(0);
         tinylogf("%s: CLIP using CUDA backend\n", __func__);
     }
-#endif
 
     if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) {
         new_clip->backend = ggml_backend_metal_init();