diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c3aa6f9924db8..6dcb432d625cc 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,6 +143,47 @@ jobs:
cd build
ctest --verbose
+ ubuntu-22-cmake-sycl:
+ runs-on: ubuntu-22.04
+
+ continue-on-error: true
+
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: add oneAPI to apt
+ shell: bash
+ run: |
+ cd /tmp
+ wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+ sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+ rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+ sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+ - name: install oneAPI dpcpp compiler
+ shell: bash
+ run: |
+ sudo apt update
+          sudo apt install -y intel-oneapi-compiler-dpcpp-cpp
+
+ - name: install oneAPI MKL library
+ shell: bash
+ run: |
+          sudo apt install -y intel-oneapi-mkl-devel
+
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+
+ - name: Build
+ id: cmake_build
+ run: |
+ source /opt/intel/oneapi/setvars.sh
+ mkdir build
+ cd build
+ cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+ cmake --build . --config Release -j $(nproc)
+
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
diff --git a/CMakeLists.txt b/CMakeLists.txt
index af36651297b76..4a755b7b42bef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -103,6 +104,8 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
+option(LLAMA_SYCL "llama: use SYCL" OFF)
+option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -121,8 +124,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
#
# Compile flags
#
+if (LLAMA_SYCL)
+ set(CMAKE_CXX_STANDARD 17)
+else()
+ set(CMAKE_CXX_STANDARD 11)
+endif()
-set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
@@ -454,6 +461,32 @@ if (LLAMA_HIPBLAS)
endif()
endif()
+
+if (LLAMA_SYCL)
+    if (NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "ONEAPI_ROOT is not set, please install oneAPI and source it, e.g.: source /opt/intel/oneapi/setvars.sh")
+ endif()
+ #todo: AOT
+
+ find_package(IntelSYCL REQUIRED)
+ if (LLAMA_SYCL_F16)
+ add_compile_definitions(GGML_SYCL_F16)
+ endif()
+ add_compile_definitions(GGML_USE_SYCL)
+
+    add_compile_options(-I./) # include path for the DPCT helpers in the source tree
+    add_compile_options(-I${SYCL_INCLUDE_DIR})
+
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+ set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+ set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+endif()
+
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@@ -479,10 +512,12 @@ function(get_flags CCID CCVER)
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
- # enable max optimization level when using Intel compiler
- set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
- set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
- add_link_options(-fuse-ld=lld -static-intel)
+ if (NOT LLAMA_SYCL)
+ # enable max optimization level when using Intel compiler
+ set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+ set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+ add_link_options(-fuse-ld=lld -static-intel)
+ endif()
endif()
set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
@@ -795,6 +830,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+ ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/README.md b/README.md
index cbfba01bc4b62..8a0b363409410 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
-- CUDA, Metal and OpenCL GPU backend support
+- CUDA, Metal, OpenCL and SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@@ -597,6 +597,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc.
+- #### SYCL
+
+ SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+  The SYCL backend of llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in Arc GPUs and iGPUs).
+
+  For detailed instructions, refer to [llama.cpp for SYCL](README_sycl.md).
+
+
### Prepare Data & Run
```bash
diff --git a/README_sycl.md b/README_sycl.md
new file mode 100644
index 0000000000000..d5a1818f5a268
--- /dev/null
+++ b/README_sycl.md
@@ -0,0 +1,252 @@
+# llama.cpp for SYCL
+
+[Background](#background)
+
+[OS](#os)
+
+[Intel GPU](#intel-gpu)
+
+[Linux](#linux)
+
+[Environment Variable](#environment-variable)
+
+[Known Issue](#known-issue)
+
+[Todo](#todo)
+
+## Background
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+
+Intel uses SYCL as the direct programming language to support its CPUs, GPUs and FPGAs.
+
+To avoid re-inventing the wheel, this code follows the other backend code paths in llama.cpp (such as OpenBLAS, cuBLAS and CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the existing CUDA code path to SYCL.
+
+llama.cpp for SYCL targets Intel GPUs.
+
+For Intel CPUs, we recommend using llama.cpp for x86 (built with Intel MKL) instead.
+
+## OS
+
+|OS|Status|Verified|
+|-|-|-|
+|Linux|Support|Ubuntu 22.04|
+|Windows|Ongoing| |
+
+
+## Intel GPU
+
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Intel Data Center Max Series| Support| Max 1550|
+|Intel Data Center Flex Series| Support| Flex 170|
+|Intel Arc Series| Support| Arc 770|
+|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+
+
+## Linux
+
+### Setup Environment
+
+1. Install Intel GPU driver.
+
+a. Please install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+
+Note: for iGPU, please install the client GPU driver.
+
+b. Add your user to the video and render groups:
+
+```
+sudo usermod -aG render username
+sudo usermod -aG video username
+```
+
+Note: log out and log back in for the group changes to take effect.
+
+c. Check
+
+```
+sudo apt install clinfo
+sudo clinfo -l
+```
+
+Output (example):
+
+```
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
+
+
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
+```
+
+2. Install Intel® oneAPI Base toolkit.
+
+
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+
+We recommend installing to the default folder: **/opt/intel/oneapi**.
+
+The following guide uses the default folder as an example. If you installed to a different folder, adjust the paths accordingly.
+
+b. Check
+
+```
+source /opt/intel/oneapi/setvars.sh
+
+sycl-ls
+```
+
+There should be one or more level-zero devices listed, such as **[ext_oneapi_level_zero:gpu:0]**.
+
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+
+```
+
+3. Build locally:
+
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
+
+```
+
+or
+
+```
+./examples/sycl/build.sh
+```
+
+Note:
+
+- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building only the **example/main** target.
+
+### Run
+
+1. Put the model file into the **models** folder
+
+2. Enable the oneAPI runtime environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. List the device IDs
+
+Run without parameters:
+
+```
+./build/bin/ls-sycl-device
+
+or
+
+./build/bin/main
+```
+
+Check the device IDs in the startup log, for example:
+
+```
+found 4 SYCL devices:
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero runtime, recommended|
+|compute capability 3.0|OpenCL runtime, slower than level-zero in most cases|
+
+4. Set device ID and execute llama.cpp
+
+Set the device ID to 0 with **GGML_SYCL_DEVICE=0**:
+
+```
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+or run by script:
+
+```
+./examples/sycl/run-llama2.sh
+```
+
+Note:
+
+- By default, mmap() is used to read the model file. On some systems this can cause a hang at startup. If that happens, add the **--no-mmap** parameter to disable mmap(), as shown in the example below.
+
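+For example, the same run command with mmap disabled:
+
+```
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
+```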
+
+5. Check the device ID in the output
+
+For example:
+```
+Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+```
+
+
+## Environment Variable
+
+#### Build
+
+|Name|Value|Function|
+|-|-|-|
+|LLAMA_SYCL|ON (mandatory)|Enable the SYCL code path. Required for both FP32 and FP16 builds.|
+|LLAMA_SYCL_F16|ON (optional)|Enable FP16 in the SYCL code path. Faster for long-prompt inference. Leave unset for FP32.|
+|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
+|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
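+
+For example, an FP16 build combines these variables in a single configure command (using the default oneAPI location, as in the build section above):
+
+```
+source /opt/intel/oneapi/setvars.sh
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+```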
+
+#### Running
+
+
+|Name|Value|Function|
+|-|-|-|
+|GGML_SYCL_DEVICE|0 (default) or 1|Select the device ID to use. List the available IDs with the default startup output or the ls-sycl-device tool|
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging guarded by the GGML_SYCL_DEBUG macro|
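+
+For example, to select device 1 and enable debug logging for the same model and prompt as in the run example above:
+
+```
+GGML_SYCL_DEVICE=1 GGML_SYCL_DEBUG=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```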
+
+## Known Issue
+
+- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+
+  Cause: the oneAPI runtime environment is not enabled.
+
+  Solution: install the oneAPI base toolkit and enable it with `source /opt/intel/oneapi/setvars.sh`.
+
+
+- Hang during startup
+
+  llama.cpp uses mmap() by default to read the model file and copy it to the GPU. On some systems the memcpy can misbehave and block.
+
+  Solution: add the **--no-mmap** parameter.
+
+## Todo
+
+- Support building on Windows.
+
+- Support multiple cards.
diff --git a/ci/README.md b/ci/README.md
index 65cfe63ebe8e9..4064705190697 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
diff --git a/ci/run.sh b/ci/run.sh
index 791b17a191a2b..7584f651fffc7 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -10,6 +10,9 @@
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
if [ -z "$2" ]; then
echo "usage: $0 "
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi
+if [ ! -z ${GG_BUILD_SYCL} ]; then
+ if [ -z ${ONEAPI_ROOT} ]; then
+        echo -e "ONEAPI_ROOT is not set, please install the oneAPI base toolkit and enable it with:\n source /opt/intel/oneapi/setvars.sh"
+ exit 1
+ fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+fi
## helpers
# download a file if it does not exist or if it is outdated
diff --git a/common/common.cpp b/common/common.cpp
index 6b07f119718c0..2880136761afb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
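+// Convenience macro: defined when either the cuBLAS or the SYCL backend is
+// enabled, so the GPU-related warnings below cover both backends.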
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -599,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@@ -618,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
@@ -643,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f;
}
}
-#ifndef GGML_USE_CUBLAS
- fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+ fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
@@ -1007,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif
+#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@@ -1514,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f67d74c5530c9..68ad899648137 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,6 +23,9 @@ else()
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
+ if (LLAMA_SYCL)
+ add_subdirectory(sycl)
+ endif()
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd24739b..c97ad70527ba3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i];
// split string by , and /
@@ -2345,7 +2345,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@@ -2358,7 +2358,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
new file mode 100644
index 0000000000000..69cf8932eb5c7
--- /dev/null
+++ b/examples/sycl/CMakeLists.txt
@@ -0,0 +1,9 @@
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+set(TARGET ls-sycl-device)
+add_executable(${TARGET} ls-sycl-device.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
new file mode 100644
index 0000000000000..b46f17f39efd4
--- /dev/null
+++ b/examples/sycl/README.md
@@ -0,0 +1,47 @@
+# llama.cpp/example/sycl
+
+This example program provides tools for llama.cpp for SYCL on Intel GPUs.
+
+## Tool
+
+|Tool Name| Function|Status|
+|-|-|-|
+|ls-sycl-device|List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
+
+### ls-sycl-device
+
+List all SYCL devices with ID, compute capability, max work group size, etc.
+
+1. Build llama.cpp for SYCL for all targets.
+
+2. Enable the oneAPI runtime environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. Execute
+
+```
+./build/bin/ls-sycl-device
+```
+
+Check the device IDs in the startup log, for example:
+
+```
+found 4 SYCL devices:
+ Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+ Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
+ max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
+ Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
+ max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
+ Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
+ max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero runtime, recommended|
+|compute capability 3.0|OpenCL runtime, slower than level-zero in most cases|
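+
+For example, to run llama.cpp on the recommended level-zero device (device 0 in the listing above), set GGML_SYCL_DEVICE before launching **main** (a sketch, assuming the model file from the main SYCL guide):
+
+```
+source /opt/intel/oneapi/setvars.sh
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```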
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
new file mode 100755
index 0000000000000..26ad2f7da754e
--- /dev/null
+++ b/examples/sycl/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
new file mode 100644
index 0000000000000..42847154aa3ea
--- /dev/null
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -0,0 +1,11 @@
+/*MIT license
+ Copyright (C) 2024 Intel Corporation
+ SPDX-License-Identifier: MIT
+*/
+
+#include "ggml-sycl.h"
+
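+// Lists all available SYCL devices (device ID, compute capability, work group
+// sizes, global memory size) using the helper declared in ggml-sycl.h.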
+int main(int argc, char ** argv) {
+ ggml_backend_sycl_print_sycl_devices();
+ return 0;
+}
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
new file mode 100755
index 0000000000000..f5f4c1e980de4
--- /dev/null
+++ b/examples/sycl/run-llama2.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+if [ $# -gt 0 ]; then
+ export GGML_SYCL_DEVICE=$1
+else
+ export GGML_SYCL_DEVICE=0
+fi
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+#export GGML_SYCL_DEBUG=1
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
diff --git a/ggml-backend.c b/ggml-backend.c
index 423512defc132..a20bdf4c97505 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -337,6 +337,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices();
#endif
+#ifdef GGML_USE_SYCL
+ extern void ggml_backend_sycl_reg_devices(void);
+ ggml_backend_sycl_reg_devices();
+#endif
+
#ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
new file mode 100644
index 0000000000000..9764d9c351516
--- /dev/null
+++ b/ggml-sycl.cpp
@@ -0,0 +1,15197 @@
+/*MIT license
+ Copyright (C) 2024 Intel Corporation
+ SPDX-License-Identifier: MIT
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+#include
+#include
+
+#include "ggml-sycl.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+/*
+The following definitions are copied from the DPCT helper headers used by ggml-sycl.cpp.
+*/
+// Copied from DPCT header files
+#include
+#include
+#include